diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index f0190b0df8..88df687bd6 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -33,8 +33,8 @@ jobs: run: make BACKEND=${{ matrix.backend }} FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} -f cudacpp.mk info - name: make run: make BACKEND=${{ matrix.backend }} FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} - - name: make check - run: make BACKEND=${{ matrix.backend }} FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} -f cudacpp.mk check + - name: make test + run: make BACKEND=${{ matrix.backend }} FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} -f cudacpp.mk test CPU_MAC: runs-on: macos-latest env: @@ -53,8 +53,8 @@ jobs: run: make BACKEND=${{ matrix.backend }} OMPFLAGS= FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} -f cudacpp.mk info - name: make run: make BACKEND=${{ matrix.backend }} OMPFLAGS= FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} - - name: make check - run: make BACKEND=${{ matrix.backend }} OMPFLAGS= FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} -f cudacpp.mk check + - name: make test + run: make BACKEND=${{ matrix.backend }} OMPFLAGS= FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} -f cudacpp.mk test GPU: runs-on: self-hosted env: @@ -76,5 +76,5 @@ jobs: run: make BACKEND=${{ matrix.backend }} FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} -f cudacpp.mk info - name: make run: make BACKEND=${{ matrix.backend }} FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} - - name: make check - run: make BACKEND=${{ matrix.backend }} FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} -f cudacpp.mk check + - name: make test + run: make BACKEND=${{ matrix.backend }} FPTYPE=${{ matrix.precision }} -C ${{ matrix.folder }} -f cudacpp.mk test diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common 
b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common index 7917d3446a..3cfcc909d9 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common @@ -38,7 +38,7 @@ index 617f10b93..00c73099a 100644 +cleanall: cleanSource # THIS IS THE ONE + for i in `ls -d ../SubProcesses/P*`; do cd $$i; make cleanavxs; cd -; done; diff --git b/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile a/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile -index 348c283be..22517d0cc 100644 +index 348c283be..6a21300f7 100644 --- b/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile +++ a/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile @@ -1,6 +1,37 @@ @@ -79,7 +79,7 @@ index 348c283be..22517d0cc 100644 # Load additional dependencies of the bias module, if present ifeq (,$(wildcard ../bias_dependencies)) BIASDEPENDENCIES = -@@ -24,7 +55,18 @@ else +@@ -24,7 +55,20 @@ else MADLOOP_LIB = endif @@ -87,19 +87,21 @@ index 348c283be..22517d0cc 100644 +LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias + +CUDACPP_MAKEFILE=cudacpp.mk -+CUDACPP_COMMONLIB=mg5amc_common +processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') +ifeq ($(BACKEND),cuda) ++CUDACPP_COMMONLIB=mg5amc_common_cuda +CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cuda +else ifeq ($(BACKEND),hip) ++CUDACPP_COMMONLIB=mg5amc_common_hip +CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_hip +else ++CUDACPP_COMMONLIB=mg5amc_common_cpp +CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cpp +endif LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) -@@ -43,41 +85,145 @@ ifeq ($(strip $(MATRIX_HEL)),) +@@ 
-43,41 +87,145 @@ ifeq ($(strip $(MATRIX_HEL)),) endif @@ -261,7 +263,7 @@ index 348c283be..22517d0cc 100644 # Dependencies -@@ -97,5 +243,80 @@ unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ +@@ -97,5 +245,80 @@ unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ run_config.inc initcluster.o: message.inc diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index b175689fa8..e1c330c394 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -54,65 +54,6 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA/HIP - -INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here - -# Dependency on src directory -MG5AMC_COMMONLIB = mg5amc_common -LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -INCFLAGS += -I../../src - -# Compiler-specific googletest build directory (#125 and #738) -ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) - override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) - override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) - override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else - override CXXNAME = unknown -endif -###$(info CXXNAME=$(CXXNAME)) -override CXXNAMESUFFIX = _$(CXXNAME) - -# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) -export CXXNAMESUFFIX - -# Dependency on test directory -# Within the 
madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) -# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) -###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation -###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation -TESTDIRCOMMON = ../../../../../test -TESTDIRLOCAL = ../../test -ifneq ($(wildcard $(GTEST_ROOT)),) - TESTDIR = -else ifneq ($(LOCALGTEST),) - TESTDIR=$(TESTDIRLOCAL) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) - TESTDIR = $(TESTDIRCOMMON) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else - TESTDIR = -endif -ifneq ($(GTEST_ROOT),) - GTESTLIBDIR = $(GTEST_ROOT)/lib64/ - GTESTLIBS = $(GTESTLIBDIR)/libgtest.a - GTESTINC = -I$(GTEST_ROOT)/include -else - GTESTLIBDIR = - GTESTLIBS = - GTESTINC = -endif -###$(info GTEST_ROOT = $(GTEST_ROOT)) -###$(info LOCALGTEST = $(LOCALGTEST)) -###$(info TESTDIR = $(TESTDIR)) - -#------------------------------------------------------------------------------- - #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available: eventually use native instead?) @@ -340,6 +281,69 @@ endif #------------------------------------------------------------------------------- +#=== Configure common compiler flags for C++ and CUDA/HIP + +INCFLAGS = -I. 
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here + +# Dependency on src directory +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif +LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +INCFLAGS += -I../../src + +# Compiler-specific googletest build directory (#125 and #738) +ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) + override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) + override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else + override CXXNAME = unknown +endif +###$(info CXXNAME=$(CXXNAME)) +override CXXNAMESUFFIX = _$(CXXNAME) + +# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) +export CXXNAMESUFFIX + +# Dependency on test directory +# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) +# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) +###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation +###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation +TESTDIRCOMMON = ../../../../../test +TESTDIRLOCAL = ../../test +ifneq ($(wildcard $(GTEST_ROOT)),) + TESTDIR = +else ifneq ($(LOCALGTEST),) + TESTDIR=$(TESTDIRLOCAL) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) + TESTDIR = $(TESTDIRCOMMON) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else + TESTDIR = +endif +ifneq 
($(GTEST_ROOT),) + GTESTLIBDIR = $(GTEST_ROOT)/lib64/ + GTESTLIBS = $(GTESTLIBDIR)/libgtest.a + GTESTINC = -I$(GTEST_ROOT)/include +else + GTESTLIBDIR = + GTESTLIBS = + GTESTINC = +endif +###$(info GTEST_ROOT = $(GTEST_ROOT)) +###$(info LOCALGTEST = $(LOCALGTEST)) +###$(info TESTDIR = $(TESTDIR)) + +#------------------------------------------------------------------------------- + #=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) @@ -606,27 +610,25 @@ override RUNTIME = #=== Makefile TARGETS and build rules below #=============================================================================== -cxx_main=$(BUILDDIR)/check.exe -fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(GPUCC),) - gpu_main=$(BUILDDIR)/gcheck.exe - fgpu_main=$(BUILDDIR)/fgcheck.exe +ifeq ($(GPUCC),) + cxx_checkmain=$(BUILDDIR)/check_cpp.exe + cxx_fcheckmain=$(BUILDDIR)/fcheck_cpp.exe + cxx_testmain=$(BUILDDIR)/runTest_cpp.exe else - gpu_main= - fgpu_main= + gpu_checkmain=$(BUILDDIR)/check_$(GPUSUFFIX).exe + gpu_fcheckmain=$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe + gpu_testmain=$(BUILDDIR)/runTest_$(GPUSUFFIX).exe endif -testmain=$(BUILDDIR)/runTest.exe - # Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) # First target (default goal) -ifneq ($(GPUCC),) -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_main) $(fgpu_main) $(if $(GTESTLIBS),$(testmain)) +ifeq ($(GPUCC),) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_checkmain) $(cxx_fcheckmain) $(if $(GTESTLIBS),$(cxx_testmain)) else -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_main) $(fcxx_main) $(if $(GTESTLIBS),$(testmain)) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_checkmain) $(gpu_fcheckmain) $(if $(GTESTLIBS),$(gpu_testmain)) endif # Target (and 
build options): debug @@ -643,30 +645,30 @@ $(BUILDDIR)/.build.$(TAG): @if [ "$(oldtagsb)" != "" ]; then echo "Cannot build for tag=$(TAG) as old builds exist for other tags:"; echo " $(oldtagsb)"; echo "Please run 'make clean' first\nIf 'make clean' is not enough: run 'make clean USEBUILDDIR=1 AVX=$(AVX) FPTYPE=$(FPTYPE)' or 'make cleanall'"; exit 1; fi @touch $(BUILDDIR)/.build.$(TAG) -# Apply special build flags only to CrossSectionKernel[_$(GPUSUFFIX)].o (no fast math, see #117 and #516) +# Apply special build flags only to CrossSectionKernel_.o (no fast math, see #117 and #516) # Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math endif -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o (NVTX in timermap.h, #679) -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) +# Apply special build flags only to check_sa_.o (NVTX in timermap.h, #679) +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o and (Cu|Hip)randRandomNumberKernel[_$(GPUSUFFIX)].o -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(RNDCXXFLAGS) +# Apply special build flags only to check_sa_.o and (Cu|Hip)randRandomNumberKernel_.o +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) 
+$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) ifeq ($(HASCURAND),hasCurand) # curand headers, #679 -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUDA_INC) +$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(CUDA_INC) endif ifeq ($(HASHIPRAND),hasHiprand) # hiprand headers -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(HIP_INC) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(HIP_INC) endif # Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) @@ -679,21 +681,21 @@ endif # Avoid clang warning "overriding '-ffp-contract=fast' option with '-ffp-contract=on'" (#516) # This patch does remove the warning, but I prefer to keep it disabled for the moment... ###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) -###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -Wno-overriding-t-option ###ifneq ($(GPUCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option ###endif ###endif #### Apply special build flags only to CPPProcess.o (-flto) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += -flto +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += -flto #### Apply special build flags only to CPPProcess.o (AVXFLAGS) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += $(AVXFLAGS) +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += $(AVXFLAGS) # Generic target and build rules: objects from C++ compilation # (NB do not include CUDA_INC here! 
add it only for NVTX or curand #679) -$(BUILDDIR)/%%.o : %%.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%%_cpp.o : %%.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -718,8 +720,8 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o -cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) @@ -728,8 +730,8 @@ gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/ endif # Target (and build rules): C++ and CUDA/HIP shared libraries -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge.o -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge_cpp.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) @@ -756,63 +758,63 @@ endif #------------------------------------------------------------------------------- # Target (and build rules): C++ and CUDA/HIP standalone executables -$(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cxx_main): $(BUILDDIR)/check_sa.o 
$(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o - $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o $(RNDLIBFLAGS) +$(cxx_checkmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_checkmain): $(BUILDDIR)/check_sa_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o + $(CXX) -o $@ $(BUILDDIR)/check_sa_cpp.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o $(RNDLIBFLAGS) ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(gpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(gpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_checkmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_checkmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(gpu_main): LIBFLAGS += -L$(patsubst %%bin/nvc++,%%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_checkmain): LIBFLAGS += -L$(patsubst %%bin/nvc++,%%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(gpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(gpu_main): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o 
$(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o +$(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif #------------------------------------------------------------------------------- # Generic target and build rules: objects from Fortran compilation -$(BUILDDIR)/%%.o : %%.f *.inc +$(BUILDDIR)/%%_fortran.o : %%.f *.inc @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(FC) -I. -c $< -o $@ # Generic target and build rules: objects from Fortran compilation -###$(BUILDDIR)/%%.o : %%.f *.inc +###$(BUILDDIR)/%%_fortran.o : %%.f *.inc ### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi ### @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi ### $(FC) -I. 
-I$(INCDIR) -c $< -o $@ # Target (and build rules): Fortran standalone executables -###$(BUILDDIR)/fcheck_sa.o : $(INCDIR)/fbridge.inc +###$(BUILDDIR)/fcheck_sa_fortran.o : $(INCDIR)/fbridge.inc ifeq ($(UNAME_S),Darwin) -$(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(cxx_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +$(cxx_fcheckmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ else - $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) + $(CXX) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) endif ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fgpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(fgpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_fcheckmain): LIBFLAGS += -lintlc # 
compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_fcheckmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) -$(fgpu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fgpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fgpu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) +$(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else - $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif endif @@ -820,83 +822,94 @@ endif # Target (and build rules): test objects and test executable ifeq ($(GPUCC),) -$(BUILDDIR)/testxxx.o: 
$(GTESTLIBS) -$(BUILDDIR)/testxxx.o: INCFLAGS += $(GTESTINC) -$(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions +$(BUILDDIR)/testxxx_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testxxx_cpp.o: INCFLAGS += $(GTESTINC) +$(BUILDDIR)/testxxx_cpp.o: testxxx_cc_ref.txt +$(cxx_testmain): $(BUILDDIR)/testxxx_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testxxx_cpp.o # Comment out this line to skip the C++ test of xxx functions else $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions +$(gpu_testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions endif ifneq ($(UNAME_S),Darwin) # Disable testmisc on Darwin (workaround for issue #838) ifeq ($(GPUCC),) -$(BUILDDIR)/testmisc.o: $(GTESTLIBS) -$(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests +$(BUILDDIR)/testmisc_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testmisc_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/testmisc_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testmisc_cpp.o # Comment out this line to skip the C++ miscellaneous tests else $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the 
CUDA/HIP miscellaneous tests +$(gpu_testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP miscellaneous tests endif endif ifeq ($(GPUCC),) -$(BUILDDIR)/runTest.o: $(GTESTLIBS) -$(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/runTest.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o +$(BUILDDIR)/runTest_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/runTest_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/runTest_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/runTest_cpp.o else $(BUILDDIR)/runTest_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/runTest_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(testmain): LIBFLAGS += -L$(patsubst %%bin/nvc++,%%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_testmain): LIBFLAGS += -L$(patsubst %%bin/nvc++,%%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o endif -$(testmain): $(GTESTLIBS) -$(testmain): INCFLAGS += $(GTESTINC) -$(testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest -###$(testmain): LIBFLAGS += -lgtest_main # no longer 
necessary since we added main() to testxxx.cc +ifeq ($(GPUCC),) +$(cxx_testmain): $(GTESTLIBS) +$(cxx_testmain): INCFLAGS += $(GTESTINC) +$(cxx_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +else +$(gpu_testmain): $(GTESTLIBS) +$(gpu_testmain): INCFLAGS += $(GTESTINC) +$(gpu_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +endif +ifeq ($(GPUCC),) # if at all, OMP is used only in CXX builds (not in GPU builds) ifneq ($(OMPFLAGS),) ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -$(testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) +$(cxx_testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -$(testmain): LIBFLAGS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 +$(cxx_testmain): LIBFLAGS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 ###else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -###$(testmain): LIBFLAGS += ???? # OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) +###$(cxx_testmain): LIBFLAGS += ???? 
# OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) else -$(testmain): LIBFLAGS += -lgomp +$(cxx_testmain): LIBFLAGS += -lgomp +endif endif endif # Test quadmath in testmisc.cc tests for constexpr_math #627 -###$(testmain): LIBFLAGS += -lquadmath +###ifeq ($(GPUCC),) +###$(cxx_testmain): LIBFLAGS += -lquadmath +###else +###$(gpu_testmain): LIBFLAGS += -lquadmath +###endif # Bypass std::filesystem completely to ease portability on LUMI #803 -#ifneq ($(findstring hipcc,$(GPUCC)),) -#$(testmain): LIBFLAGS += -lstdc++fs -#endif +###ifneq ($(findstring hipcc,$(GPUCC)),) +###$(gpu_testmain): LIBFLAGS += -lstdc++fs +###endif -ifeq ($(GPUCC),) # link only runTest.o -$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) +ifeq ($(GPUCC),) # link only runTest_cpp.o +$(cxx_testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest.o and runTest_$(GPUSUFFIX).o) -$(testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) +else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) +$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell 
dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else @@ -1051,51 +1064,50 @@ endif #------------------------------------------------------------------------------- -# Target: check (run the C++ test executable) -# [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -# [FIXME: SHOULD CHANGE THE TARGET NAME "check" THAT HAS NOTHING TO DO WITH "check.exe"] -ifneq ($(GPUCC),) -check: runTest cmpFGcheck -else -check: runTest cmpFcheck -endif +# Target: 'make test' (execute runTest.exe, and compare check.exe with fcheck.exe) +# [NB: THIS IS WHAT IS TESTED IN THE GITHUB CI!] +# [NB: This used to be called 'make check' but the name has been changed as this has nothing to do with 'check.exe'] +test: runTest cmpFcheck -# Target: runTest (run the C++ test executable runTest.exe) +# Target: runTest (run the C++ or CUDA/HIP test executable runTest.exe) runTest: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/runTest.exe +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/runTest_cpp.exe +else + $(RUNTIME) $(BUILDDIR)/runTest_$(GPUSUFFIX).exe +endif -# Target: runCheck (run the C++ standalone executable check.exe, with a small number of events) +# Target: runCheck (run the C++ or CUDA/HIP standalone executable check.exe, with a small number of events) runCheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 - -# Target: runGcheck (run the CUDA/HIP standalone executable gcheck.exe, with a small number of events) -runGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/gcheck.exe -p 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/check_cpp.exe -p 2 32 2 +else + $(RUNTIME) $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +endif -# Target: runFcheck (run the Fortran standalone executable - with C++ MEs - fcheck.exe, with a small number of events) +# Target: runFcheck (run the Fortran standalone executable - with C++ or CUDA/HIP MEs - fcheck.exe, with a small number of events) runFcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 - -# Target: runFGcheck (run the Fortran standalone 
executable - with CUDA/HIP MEs - fgcheck.exe, with a small number of events) -runFGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 +else + $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 +endif -# Target: cmpFcheck (compare ME results from the C++ and Fortran with C++ MEs standalone executables, with a small number of events) +# Target: cmpFcheck (compare ME results from the C++/CUDA/HIP and Fortran with C++/CUDA/HIP MEs standalone executables, with a small number of events) cmpFcheck: all.$(TAG) @echo - @echo "$(BUILDDIR)/check.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%%s (relative difference %%s 2E-4)' %% ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi - -# Target: cmpFGcheck (compare ME results from the CUDA/HIP and Fortran with CUDA/HIP MEs standalone executables, with a small number of events) -cmpFGcheck: all.$(TAG) - @echo - @echo "$(BUILDDIR)/gcheck.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fgcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! 
Fortran calculation (F77/GPU) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%%s (relative difference %%s 2E-4)' %% ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +ifeq ($(GPUCC),) + @echo "$(BUILDDIR)/check_cpp.exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_cpp.exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_cpp.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%%s (relative difference %%s 2E-4)' %% ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +else + @echo "$(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%%s (relative difference %%s 2E-4)' %% ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +endif # Target: cuda-memcheck (run the CUDA standalone executable gcheck.exe with a small number of events through cuda-memcheck) cuda-memcheck: all.$(TAG) - $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/gcheck.exe -p 2 32 2 + $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk index b22e9edab6..897de8caa8 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk @@ -98,7 +98,11 @@ endif # NB1: there are no CUDA targets in src as we avoid RDC! # NB2: CUDA includes for curand.h are no longer needed in the C++ code anywhere in src! 
-MG5AMC_COMMONLIB = mg5amc_common +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif # Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) @@ -126,7 +130,7 @@ $(LIBDIR)/.build.$(TAG): #------------------------------------------------------------------------------- # Generic target and build rules: objects from C++ compilation -$(BUILDDIR)/%%.o : %%.cc *.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%%_cpp.o : %%.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -139,22 +143,22 @@ endif #------------------------------------------------------------------------------- -cxx_objects=$(addprefix $(BUILDDIR)/, read_slha.o) -ifneq ($(GPUCC),) - gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_%(model)s_$(GPUSUFFIX).o) +cxx_objects=$(addprefix $(BUILDDIR)/, read_slha_cpp.o) +ifeq ($(GPUCC),) + cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_%(model)s_cpp.o) else - cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_%(model)s.o) + gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_%(model)s_$(GPUSUFFIX).o) endif # Target (and build rules): common (src) library -ifneq ($(GPUCC),) -$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) - @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) -else +ifeq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi $(CXX) -shared -o $@ $(cxx_objects) $(LDFLAGS) +else +$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) + @if [ ! 
-d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi + $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index c36133a43d..f08289bad0 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005721330642700195  +DEBUG: model prefixing takes 0.0057582855224609375  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -177,7 +177,7 @@ INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 INFO: Creating files in directory P1_epem_mupmum DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -194,19 +194,19 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group epem_mupmum Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s -Wrote files for 8 helas calls in 0.102 s +Wrote files for 8 helas calls in 0.103 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.205 s +ALOHA: aloha creates 3 routines in 0.209 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 7 routines in 0.263 s +ALOHA: aloha creates 7 routines in 0.272 s FFV1 FFV1 FFV2 @@ -250,9 +250,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.183s -user 0m1.677s -sys 0m0.231s +real 0m1.930s +user 0m1.716s +sys 0m0.204s Code generation completed in 2 seconds ************************************************************ * * diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index 50ce6bb9c8..89da34009a 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -54,65 +54,6 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA/HIP - -INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here - -# Dependency on src directory -MG5AMC_COMMONLIB = mg5amc_common -LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -INCFLAGS += -I../../src - -# Compiler-specific googletest build directory (#125 and #738) -ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) - override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) - override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) - override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else - override CXXNAME = unknown -endif -###$(info CXXNAME=$(CXXNAME)) -override CXXNAMESUFFIX = _$(CXXNAME) - -# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) -export CXXNAMESUFFIX - -# Dependency on test directory -# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) -# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) -###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation -###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation -TESTDIRCOMMON = ../../../../../test -TESTDIRLOCAL = ../../test -ifneq ($(wildcard $(GTEST_ROOT)),) - TESTDIR = -else ifneq ($(LOCALGTEST),) - TESTDIR=$(TESTDIRLOCAL) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) - TESTDIR = $(TESTDIRCOMMON) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else - TESTDIR = -endif -ifneq ($(GTEST_ROOT),) - GTESTLIBDIR = $(GTEST_ROOT)/lib64/ - GTESTLIBS = 
$(GTESTLIBDIR)/libgtest.a - GTESTINC = -I$(GTEST_ROOT)/include -else - GTESTLIBDIR = - GTESTLIBS = - GTESTINC = -endif -###$(info GTEST_ROOT = $(GTEST_ROOT)) -###$(info LOCALGTEST = $(LOCALGTEST)) -###$(info TESTDIR = $(TESTDIR)) - -#------------------------------------------------------------------------------- - #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available: eventually use native instead?) @@ -340,6 +281,69 @@ endif #------------------------------------------------------------------------------- +#=== Configure common compiler flags for C++ and CUDA/HIP + +INCFLAGS = -I. +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here + +# Dependency on src directory +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif +LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +INCFLAGS += -I../../src + +# Compiler-specific googletest build directory (#125 and #738) +ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) + override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) + override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else + override CXXNAME = unknown +endif +###$(info CXXNAME=$(CXXNAME)) +override CXXNAMESUFFIX = _$(CXXNAME) + +# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) +export CXXNAMESUFFIX + +# Dependency on test directory +# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) +# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an 
external or local gtest) +###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation +###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation +TESTDIRCOMMON = ../../../../../test +TESTDIRLOCAL = ../../test +ifneq ($(wildcard $(GTEST_ROOT)),) + TESTDIR = +else ifneq ($(LOCALGTEST),) + TESTDIR=$(TESTDIRLOCAL) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) + TESTDIR = $(TESTDIRCOMMON) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else + TESTDIR = +endif +ifneq ($(GTEST_ROOT),) + GTESTLIBDIR = $(GTEST_ROOT)/lib64/ + GTESTLIBS = $(GTESTLIBDIR)/libgtest.a + GTESTINC = -I$(GTEST_ROOT)/include +else + GTESTLIBDIR = + GTESTLIBS = + GTESTINC = +endif +###$(info GTEST_ROOT = $(GTEST_ROOT)) +###$(info LOCALGTEST = $(LOCALGTEST)) +###$(info TESTDIR = $(TESTDIR)) + +#------------------------------------------------------------------------------- + #=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) @@ -606,27 +610,25 @@ override RUNTIME = #=== Makefile TARGETS and build rules below #=============================================================================== -cxx_main=$(BUILDDIR)/check.exe -fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(GPUCC),) - gpu_main=$(BUILDDIR)/gcheck.exe - fgpu_main=$(BUILDDIR)/fgcheck.exe +ifeq ($(GPUCC),) + cxx_checkmain=$(BUILDDIR)/check_cpp.exe + cxx_fcheckmain=$(BUILDDIR)/fcheck_cpp.exe + cxx_testmain=$(BUILDDIR)/runTest_cpp.exe else - gpu_main= - fgpu_main= + gpu_checkmain=$(BUILDDIR)/check_$(GPUSUFFIX).exe + gpu_fcheckmain=$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe + gpu_testmain=$(BUILDDIR)/runTest_$(GPUSUFFIX).exe endif -testmain=$(BUILDDIR)/runTest.exe - # Explicitly define the default goal (this is not necessary as it is the first target, which is 
implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) # First target (default goal) -ifneq ($(GPUCC),) -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_main) $(fgpu_main) $(if $(GTESTLIBS),$(testmain)) +ifeq ($(GPUCC),) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_checkmain) $(cxx_fcheckmain) $(if $(GTESTLIBS),$(cxx_testmain)) else -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_main) $(fcxx_main) $(if $(GTESTLIBS),$(testmain)) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_checkmain) $(gpu_fcheckmain) $(if $(GTESTLIBS),$(gpu_testmain)) endif # Target (and build options): debug @@ -643,30 +645,30 @@ $(BUILDDIR)/.build.$(TAG): @if [ "$(oldtagsb)" != "" ]; then echo "Cannot build for tag=$(TAG) as old builds exist for other tags:"; echo " $(oldtagsb)"; echo "Please run 'make clean' first\nIf 'make clean' is not enough: run 'make clean USEBUILDDIR=1 AVX=$(AVX) FPTYPE=$(FPTYPE)' or 'make cleanall'"; exit 1; fi @touch $(BUILDDIR)/.build.$(TAG) -# Apply special build flags only to CrossSectionKernel[_$(GPUSUFFIX)].o (no fast math, see #117 and #516) +# Apply special build flags only to CrossSectionKernel_.o (no fast math, see #117 and #516) # Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math endif -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o (NVTX in timermap.h, #679) -$(BUILDDIR)/check_sa.o: CXXFLAGS += 
$(USE_NVTX) $(CUDA_INC) +# Apply special build flags only to check_sa_.o (NVTX in timermap.h, #679) +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o and (Cu|Hip)randRandomNumberKernel[_$(GPUSUFFIX)].o -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(RNDCXXFLAGS) +# Apply special build flags only to check_sa_.o and (Cu|Hip)randRandomNumberKernel_.o +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) ifeq ($(HASCURAND),hasCurand) # curand headers, #679 -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUDA_INC) +$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(CUDA_INC) endif ifeq ($(HASHIPRAND),hasHiprand) # hiprand headers -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(HIP_INC) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(HIP_INC) endif # Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) @@ -679,21 +681,21 @@ endif # Avoid clang warning "overriding '-ffp-contract=fast' option with '-ffp-contract=on'" (#516) # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) -###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -Wno-overriding-t-option ###ifneq ($(GPUCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option ###endif ###endif #### Apply special build flags only to CPPProcess.o (-flto) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += -flto +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += -flto #### Apply special build flags only to CPPProcess.o (AVXFLAGS) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += $(AVXFLAGS) +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += $(AVXFLAGS) # Generic target and build rules: objects from C++ compilation # (NB do not include CUDA_INC here! add it only for NVTX or curand #679) -$(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -718,8 +720,8 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o -cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) @@ -728,8 +730,8 @@ gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/ endif # Target (and build rules): C++ and CUDA/HIP shared libraries -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge.o -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge_cpp.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) @@ -756,63 +758,63 @@ endif #------------------------------------------------------------------------------- # Target (and build rules): C++ and CUDA/HIP standalone executables -$(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o - $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread 
$(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o $(RNDLIBFLAGS) +$(cxx_checkmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_checkmain): $(BUILDDIR)/check_sa_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o + $(CXX) -o $@ $(BUILDDIR)/check_sa_cpp.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o $(RNDLIBFLAGS) ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(gpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(gpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_checkmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_checkmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(gpu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(gpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(gpu_main): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o +$(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) 
$(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif #------------------------------------------------------------------------------- # Generic target and build rules: objects from Fortran compilation -$(BUILDDIR)/%.o : %.f *.inc +$(BUILDDIR)/%_fortran.o : %.f *.inc @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(FC) -I. -c $< -o $@ # Generic target and build rules: objects from Fortran compilation -###$(BUILDDIR)/%.o : %.f *.inc +###$(BUILDDIR)/%_fortran.o : %.f *.inc ### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi ### @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi ### $(FC) -I. -I$(INCDIR) -c $< -o $@ # Target (and build rules): Fortran standalone executables -###$(BUILDDIR)/fcheck_sa.o : $(INCDIR)/fbridge.inc +###$(BUILDDIR)/fcheck_sa_fortran.o : $(INCDIR)/fbridge.inc ifeq ($(UNAME_S),Darwin) -$(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(cxx_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +$(cxx_fcheckmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used 
#802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ else - $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) + $(CXX) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) endif ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fgpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(fgpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_fcheckmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_fcheckmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) -$(fgpu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fgpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fgpu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) +$(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link 
fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else - $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif endif @@ -820,83 +822,94 @@ endif # Target (and build rules): test objects and test executable ifeq ($(GPUCC),) -$(BUILDDIR)/testxxx.o: $(GTESTLIBS) -$(BUILDDIR)/testxxx.o: INCFLAGS += $(GTESTINC) -$(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions +$(BUILDDIR)/testxxx_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testxxx_cpp.o: INCFLAGS += $(GTESTINC) +$(BUILDDIR)/testxxx_cpp.o: testxxx_cc_ref.txt +$(cxx_testmain): $(BUILDDIR)/testxxx_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testxxx_cpp.o # Comment out this line to skip the C++ test of xxx functions else $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions +$(gpu_testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += 
$(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions endif ifneq ($(UNAME_S),Darwin) # Disable testmisc on Darwin (workaround for issue #838) ifeq ($(GPUCC),) -$(BUILDDIR)/testmisc.o: $(GTESTLIBS) -$(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests +$(BUILDDIR)/testmisc_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testmisc_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/testmisc_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testmisc_cpp.o # Comment out this line to skip the C++ miscellaneous tests else $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP miscellaneous tests +$(gpu_testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP miscellaneous tests endif endif ifeq ($(GPUCC),) -$(BUILDDIR)/runTest.o: $(GTESTLIBS) -$(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/runTest.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o +$(BUILDDIR)/runTest_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/runTest_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/runTest_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/runTest_cpp.o else $(BUILDDIR)/runTest_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/runTest_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to 
`__svml_cos4_l9') +$(gpu_testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o endif -$(testmain): $(GTESTLIBS) -$(testmain): INCFLAGS += $(GTESTINC) -$(testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest -###$(testmain): LIBFLAGS += -lgtest_main # no longer necessary since we added main() to testxxx.cc +ifeq ($(GPUCC),) +$(cxx_testmain): $(GTESTLIBS) +$(cxx_testmain): INCFLAGS += $(GTESTINC) +$(cxx_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +else +$(gpu_testmain): $(GTESTLIBS) +$(gpu_testmain): INCFLAGS += $(GTESTINC) +$(gpu_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +endif +ifeq ($(GPUCC),) # if at all, OMP is used only in CXX builds (not in GPU builds) ifneq ($(OMPFLAGS),) ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -$(testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) +$(cxx_testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -$(testmain): LIBFLAGS += -L $(shell 
dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 +$(cxx_testmain): LIBFLAGS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 ###else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -###$(testmain): LIBFLAGS += ???? # OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) +###$(cxx_testmain): LIBFLAGS += ???? # OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) else -$(testmain): LIBFLAGS += -lgomp +$(cxx_testmain): LIBFLAGS += -lgomp +endif endif endif # Test quadmath in testmisc.cc tests for constexpr_math #627 -###$(testmain): LIBFLAGS += -lquadmath +###ifeq ($(GPUCC),) +###$(cxx_testmain): LIBFLAGS += -lquadmath +###else +###$(gpu_testmain): LIBFLAGS += -lquadmath +###endif # Bypass std::filesystem completely to ease portability on LUMI #803 -#ifneq ($(findstring hipcc,$(GPUCC)),) -#$(testmain): LIBFLAGS += -lstdc++fs -#endif +###ifneq ($(findstring hipcc,$(GPUCC)),) +###$(gpu_testmain): LIBFLAGS += -lstdc++fs +###endif -ifeq ($(GPUCC),) # link only runTest.o -$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) +ifeq ($(GPUCC),) # link only runTest_cpp.o +$(cxx_testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest.o and runTest_$(GPUSUFFIX).o) -$(testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) +else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and 
runTest_$(GPUSUFFIX).o) +$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else @@ -1051,51 +1064,50 @@ endif #------------------------------------------------------------------------------- -# Target: check (run the C++ test executable) -# [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -# [FIXME: SHOULD CHANGE THE TARGET NAME "check" THAT HAS NOTHING TO DO WITH "check.exe"] -ifneq ($(GPUCC),) -check: runTest cmpFGcheck -else -check: runTest cmpFcheck -endif +# Target: 'make test' (execute runTest.exe, and compare check.exe with fcheck.exe) +# [NB: THIS IS WHAT IS TESTED IN THE GITHUB CI!] +# [NB: This used to be called 'make check' but the name has been changed as this has nothing to do with 'check.exe'] +test: runTest cmpFcheck -# Target: runTest (run the C++ test executable runTest.exe) +# Target: runTest (run the C++ or CUDA/HIP test executable runTest.exe) runTest: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/runTest.exe +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/runTest_cpp.exe +else + $(RUNTIME) $(BUILDDIR)/runTest_$(GPUSUFFIX).exe +endif -# Target: runCheck (run the C++ standalone executable check.exe, with a small number of events) +# Target: runCheck (run the C++ or CUDA/HIP standalone executable check.exe, with a small number of events) runCheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 - -# Target: runGcheck (run the CUDA/HIP standalone executable gcheck.exe, with a small number of events) -runGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/gcheck.exe -p 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/check_cpp.exe -p 2 32 2 +else + $(RUNTIME) 
$(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +endif -# Target: runFcheck (run the Fortran standalone executable - with C++ MEs - fcheck.exe, with a small number of events) +# Target: runFcheck (run the Fortran standalone executable - with C++ or CUDA/HIP MEs - fcheck.exe, with a small number of events) runFcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 - -# Target: runFGcheck (run the Fortran standalone executable - with CUDA/HIP MEs - fgcheck.exe, with a small number of events) -runFGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 +else + $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 +endif -# Target: cmpFcheck (compare ME results from the C++ and Fortran with C++ MEs standalone executables, with a small number of events) +# Target: cmpFcheck (compare ME results from the C++/CUDA/HIP and Fortran with C++/CUDA/HIP MEs standalone executables, with a small number of events) cmpFcheck: all.$(TAG) @echo - @echo "$(BUILDDIR)/check.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi - -# Target: cmpFGcheck (compare ME results from the CUDA/HIP and Fortran with CUDA/HIP MEs standalone executables, with a small number of events) -cmpFGcheck: all.$(TAG) - @echo - @echo "$(BUILDDIR)/gcheck.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fgcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +ifeq ($(GPUCC),) + @echo "$(BUILDDIR)/check_cpp.exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_cpp.exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_cpp.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +else + @echo "$(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/GPU) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +endif # Target: cuda-memcheck (run the CUDA standalone executable gcheck.exe with a small number of events through cuda-memcheck) cuda-memcheck: all.$(TAG) - $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/gcheck.exe -p 2 32 2 + $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2
#------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile b/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile index 22517d0cc6..6a21300f7a 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile @@ -58,13 +58,15 @@ endif LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias CUDACPP_MAKEFILE=cudacpp.mk -CUDACPP_COMMONLIB=mg5amc_common processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ifeq ($(BACKEND),cuda) +CUDACPP_COMMONLIB=mg5amc_common_cuda CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cuda else ifeq ($(BACKEND),hip) +CUDACPP_COMMONLIB=mg5amc_common_hip CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_hip else +CUDACPP_COMMONLIB=mg5amc_common_cpp CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cpp endif diff --git a/epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk b/epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk index 655d87b749..a66801a71e 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk @@ -98,7 +98,11 @@ endif # NB1: there are no CUDA targets in src as we avoid RDC! # NB2: CUDA includes for curand.h are no longer needed in the C++ code anywhere in src! -MG5AMC_COMMONLIB = mg5amc_common +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif # Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) @@ -126,7 +130,7 @@ $(LIBDIR)/.build.$(TAG): #------------------------------------------------------------------------------- # Generic target and build rules: objects from C++ compilation -$(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -139,22 +143,22 @@ endif #------------------------------------------------------------------------------- -cxx_objects=$(addprefix $(BUILDDIR)/, read_slha.o) -ifneq ($(GPUCC),) - gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_$(GPUSUFFIX).o) +cxx_objects=$(addprefix $(BUILDDIR)/, read_slha_cpp.o) +ifeq ($(GPUCC),) + cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_sm_cpp.o) else - cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_sm.o) + gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_$(GPUSUFFIX).o) endif # Target (and build rules): common (src) library -ifneq ($(GPUCC),) -$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) - @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) -else +ifeq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi $(CXX) -shared -o $@ $(cxx_objects) $(LDFLAGS) +else +$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) + @if [ ! 
-d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi + $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index 9f78c9e91d..f845f639cc 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005715131759643555  +DEBUG: model prefixing takes 0.00582575798034668  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -154,7 +154,7 @@ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Process has 2 diagrams -1 processes with 2 diagrams generated in 0.004 s +1 processes with 2 diagrams generated in 0.005 s Total: 1 processes with 2 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_ee_mumu Load PLUGIN.CUDACPP_OUTPUT @@ -184,7 +184,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.275 s +ALOHA: aloha creates 4 routines in 0.287 s FFV1 FFV1 FFV2 @@ -203,7 +203,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. 
quit -real 0m0.667s -user 0m0.613s -sys 0m0.050s +real 0m0.836s +user 0m0.670s +sys 0m0.059s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk index 50ce6bb9c8..89da34009a 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk @@ -54,65 +54,6 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA/HIP - -INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here - -# Dependency on src directory -MG5AMC_COMMONLIB = mg5amc_common -LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -INCFLAGS += -I../../src - -# Compiler-specific googletest build directory (#125 and #738) -ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) - override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) - override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) - override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else - override CXXNAME = unknown -endif -###$(info CXXNAME=$(CXXNAME)) -override CXXNAMESUFFIX = _$(CXXNAME) - -# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) -export CXXNAMESUFFIX - -# Dependency on test directory -# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) -# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) -###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation -###LOCALGTEST = yes# comment this 
out (or use make LOCALGTEST=yes) to build tests using a local gtest installation -TESTDIRCOMMON = ../../../../../test -TESTDIRLOCAL = ../../test -ifneq ($(wildcard $(GTEST_ROOT)),) - TESTDIR = -else ifneq ($(LOCALGTEST),) - TESTDIR=$(TESTDIRLOCAL) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) - TESTDIR = $(TESTDIRCOMMON) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else - TESTDIR = -endif -ifneq ($(GTEST_ROOT),) - GTESTLIBDIR = $(GTEST_ROOT)/lib64/ - GTESTLIBS = $(GTESTLIBDIR)/libgtest.a - GTESTINC = -I$(GTEST_ROOT)/include -else - GTESTLIBDIR = - GTESTLIBS = - GTESTINC = -endif -###$(info GTEST_ROOT = $(GTEST_ROOT)) -###$(info LOCALGTEST = $(LOCALGTEST)) -###$(info TESTDIR = $(TESTDIR)) - -#------------------------------------------------------------------------------- - #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available: eventually use native instead?) @@ -340,6 +281,69 @@ endif #------------------------------------------------------------------------------- +#=== Configure common compiler flags for C++ and CUDA/HIP + +INCFLAGS = -I. 
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here + +# Dependency on src directory +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif +LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +INCFLAGS += -I../../src + +# Compiler-specific googletest build directory (#125 and #738) +ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) + override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) + override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else + override CXXNAME = unknown +endif +###$(info CXXNAME=$(CXXNAME)) +override CXXNAMESUFFIX = _$(CXXNAME) + +# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) +export CXXNAMESUFFIX + +# Dependency on test directory +# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) +# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) +###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation +###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation +TESTDIRCOMMON = ../../../../../test +TESTDIRLOCAL = ../../test +ifneq ($(wildcard $(GTEST_ROOT)),) + TESTDIR = +else ifneq ($(LOCALGTEST),) + TESTDIR=$(TESTDIRLOCAL) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) + TESTDIR = $(TESTDIRCOMMON) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else + TESTDIR = +endif +ifneq 
($(GTEST_ROOT),) + GTESTLIBDIR = $(GTEST_ROOT)/lib64/ + GTESTLIBS = $(GTESTLIBDIR)/libgtest.a + GTESTINC = -I$(GTEST_ROOT)/include +else + GTESTLIBDIR = + GTESTLIBS = + GTESTINC = +endif +###$(info GTEST_ROOT = $(GTEST_ROOT)) +###$(info LOCALGTEST = $(LOCALGTEST)) +###$(info TESTDIR = $(TESTDIR)) + +#------------------------------------------------------------------------------- + #=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) @@ -606,27 +610,25 @@ override RUNTIME = #=== Makefile TARGETS and build rules below #=============================================================================== -cxx_main=$(BUILDDIR)/check.exe -fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(GPUCC),) - gpu_main=$(BUILDDIR)/gcheck.exe - fgpu_main=$(BUILDDIR)/fgcheck.exe +ifeq ($(GPUCC),) + cxx_checkmain=$(BUILDDIR)/check_cpp.exe + cxx_fcheckmain=$(BUILDDIR)/fcheck_cpp.exe + cxx_testmain=$(BUILDDIR)/runTest_cpp.exe else - gpu_main= - fgpu_main= + gpu_checkmain=$(BUILDDIR)/check_$(GPUSUFFIX).exe + gpu_fcheckmain=$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe + gpu_testmain=$(BUILDDIR)/runTest_$(GPUSUFFIX).exe endif -testmain=$(BUILDDIR)/runTest.exe - # Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) # First target (default goal) -ifneq ($(GPUCC),) -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_main) $(fgpu_main) $(if $(GTESTLIBS),$(testmain)) +ifeq ($(GPUCC),) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_checkmain) $(cxx_fcheckmain) $(if $(GTESTLIBS),$(cxx_testmain)) else -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_main) $(fcxx_main) $(if $(GTESTLIBS),$(testmain)) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_checkmain) $(gpu_fcheckmain) $(if $(GTESTLIBS),$(gpu_testmain)) endif # Target (and 
build options): debug @@ -643,30 +645,30 @@ $(BUILDDIR)/.build.$(TAG): @if [ "$(oldtagsb)" != "" ]; then echo "Cannot build for tag=$(TAG) as old builds exist for other tags:"; echo " $(oldtagsb)"; echo "Please run 'make clean' first\nIf 'make clean' is not enough: run 'make clean USEBUILDDIR=1 AVX=$(AVX) FPTYPE=$(FPTYPE)' or 'make cleanall'"; exit 1; fi @touch $(BUILDDIR)/.build.$(TAG) -# Apply special build flags only to CrossSectionKernel[_$(GPUSUFFIX)].o (no fast math, see #117 and #516) +# Apply special build flags only to CrossSectionKernel_.o (no fast math, see #117 and #516) # Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math endif -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o (NVTX in timermap.h, #679) -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) +# Apply special build flags only to check_sa_.o (NVTX in timermap.h, #679) +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o and (Cu|Hip)randRandomNumberKernel[_$(GPUSUFFIX)].o -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(RNDCXXFLAGS) +# Apply special build flags only to check_sa_.o and (Cu|Hip)randRandomNumberKernel_.o +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) 
+$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) ifeq ($(HASCURAND),hasCurand) # curand headers, #679 -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUDA_INC) +$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(CUDA_INC) endif ifeq ($(HASHIPRAND),hasHiprand) # hiprand headers -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(HIP_INC) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(HIP_INC) endif # Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) @@ -679,21 +681,21 @@ endif # Avoid clang warning "overriding '-ffp-contract=fast' option with '-ffp-contract=on'" (#516) # This patch does remove the warning, but I prefer to keep it disabled for the moment... ###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) -###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -Wno-overriding-t-option ###ifneq ($(GPUCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option ###endif ###endif #### Apply special build flags only to CPPProcess.o (-flto) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += -flto +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += -flto #### Apply special build flags only to CPPProcess.o (AVXFLAGS) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += $(AVXFLAGS) +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += $(AVXFLAGS) # Generic target and build rules: objects from C++ compilation # (NB do not include CUDA_INC here! 
add it only for NVTX or curand #679) -$(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -718,8 +720,8 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o -cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) @@ -728,8 +730,8 @@ gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/ endif # Target (and build rules): C++ and CUDA/HIP shared libraries -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge.o -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge_cpp.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) @@ -756,63 +758,63 @@ endif #------------------------------------------------------------------------------- # Target (and build rules): C++ and CUDA/HIP standalone executables -$(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cxx_main): $(BUILDDIR)/check_sa.o 
$(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o - $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o $(RNDLIBFLAGS) +$(cxx_checkmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_checkmain): $(BUILDDIR)/check_sa_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o + $(CXX) -o $@ $(BUILDDIR)/check_sa_cpp.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o $(RNDLIBFLAGS) ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(gpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(gpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_checkmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_checkmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(gpu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(gpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(gpu_main): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o 
+$(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif #------------------------------------------------------------------------------- # Generic target and build rules: objects from Fortran compilation -$(BUILDDIR)/%.o : %.f *.inc +$(BUILDDIR)/%_fortran.o : %.f *.inc @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(FC) -I. -c $< -o $@ # Generic target and build rules: objects from Fortran compilation -###$(BUILDDIR)/%.o : %.f *.inc +###$(BUILDDIR)/%_fortran.o : %.f *.inc ### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi ### @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi ### $(FC) -I. 
-I$(INCDIR) -c $< -o $@ # Target (and build rules): Fortran standalone executables -###$(BUILDDIR)/fcheck_sa.o : $(INCDIR)/fbridge.inc +###$(BUILDDIR)/fcheck_sa_fortran.o : $(INCDIR)/fbridge.inc ifeq ($(UNAME_S),Darwin) -$(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(cxx_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +$(cxx_fcheckmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ else - $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) + $(CXX) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) endif ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fgpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(fgpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_fcheckmain): LIBFLAGS += -lintlc # 
compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_fcheckmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) -$(fgpu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fgpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fgpu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) +$(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else - $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif endif @@ -820,83 +822,94 @@ endif # Target (and build rules): test objects and test executable ifeq ($(GPUCC),) -$(BUILDDIR)/testxxx.o: 
$(GTESTLIBS) -$(BUILDDIR)/testxxx.o: INCFLAGS += $(GTESTINC) -$(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions +$(BUILDDIR)/testxxx_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testxxx_cpp.o: INCFLAGS += $(GTESTINC) +$(BUILDDIR)/testxxx_cpp.o: testxxx_cc_ref.txt +$(cxx_testmain): $(BUILDDIR)/testxxx_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testxxx_cpp.o # Comment out this line to skip the C++ test of xxx functions else $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions +$(gpu_testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions endif ifneq ($(UNAME_S),Darwin) # Disable testmisc on Darwin (workaround for issue #838) ifeq ($(GPUCC),) -$(BUILDDIR)/testmisc.o: $(GTESTLIBS) -$(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests +$(BUILDDIR)/testmisc_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testmisc_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/testmisc_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testmisc_cpp.o # Comment out this line to skip the C++ miscellaneous tests else $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the 
CUDA/HIP miscellaneous tests +$(gpu_testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP miscellaneous tests endif endif ifeq ($(GPUCC),) -$(BUILDDIR)/runTest.o: $(GTESTLIBS) -$(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/runTest.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o +$(BUILDDIR)/runTest_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/runTest_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/runTest_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/runTest_cpp.o else $(BUILDDIR)/runTest_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/runTest_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o endif -$(testmain): $(GTESTLIBS) -$(testmain): INCFLAGS += $(GTESTINC) -$(testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest -###$(testmain): LIBFLAGS += -lgtest_main # no longer necessary 
since we added main() to testxxx.cc +ifeq ($(GPUCC),) +$(cxx_testmain): $(GTESTLIBS) +$(cxx_testmain): INCFLAGS += $(GTESTINC) +$(cxx_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +else +$(gpu_testmain): $(GTESTLIBS) +$(gpu_testmain): INCFLAGS += $(GTESTINC) +$(gpu_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +endif +ifeq ($(GPUCC),) # if at all, OMP is used only in CXX builds (not in GPU builds) ifneq ($(OMPFLAGS),) ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -$(testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) +$(cxx_testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -$(testmain): LIBFLAGS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 +$(cxx_testmain): LIBFLAGS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 ###else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -###$(testmain): LIBFLAGS += ???? # OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) +###$(cxx_testmain): LIBFLAGS += ???? 
# OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) else -$(testmain): LIBFLAGS += -lgomp +$(cxx_testmain): LIBFLAGS += -lgomp +endif endif endif # Test quadmath in testmisc.cc tests for constexpr_math #627 -###$(testmain): LIBFLAGS += -lquadmath +###ifeq ($(GPUCC),) +###$(cxx_testmain): LIBFLAGS += -lquadmath +###else +###$(gpu_testmain): LIBFLAGS += -lquadmath +###endif # Bypass std::filesystem completely to ease portability on LUMI #803 -#ifneq ($(findstring hipcc,$(GPUCC)),) -#$(testmain): LIBFLAGS += -lstdc++fs -#endif +###ifneq ($(findstring hipcc,$(GPUCC)),) +###$(gpu_testmain): LIBFLAGS += -lstdc++fs +###endif -ifeq ($(GPUCC),) # link only runTest.o -$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) +ifeq ($(GPUCC),) # link only runTest_cpp.o +$(cxx_testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest.o and runTest_$(GPUSUFFIX).o) -$(testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) +else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) +$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell 
dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else @@ -1051,51 +1064,50 @@ endif #------------------------------------------------------------------------------- -# Target: check (run the C++ test executable) -# [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -# [FIXME: SHOULD CHANGE THE TARGET NAME "check" THAT HAS NOTHING TO DO WITH "check.exe"] -ifneq ($(GPUCC),) -check: runTest cmpFGcheck -else -check: runTest cmpFcheck -endif +# Target: 'make test' (execute runTest.exe, and compare check.exe with fcheck.exe) +# [NB: THIS IS WHAT IS TESTED IN THE GITHUB CI!] +# [NB: This used to be called 'make check' but the name has been changed as this has nothing to do with 'check.exe'] +test: runTest cmpFcheck -# Target: runTest (run the C++ test executable runTest.exe) +# Target: runTest (run the C++ or CUDA/HIP test executable runTest.exe) runTest: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/runTest.exe +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/runTest_cpp.exe +else + $(RUNTIME) $(BUILDDIR)/runTest_$(GPUSUFFIX).exe +endif -# Target: runCheck (run the C++ standalone executable check.exe, with a small number of events) +# Target: runCheck (run the C++ or CUDA/HIP standalone executable check.exe, with a small number of events) runCheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 - -# Target: runGcheck (run the CUDA/HIP standalone executable gcheck.exe, with a small number of events) -runGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/gcheck.exe -p 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/check_cpp.exe -p 2 32 2 +else + $(RUNTIME) $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +endif -# Target: runFcheck (run the Fortran standalone executable - with C++ MEs - fcheck.exe, with a small number of events) +# Target: runFcheck (run the Fortran standalone executable - with C++ or CUDA/HIP MEs - fcheck.exe, with a small number of events) runFcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 - -# Target: runFGcheck (run the Fortran standalone 
executable - with CUDA/HIP MEs - fgcheck.exe, with a small number of events) -runFGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 +else + $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 +endif -# Target: cmpFcheck (compare ME results from the C++ and Fortran with C++ MEs standalone executables, with a small number of events) +# Target: cmpFcheck (compare ME results from the C++/CUDA/HIP and Fortran with C++/CUDA/HIP MEs standalone executables, with a small number of events) cmpFcheck: all.$(TAG) @echo - @echo "$(BUILDDIR)/check.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi - -# Target: cmpFGcheck (compare ME results from the CUDA/HIP and Fortran with CUDA/HIP MEs standalone executables, with a small number of events) -cmpFGcheck: all.$(TAG) - @echo - @echo "$(BUILDDIR)/gcheck.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fgcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! 
Fortran calculation (F77/GPU) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +ifeq ($(GPUCC),) + @echo "$(BUILDDIR)/check_cpp.exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_cpp.exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_cpp.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +else + @echo "$(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/GPU) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +endif # Target: cuda-memcheck (run the CUDA standalone executable gcheck.exe with a small number of events through cuda-memcheck) cuda-memcheck: all.$(TAG) - $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/gcheck.exe -p 2 32 2 + $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk b/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk index 655d87b749..a66801a71e 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk @@ -98,7 +98,11 @@ endif # NB1: there are no CUDA targets in src as we avoid RDC! # NB2: CUDA includes for curand.h are no longer needed in the C++ code anywhere in src!
-MG5AMC_COMMONLIB = mg5amc_common +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif # Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) @@ -126,7 +130,7 @@ $(LIBDIR)/.build.$(TAG): #------------------------------------------------------------------------------- # Generic target and build rules: objects from C++ compilation -$(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -139,22 +143,22 @@ endif #------------------------------------------------------------------------------- -cxx_objects=$(addprefix $(BUILDDIR)/, read_slha.o) -ifneq ($(GPUCC),) - gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_$(GPUSUFFIX).o) +cxx_objects=$(addprefix $(BUILDDIR)/, read_slha_cpp.o) +ifeq ($(GPUCC),) + cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_sm_cpp.o) else - cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_sm.o) + gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_$(GPUSUFFIX).o) endif # Target (and build rules): common (src) library -ifneq ($(GPUCC),) -$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) - @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) -else +ifeq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi $(CXX) -shared -o $@ $(cxx_objects) $(LDFLAGS) +else +$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) + @if [ ! 
-d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi + $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 58b78d00c9..bcd13f1f43 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005684614181518555  +DEBUG: model prefixing takes 0.005464315414428711  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -178,7 +178,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -198,12 +198,12 @@ Wrote files for 10 helas calls in 0.105 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.151 s +ALOHA: aloha creates 2 routines in 0.153 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.137 s +ALOHA: aloha creates 4 routines in 0.142 s VVV1 FFV1 FFV1 @@ -239,9 +239,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.020s -user 0m1.542s -sys 0m0.197s +real 0m1.771s +user 0m1.550s +sys 0m0.221s Code generation completed in 2 seconds ************************************************************ * * diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index 50ce6bb9c8..89da34009a 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -54,65 +54,6 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA/HIP - -INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here - -# Dependency on src directory -MG5AMC_COMMONLIB = mg5amc_common -LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -INCFLAGS += -I../../src - -# Compiler-specific googletest build directory (#125 and #738) -ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) - override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) - override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) - override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else - override CXXNAME = unknown -endif -###$(info CXXNAME=$(CXXNAME)) -override CXXNAMESUFFIX = _$(CXXNAME) - -# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) -export CXXNAMESUFFIX - -# Dependency on test directory -# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) -# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) -###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation -###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation -TESTDIRCOMMON = ../../../../../test -TESTDIRLOCAL = ../../test -ifneq ($(wildcard $(GTEST_ROOT)),) - TESTDIR = -else ifneq ($(LOCALGTEST),) - TESTDIR=$(TESTDIRLOCAL) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) - TESTDIR = $(TESTDIRCOMMON) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else - TESTDIR = -endif -ifneq ($(GTEST_ROOT),) - GTESTLIBDIR = $(GTEST_ROOT)/lib64/ - GTESTLIBS = 
$(GTESTLIBDIR)/libgtest.a - GTESTINC = -I$(GTEST_ROOT)/include -else - GTESTLIBDIR = - GTESTLIBS = - GTESTINC = -endif -###$(info GTEST_ROOT = $(GTEST_ROOT)) -###$(info LOCALGTEST = $(LOCALGTEST)) -###$(info TESTDIR = $(TESTDIR)) - -#------------------------------------------------------------------------------- - #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available: eventually use native instead?) @@ -340,6 +281,69 @@ endif #------------------------------------------------------------------------------- +#=== Configure common compiler flags for C++ and CUDA/HIP + +INCFLAGS = -I. +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here + +# Dependency on src directory +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif +LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +INCFLAGS += -I../../src + +# Compiler-specific googletest build directory (#125 and #738) +ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) + override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) + override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else + override CXXNAME = unknown +endif +###$(info CXXNAME=$(CXXNAME)) +override CXXNAMESUFFIX = _$(CXXNAME) + +# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) +export CXXNAMESUFFIX + +# Dependency on test directory +# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) +# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an 
external or local gtest) +###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation +###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation +TESTDIRCOMMON = ../../../../../test +TESTDIRLOCAL = ../../test +ifneq ($(wildcard $(GTEST_ROOT)),) + TESTDIR = +else ifneq ($(LOCALGTEST),) + TESTDIR=$(TESTDIRLOCAL) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) + TESTDIR = $(TESTDIRCOMMON) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else + TESTDIR = +endif +ifneq ($(GTEST_ROOT),) + GTESTLIBDIR = $(GTEST_ROOT)/lib64/ + GTESTLIBS = $(GTESTLIBDIR)/libgtest.a + GTESTINC = -I$(GTEST_ROOT)/include +else + GTESTLIBDIR = + GTESTLIBS = + GTESTINC = +endif +###$(info GTEST_ROOT = $(GTEST_ROOT)) +###$(info LOCALGTEST = $(LOCALGTEST)) +###$(info TESTDIR = $(TESTDIR)) + +#------------------------------------------------------------------------------- + #=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) @@ -606,27 +610,25 @@ override RUNTIME = #=== Makefile TARGETS and build rules below #=============================================================================== -cxx_main=$(BUILDDIR)/check.exe -fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(GPUCC),) - gpu_main=$(BUILDDIR)/gcheck.exe - fgpu_main=$(BUILDDIR)/fgcheck.exe +ifeq ($(GPUCC),) + cxx_checkmain=$(BUILDDIR)/check_cpp.exe + cxx_fcheckmain=$(BUILDDIR)/fcheck_cpp.exe + cxx_testmain=$(BUILDDIR)/runTest_cpp.exe else - gpu_main= - fgpu_main= + gpu_checkmain=$(BUILDDIR)/check_$(GPUSUFFIX).exe + gpu_fcheckmain=$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe + gpu_testmain=$(BUILDDIR)/runTest_$(GPUSUFFIX).exe endif -testmain=$(BUILDDIR)/runTest.exe - # Explicitly define the default goal (this is not necessary as it is the first target, which is 
implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) # First target (default goal) -ifneq ($(GPUCC),) -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_main) $(fgpu_main) $(if $(GTESTLIBS),$(testmain)) +ifeq ($(GPUCC),) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_checkmain) $(cxx_fcheckmain) $(if $(GTESTLIBS),$(cxx_testmain)) else -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_main) $(fcxx_main) $(if $(GTESTLIBS),$(testmain)) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_checkmain) $(gpu_fcheckmain) $(if $(GTESTLIBS),$(gpu_testmain)) endif # Target (and build options): debug @@ -643,30 +645,30 @@ $(BUILDDIR)/.build.$(TAG): @if [ "$(oldtagsb)" != "" ]; then echo "Cannot build for tag=$(TAG) as old builds exist for other tags:"; echo " $(oldtagsb)"; echo "Please run 'make clean' first\nIf 'make clean' is not enough: run 'make clean USEBUILDDIR=1 AVX=$(AVX) FPTYPE=$(FPTYPE)' or 'make cleanall'"; exit 1; fi @touch $(BUILDDIR)/.build.$(TAG) -# Apply special build flags only to CrossSectionKernel[_$(GPUSUFFIX)].o (no fast math, see #117 and #516) +# Apply special build flags only to CrossSectionKernel_<cpp|$(GPUSUFFIX)>.o (no fast math, see #117 and #516) # Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math endif -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o (NVTX in timermap.h, #679) -$(BUILDDIR)/check_sa.o: CXXFLAGS +=
$(USE_NVTX) $(CUDA_INC) +# Apply special build flags only to check_sa_<cpp|$(GPUSUFFIX)>.o (NVTX in timermap.h, #679) +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o and (Cu|Hip)randRandomNumberKernel[_$(GPUSUFFIX)].o -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(RNDCXXFLAGS) +# Apply special build flags only to check_sa_<cpp|$(GPUSUFFIX)>.o and (Cu|Hip)randRandomNumberKernel_<cpp|$(GPUSUFFIX)>.o +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) ifeq ($(HASCURAND),hasCurand) # curand headers, #679 -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUDA_INC) +$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(CUDA_INC) endif ifeq ($(HASHIPRAND),hasHiprand) # hiprand headers -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(HIP_INC) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(HIP_INC) endif # Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) @@ -679,21 +681,21 @@ endif # Avoid clang warning "overriding '-ffp-contract=fast' option with '-ffp-contract=on'" (#516) # This patch does remove the warning, but I prefer to keep it disabled for the moment...
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) -###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -Wno-overriding-t-option ###ifneq ($(GPUCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option ###endif ###endif #### Apply special build flags only to CPPProcess.o (-flto) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += -flto +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += -flto #### Apply special build flags only to CPPProcess.o (AVXFLAGS) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += $(AVXFLAGS) +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += $(AVXFLAGS) # Generic target and build rules: objects from C++ compilation # (NB do not include CUDA_INC here! add it only for NVTX or curand #679) -$(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -718,8 +720,8 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o -cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) @@ -728,8 +730,8 @@ gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/ endif # Target (and build rules): C++ and CUDA/HIP shared libraries -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge.o -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge_cpp.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) @@ -756,63 +758,63 @@ endif #------------------------------------------------------------------------------- # Target (and build rules): C++ and CUDA/HIP standalone executables -$(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o - $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread 
$(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o $(RNDLIBFLAGS) +$(cxx_checkmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_checkmain): $(BUILDDIR)/check_sa_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o + $(CXX) -o $@ $(BUILDDIR)/check_sa_cpp.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o $(RNDLIBFLAGS) ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(gpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(gpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_checkmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_checkmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(gpu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(gpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(gpu_main): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o +$(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) 
$(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif #------------------------------------------------------------------------------- # Generic target and build rules: objects from Fortran compilation -$(BUILDDIR)/%.o : %.f *.inc +$(BUILDDIR)/%_fortran.o : %.f *.inc @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(FC) -I. -c $< -o $@ # Generic target and build rules: objects from Fortran compilation -###$(BUILDDIR)/%.o : %.f *.inc +###$(BUILDDIR)/%_fortran.o : %.f *.inc ### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi ### @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi ### $(FC) -I. -I$(INCDIR) -c $< -o $@ # Target (and build rules): Fortran standalone executables -###$(BUILDDIR)/fcheck_sa.o : $(INCDIR)/fbridge.inc +###$(BUILDDIR)/fcheck_sa_fortran.o : $(INCDIR)/fbridge.inc ifeq ($(UNAME_S),Darwin) -$(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(cxx_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +$(cxx_fcheckmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used 
#802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ else - $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) + $(CXX) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) endif ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fgpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(fgpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_fcheckmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_fcheckmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) -$(fgpu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fgpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fgpu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) +$(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link 
fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else - $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif endif @@ -820,83 +822,94 @@ endif # Target (and build rules): test objects and test executable ifeq ($(GPUCC),) -$(BUILDDIR)/testxxx.o: $(GTESTLIBS) -$(BUILDDIR)/testxxx.o: INCFLAGS += $(GTESTINC) -$(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions +$(BUILDDIR)/testxxx_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testxxx_cpp.o: INCFLAGS += $(GTESTINC) +$(BUILDDIR)/testxxx_cpp.o: testxxx_cc_ref.txt +$(cxx_testmain): $(BUILDDIR)/testxxx_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testxxx_cpp.o # Comment out this line to skip the C++ test of xxx functions else $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions +$(gpu_testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += 
$(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions endif ifneq ($(UNAME_S),Darwin) # Disable testmisc on Darwin (workaround for issue #838) ifeq ($(GPUCC),) -$(BUILDDIR)/testmisc.o: $(GTESTLIBS) -$(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests +$(BUILDDIR)/testmisc_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testmisc_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/testmisc_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testmisc_cpp.o # Comment out this line to skip the C++ miscellaneous tests else $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP miscellaneous tests +$(gpu_testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP miscellaneous tests endif endif ifeq ($(GPUCC),) -$(BUILDDIR)/runTest.o: $(GTESTLIBS) -$(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/runTest.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o +$(BUILDDIR)/runTest_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/runTest_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/runTest_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/runTest_cpp.o else $(BUILDDIR)/runTest_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/runTest_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to 
`__svml_cos4_l9') +$(gpu_testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o endif -$(testmain): $(GTESTLIBS) -$(testmain): INCFLAGS += $(GTESTINC) -$(testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest -###$(testmain): LIBFLAGS += -lgtest_main # no longer necessary since we added main() to testxxx.cc +ifeq ($(GPUCC),) +$(cxx_testmain): $(GTESTLIBS) +$(cxx_testmain): INCFLAGS += $(GTESTINC) +$(cxx_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +else +$(gpu_testmain): $(GTESTLIBS) +$(gpu_testmain): INCFLAGS += $(GTESTINC) +$(gpu_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +endif +ifeq ($(GPUCC),) # if at all, OMP is used only in CXX builds (not in GPU builds) ifneq ($(OMPFLAGS),) ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -$(testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) +$(cxx_testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -$(testmain): LIBFLAGS += -L $(shell 
dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 +$(cxx_testmain): LIBFLAGS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 ###else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -###$(testmain): LIBFLAGS += ???? # OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) +###$(cxx_testmain): LIBFLAGS += ???? # OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) else -$(testmain): LIBFLAGS += -lgomp +$(cxx_testmain): LIBFLAGS += -lgomp +endif endif endif # Test quadmath in testmisc.cc tests for constexpr_math #627 -###$(testmain): LIBFLAGS += -lquadmath +###ifeq ($(GPUCC),) +###$(cxx_testmain): LIBFLAGS += -lquadmath +###else +###$(gpu_testmain): LIBFLAGS += -lquadmath +###endif # Bypass std::filesystem completely to ease portability on LUMI #803 -#ifneq ($(findstring hipcc,$(GPUCC)),) -#$(testmain): LIBFLAGS += -lstdc++fs -#endif +###ifneq ($(findstring hipcc,$(GPUCC)),) +###$(gpu_testmain): LIBFLAGS += -lstdc++fs +###endif -ifeq ($(GPUCC),) # link only runTest.o -$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) +ifeq ($(GPUCC),) # link only runTest_cpp.o +$(cxx_testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest.o and runTest_$(GPUSUFFIX).o) -$(testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) +else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and 
runTest_$(GPUSUFFIX).o) +$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else @@ -1051,51 +1064,50 @@ endif #------------------------------------------------------------------------------- -# Target: check (run the C++ test executable) -# [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -# [FIXME: SHOULD CHANGE THE TARGET NAME "check" THAT HAS NOTHING TO DO WITH "check.exe"] -ifneq ($(GPUCC),) -check: runTest cmpFGcheck -else -check: runTest cmpFcheck -endif +# Target: 'make test' (execute runTest.exe, and compare check.exe with fcheck.exe) +# [NB: THIS IS WHAT IS TESTED IN THE GITHUB CI!] +# [NB: This used to be called 'make check' but the name has been changed as this has nothing to do with 'check.exe'] +test: runTest cmpFcheck -# Target: runTest (run the C++ test executable runTest.exe) +# Target: runTest (run the C++ or CUDA/HIP test executable runTest.exe) runTest: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/runTest.exe +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/runTest_cpp.exe +else + $(RUNTIME) $(BUILDDIR)/runTest_$(GPUSUFFIX).exe +endif -# Target: runCheck (run the C++ standalone executable check.exe, with a small number of events) +# Target: runCheck (run the C++ or CUDA/HIP standalone executable check.exe, with a small number of events) runCheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 - -# Target: runGcheck (run the CUDA/HIP standalone executable gcheck.exe, with a small number of events) -runGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/gcheck.exe -p 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/check_cpp.exe -p 2 32 2 +else + $(RUNTIME) 
$(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +endif -# Target: runFcheck (run the Fortran standalone executable - with C++ MEs - fcheck.exe, with a small number of events) +# Target: runFcheck (run the Fortran standalone executable - with C++ or CUDA/HIP MEs - fcheck.exe, with a small number of events) runFcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 - -# Target: runFGcheck (run the Fortran standalone executable - with CUDA/HIP MEs - fgcheck.exe, with a small number of events) -runFGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 +else + $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 +endif -# Target: cmpFcheck (compare ME results from the C++ and Fortran with C++ MEs standalone executables, with a small number of events) +# Target: cmpFcheck (compare ME results from the C++/CUDA/HIP and Fortran with C++/CUDA/HIP MEs standalone executables, with a small number of events) cmpFcheck: all.$(TAG) @echo - @echo "$(BUILDDIR)/check.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi - -# Target: cmpFGcheck (compare ME results from the CUDA/HIP and Fortran with CUDA/HIP MEs standalone executables, with a small number of events) -cmpFGcheck: all.$(TAG) - @echo - @echo "$(BUILDDIR)/gcheck.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fgcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +ifeq ($(GPUCC),) + @echo "$(BUILDDIR)/check_cpp.exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_cpp.exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_cpp.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +else + @echo "$(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/GPU) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +endif # Target: cuda-memcheck (run the CUDA standalone executable gcheck.exe with a small number of events through cuda-memcheck) cuda-memcheck: all.$(TAG) - $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/gcheck.exe -p 2 32 2 + $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2
#------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile b/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile index 22517d0cc6..6a21300f7a 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile @@ -58,13 +58,15 @@ endif LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias CUDACPP_MAKEFILE=cudacpp.mk -CUDACPP_COMMONLIB=mg5amc_common processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ifeq ($(BACKEND),cuda) +CUDACPP_COMMONLIB=mg5amc_common_cuda CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cuda else ifeq ($(BACKEND),hip) +CUDACPP_COMMONLIB=mg5amc_common_hip CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_hip else +CUDACPP_COMMONLIB=mg5amc_common_cpp CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cpp endif diff --git a/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk index 655d87b749..a66801a71e 100644 --- a/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk @@ -98,7 +98,11 @@ endif # NB1: there are no CUDA targets in src as we avoid RDC! # NB2: CUDA includes for curand.h are no longer needed in the C++ code anywhere in src! -MG5AMC_COMMONLIB = mg5amc_common +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif # Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) @@ -126,7 +130,7 @@ $(LIBDIR)/.build.$(TAG): #------------------------------------------------------------------------------- # Generic target and build rules: objects from C++ compilation -$(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -139,22 +143,22 @@ endif #------------------------------------------------------------------------------- -cxx_objects=$(addprefix $(BUILDDIR)/, read_slha.o) -ifneq ($(GPUCC),) - gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_$(GPUSUFFIX).o) +cxx_objects=$(addprefix $(BUILDDIR)/, read_slha_cpp.o) +ifeq ($(GPUCC),) + cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_sm_cpp.o) else - cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_sm.o) + gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_$(GPUSUFFIX).o) endif # Target (and build rules): common (src) library -ifneq ($(GPUCC),) -$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) - @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) -else +ifeq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi $(CXX) -shared -o $@ $(cxx_objects) $(LDFLAGS) +else +$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) + @if [ ! 
-d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi + $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index fa79b1b42c..74599408a5 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005660295486450195  +DEBUG: model prefixing takes 0.005803108215332031  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -183,7 +183,7 @@ Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.161 s +ALOHA: aloha creates 2 routines in 0.149 s VVV1 FFV1 FFV1 @@ -198,7 +198,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. 
quit -real 0m0.780s -user 0m0.489s -sys 0m0.054s -Code generation completed in 1 seconds +real 0m0.703s +user 0m0.492s +sys 0m0.049s +Code generation completed in 0 seconds diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk index 50ce6bb9c8..89da34009a 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk @@ -54,65 +54,6 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA/HIP - -INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here - -# Dependency on src directory -MG5AMC_COMMONLIB = mg5amc_common -LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -INCFLAGS += -I../../src - -# Compiler-specific googletest build directory (#125 and #738) -ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) - override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) - override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) - override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else - override CXXNAME = unknown -endif -###$(info CXXNAME=$(CXXNAME)) -override CXXNAMESUFFIX = _$(CXXNAME) - -# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) -export CXXNAMESUFFIX - -# Dependency on test directory -# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) -# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) -###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation 
-###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation -TESTDIRCOMMON = ../../../../../test -TESTDIRLOCAL = ../../test -ifneq ($(wildcard $(GTEST_ROOT)),) - TESTDIR = -else ifneq ($(LOCALGTEST),) - TESTDIR=$(TESTDIRLOCAL) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) - TESTDIR = $(TESTDIRCOMMON) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else - TESTDIR = -endif -ifneq ($(GTEST_ROOT),) - GTESTLIBDIR = $(GTEST_ROOT)/lib64/ - GTESTLIBS = $(GTESTLIBDIR)/libgtest.a - GTESTINC = -I$(GTEST_ROOT)/include -else - GTESTLIBDIR = - GTESTLIBS = - GTESTINC = -endif -###$(info GTEST_ROOT = $(GTEST_ROOT)) -###$(info LOCALGTEST = $(LOCALGTEST)) -###$(info TESTDIR = $(TESTDIR)) - -#------------------------------------------------------------------------------- - #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available: eventually use native instead?) @@ -340,6 +281,69 @@ endif #------------------------------------------------------------------------------- +#=== Configure common compiler flags for C++ and CUDA/HIP + +INCFLAGS = -I. 
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here + +# Dependency on src directory +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif +LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +INCFLAGS += -I../../src + +# Compiler-specific googletest build directory (#125 and #738) +ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) + override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) + override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else + override CXXNAME = unknown +endif +###$(info CXXNAME=$(CXXNAME)) +override CXXNAMESUFFIX = _$(CXXNAME) + +# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) +export CXXNAMESUFFIX + +# Dependency on test directory +# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) +# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) +###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation +###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation +TESTDIRCOMMON = ../../../../../test +TESTDIRLOCAL = ../../test +ifneq ($(wildcard $(GTEST_ROOT)),) + TESTDIR = +else ifneq ($(LOCALGTEST),) + TESTDIR=$(TESTDIRLOCAL) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) + TESTDIR = $(TESTDIRCOMMON) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else + TESTDIR = +endif +ifneq 
($(GTEST_ROOT),) + GTESTLIBDIR = $(GTEST_ROOT)/lib64/ + GTESTLIBS = $(GTESTLIBDIR)/libgtest.a + GTESTINC = -I$(GTEST_ROOT)/include +else + GTESTLIBDIR = + GTESTLIBS = + GTESTINC = +endif +###$(info GTEST_ROOT = $(GTEST_ROOT)) +###$(info LOCALGTEST = $(LOCALGTEST)) +###$(info TESTDIR = $(TESTDIR)) + +#------------------------------------------------------------------------------- + #=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) @@ -606,27 +610,25 @@ override RUNTIME = #=== Makefile TARGETS and build rules below #=============================================================================== -cxx_main=$(BUILDDIR)/check.exe -fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(GPUCC),) - gpu_main=$(BUILDDIR)/gcheck.exe - fgpu_main=$(BUILDDIR)/fgcheck.exe +ifeq ($(GPUCC),) + cxx_checkmain=$(BUILDDIR)/check_cpp.exe + cxx_fcheckmain=$(BUILDDIR)/fcheck_cpp.exe + cxx_testmain=$(BUILDDIR)/runTest_cpp.exe else - gpu_main= - fgpu_main= + gpu_checkmain=$(BUILDDIR)/check_$(GPUSUFFIX).exe + gpu_fcheckmain=$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe + gpu_testmain=$(BUILDDIR)/runTest_$(GPUSUFFIX).exe endif -testmain=$(BUILDDIR)/runTest.exe - # Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) # First target (default goal) -ifneq ($(GPUCC),) -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_main) $(fgpu_main) $(if $(GTESTLIBS),$(testmain)) +ifeq ($(GPUCC),) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_checkmain) $(cxx_fcheckmain) $(if $(GTESTLIBS),$(cxx_testmain)) else -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_main) $(fcxx_main) $(if $(GTESTLIBS),$(testmain)) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_checkmain) $(gpu_fcheckmain) $(if $(GTESTLIBS),$(gpu_testmain)) endif # Target (and 
build options): debug @@ -643,30 +645,30 @@ $(BUILDDIR)/.build.$(TAG): @if [ "$(oldtagsb)" != "" ]; then echo "Cannot build for tag=$(TAG) as old builds exist for other tags:"; echo " $(oldtagsb)"; echo "Please run 'make clean' first\nIf 'make clean' is not enough: run 'make clean USEBUILDDIR=1 AVX=$(AVX) FPTYPE=$(FPTYPE)' or 'make cleanall'"; exit 1; fi @touch $(BUILDDIR)/.build.$(TAG) -# Apply special build flags only to CrossSectionKernel[_$(GPUSUFFIX)].o (no fast math, see #117 and #516) +# Apply special build flags only to CrossSectionKernel_.o (no fast math, see #117 and #516) # Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math endif -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o (NVTX in timermap.h, #679) -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) +# Apply special build flags only to check_sa_.o (NVTX in timermap.h, #679) +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o and (Cu|Hip)randRandomNumberKernel[_$(GPUSUFFIX)].o -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(RNDCXXFLAGS) +# Apply special build flags only to check_sa_.o and (Cu|Hip)randRandomNumberKernel_.o +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) 
+$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) ifeq ($(HASCURAND),hasCurand) # curand headers, #679 -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUDA_INC) +$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(CUDA_INC) endif ifeq ($(HASHIPRAND),hasHiprand) # hiprand headers -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(HIP_INC) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(HIP_INC) endif # Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) @@ -679,21 +681,21 @@ endif # Avoid clang warning "overriding '-ffp-contract=fast' option with '-ffp-contract=on'" (#516) # This patch does remove the warning, but I prefer to keep it disabled for the moment... ###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) -###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -Wno-overriding-t-option ###ifneq ($(GPUCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option ###endif ###endif #### Apply special build flags only to CPPProcess.o (-flto) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += -flto +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += -flto #### Apply special build flags only to CPPProcess.o (AVXFLAGS) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += $(AVXFLAGS) +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += $(AVXFLAGS) # Generic target and build rules: objects from C++ compilation # (NB do not include CUDA_INC here! 
add it only for NVTX or curand #679) -$(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -718,8 +720,8 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o -cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) @@ -728,8 +730,8 @@ gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/ endif # Target (and build rules): C++ and CUDA/HIP shared libraries -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge.o -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge_cpp.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) @@ -756,63 +758,63 @@ endif #------------------------------------------------------------------------------- # Target (and build rules): C++ and CUDA/HIP standalone executables -$(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cxx_main): $(BUILDDIR)/check_sa.o 
$(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o - $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o $(RNDLIBFLAGS) +$(cxx_checkmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_checkmain): $(BUILDDIR)/check_sa_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o + $(CXX) -o $@ $(BUILDDIR)/check_sa_cpp.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o $(RNDLIBFLAGS) ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(gpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(gpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_checkmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_checkmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(gpu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(gpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(gpu_main): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o 
+$(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif #------------------------------------------------------------------------------- # Generic target and build rules: objects from Fortran compilation -$(BUILDDIR)/%.o : %.f *.inc +$(BUILDDIR)/%_fortran.o : %.f *.inc @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(FC) -I. -c $< -o $@ # Generic target and build rules: objects from Fortran compilation -###$(BUILDDIR)/%.o : %.f *.inc +###$(BUILDDIR)/%_fortran.o : %.f *.inc ### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi ### @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi ### $(FC) -I. 
-I$(INCDIR) -c $< -o $@ # Target (and build rules): Fortran standalone executables -###$(BUILDDIR)/fcheck_sa.o : $(INCDIR)/fbridge.inc +###$(BUILDDIR)/fcheck_sa_fortran.o : $(INCDIR)/fbridge.inc ifeq ($(UNAME_S),Darwin) -$(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(cxx_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +$(cxx_fcheckmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ else - $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) + $(CXX) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) endif ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fgpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(fgpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_fcheckmain): LIBFLAGS += -lintlc # 
compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_fcheckmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) -$(fgpu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fgpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fgpu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) +$(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else - $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif endif @@ -820,83 +822,94 @@ endif # Target (and build rules): test objects and test executable ifeq ($(GPUCC),) -$(BUILDDIR)/testxxx.o: 
$(GTESTLIBS) -$(BUILDDIR)/testxxx.o: INCFLAGS += $(GTESTINC) -$(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions +$(BUILDDIR)/testxxx_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testxxx_cpp.o: INCFLAGS += $(GTESTINC) +$(BUILDDIR)/testxxx_cpp.o: testxxx_cc_ref.txt +$(cxx_testmain): $(BUILDDIR)/testxxx_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testxxx_cpp.o # Comment out this line to skip the C++ test of xxx functions else $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions +$(gpu_testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions endif ifneq ($(UNAME_S),Darwin) # Disable testmisc on Darwin (workaround for issue #838) ifeq ($(GPUCC),) -$(BUILDDIR)/testmisc.o: $(GTESTLIBS) -$(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests +$(BUILDDIR)/testmisc_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testmisc_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/testmisc_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testmisc_cpp.o # Comment out this line to skip the C++ miscellaneous tests else $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the 
CUDA/HIP miscellaneous tests +$(gpu_testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP miscellaneous tests endif endif ifeq ($(GPUCC),) -$(BUILDDIR)/runTest.o: $(GTESTLIBS) -$(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/runTest.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o +$(BUILDDIR)/runTest_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/runTest_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/runTest_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/runTest_cpp.o else $(BUILDDIR)/runTest_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/runTest_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o endif -$(testmain): $(GTESTLIBS) -$(testmain): INCFLAGS += $(GTESTINC) -$(testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest -###$(testmain): LIBFLAGS += -lgtest_main # no longer necessary 
since we added main() to testxxx.cc +ifeq ($(GPUCC),) +$(cxx_testmain): $(GTESTLIBS) +$(cxx_testmain): INCFLAGS += $(GTESTINC) +$(cxx_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +else +$(gpu_testmain): $(GTESTLIBS) +$(gpu_testmain): INCFLAGS += $(GTESTINC) +$(gpu_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +endif +ifeq ($(GPUCC),) # if at all, OMP is used only in CXX builds (not in GPU builds) ifneq ($(OMPFLAGS),) ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -$(testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) +$(cxx_testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -$(testmain): LIBFLAGS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 +$(cxx_testmain): LIBFLAGS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 ###else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -###$(testmain): LIBFLAGS += ???? # OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) +###$(cxx_testmain): LIBFLAGS += ???? 
# OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) else -$(testmain): LIBFLAGS += -lgomp +$(cxx_testmain): LIBFLAGS += -lgomp +endif endif endif # Test quadmath in testmisc.cc tests for constexpr_math #627 -###$(testmain): LIBFLAGS += -lquadmath +###ifeq ($(GPUCC),) +###$(cxx_testmain): LIBFLAGS += -lquadmath +###else +###$(gpu_testmain): LIBFLAGS += -lquadmath +###endif # Bypass std::filesystem completely to ease portability on LUMI #803 -#ifneq ($(findstring hipcc,$(GPUCC)),) -#$(testmain): LIBFLAGS += -lstdc++fs -#endif +###ifneq ($(findstring hipcc,$(GPUCC)),) +###$(gpu_testmain): LIBFLAGS += -lstdc++fs +###endif -ifeq ($(GPUCC),) # link only runTest.o -$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) +ifeq ($(GPUCC),) # link only runTest_cpp.o +$(cxx_testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest.o and runTest_$(GPUSUFFIX).o) -$(testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) +else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) +$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell 
dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else @@ -1051,51 +1064,50 @@ endif #------------------------------------------------------------------------------- -# Target: check (run the C++ test executable) -# [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -# [FIXME: SHOULD CHANGE THE TARGET NAME "check" THAT HAS NOTHING TO DO WITH "check.exe"] -ifneq ($(GPUCC),) -check: runTest cmpFGcheck -else -check: runTest cmpFcheck -endif +# Target: 'make test' (execute runTest.exe, and compare check.exe with fcheck.exe) +# [NB: THIS IS WHAT IS TESTED IN THE GITHUB CI!] +# [NB: This used to be called 'make check' but the name has been changed as this has nothing to do with 'check.exe'] +test: runTest cmpFcheck -# Target: runTest (run the C++ test executable runTest.exe) +# Target: runTest (run the C++ or CUDA/HIP test executable runTest.exe) runTest: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/runTest.exe +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/runTest_cpp.exe +else + $(RUNTIME) $(BUILDDIR)/runTest_$(GPUSUFFIX).exe +endif -# Target: runCheck (run the C++ standalone executable check.exe, with a small number of events) +# Target: runCheck (run the C++ or CUDA/HIP standalone executable check.exe, with a small number of events) runCheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 - -# Target: runGcheck (run the CUDA/HIP standalone executable gcheck.exe, with a small number of events) -runGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/gcheck.exe -p 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/check_cpp.exe -p 2 32 2 +else + $(RUNTIME) $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +endif -# Target: runFcheck (run the Fortran standalone executable - with C++ MEs - fcheck.exe, with a small number of events) +# Target: runFcheck (run the Fortran standalone executable - with C++ or CUDA/HIP MEs - fcheck.exe, with a small number of events) runFcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 - -# Target: runFGcheck (run the Fortran standalone 
executable - with CUDA/HIP MEs - fgcheck.exe, with a small number of events) -runFGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 +else + $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 +endif -# Target: cmpFcheck (compare ME results from the C++ and Fortran with C++ MEs standalone executables, with a small number of events) +# Target: cmpFcheck (compare ME results from the C++/CUDA/HIP and Fortran with C++/CUDA/HIP MEs standalone executables, with a small number of events) cmpFcheck: all.$(TAG) @echo - @echo "$(BUILDDIR)/check.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi - -# Target: cmpFGcheck (compare ME results from the CUDA/HIP and Fortran with CUDA/HIP MEs standalone executables, with a small number of events) -cmpFGcheck: all.$(TAG) - @echo - @echo "$(BUILDDIR)/gcheck.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fgcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! 
Fortran calculation (F77/GPU) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +ifeq ($(GPUCC),) + @echo "$(BUILDDIR)/check_cpp.exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_cpp.exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_cpp.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +else + @echo "$(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +endif # Target: cuda-memcheck (run the CUDA standalone executable gcheck.exe with a small number of events through cuda-memcheck) cuda-memcheck: all.$(TAG) - $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/gcheck.exe -p 2 32 2 + $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk index 655d87b749..a66801a71e 100644 --- a/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk @@ -98,7 +98,11 @@ endif # NB1: there are no CUDA targets in src as we avoid RDC! # NB2: CUDA includes for curand.h are no longer needed in the C++ code anywhere in src! 
-MG5AMC_COMMONLIB = mg5amc_common +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif # Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) @@ -126,7 +130,7 @@ $(LIBDIR)/.build.$(TAG): #------------------------------------------------------------------------------- # Generic target and build rules: objects from C++ compilation -$(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -139,22 +143,22 @@ endif #------------------------------------------------------------------------------- -cxx_objects=$(addprefix $(BUILDDIR)/, read_slha.o) -ifneq ($(GPUCC),) - gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_$(GPUSUFFIX).o) +cxx_objects=$(addprefix $(BUILDDIR)/, read_slha_cpp.o) +ifeq ($(GPUCC),) + cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_sm_cpp.o) else - cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_sm.o) + gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_$(GPUSUFFIX).o) endif # Target (and build rules): common (src) library -ifneq ($(GPUCC),) -$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) - @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) -else +ifeq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi $(CXX) -shared -o $@ $(cxx_objects) $(LDFLAGS) +else +$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) + @if [ ! 
-d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi + $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt index 3c3ef65898..e0bc256894 100644 --- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt +++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005795478820800781  +DEBUG: model prefixing takes 0.005784273147583008  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.008 s +1 processes with 3 diagrams generated in 0.009 s Total: 1 processes with 3 diagrams add process g g > t t~ g INFO: Checking for minimal orders which gives processes. @@ -163,7 +163,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @2 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.020 s +1 processes with 16 diagrams generated in 0.021 s Total: 2 processes with 19 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -188,7 +188,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P2_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -205,7 +205,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -220,15 +220,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -Generated helas calls for 2 subprocesses (19 diagrams) in 0.044 s -Wrote files for 46 helas calls in 0.253 s +Generated helas calls for 2 subprocesses (19 diagrams) in 0.045 s +Wrote files for 46 helas calls in 0.279 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.344 s +ALOHA: aloha creates 5 routines in 0.338 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -236,7 +236,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.328 s +ALOHA: aloha creates 10 routines in 0.323 s VVV1 VVV1 FFV1 @@ -285,9 +285,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m2.376s -user 0m2.126s -sys 0m0.249s +real 0m2.389s +user 0m2.141s +sys 0m0.224s Code generation completed in 3 seconds ************************************************************ * * diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk index 50ce6bb9c8..89da34009a 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk @@ -54,65 +54,6 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA/HIP - -INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here - -# Dependency on src directory -MG5AMC_COMMONLIB = mg5amc_common -LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -INCFLAGS += -I../../src - -# Compiler-specific googletest build directory (#125 and #738) -ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) - override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) - override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) - override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else - override CXXNAME = unknown -endif -###$(info CXXNAME=$(CXXNAME)) -override CXXNAMESUFFIX = _$(CXXNAME) - -# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) -export CXXNAMESUFFIX - -# Dependency on test directory -# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) -# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) -###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# 
example of an external gtest installation -###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation -TESTDIRCOMMON = ../../../../../test -TESTDIRLOCAL = ../../test -ifneq ($(wildcard $(GTEST_ROOT)),) - TESTDIR = -else ifneq ($(LOCALGTEST),) - TESTDIR=$(TESTDIRLOCAL) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) - TESTDIR = $(TESTDIRCOMMON) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else - TESTDIR = -endif -ifneq ($(GTEST_ROOT),) - GTESTLIBDIR = $(GTEST_ROOT)/lib64/ - GTESTLIBS = $(GTESTLIBDIR)/libgtest.a - GTESTINC = -I$(GTEST_ROOT)/include -else - GTESTLIBDIR = - GTESTLIBS = - GTESTINC = -endif -###$(info GTEST_ROOT = $(GTEST_ROOT)) -###$(info LOCALGTEST = $(LOCALGTEST)) -###$(info TESTDIR = $(TESTDIR)) - -#------------------------------------------------------------------------------- - #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available: eventually use native instead?) @@ -340,6 +281,69 @@ endif #------------------------------------------------------------------------------- +#=== Configure common compiler flags for C++ and CUDA/HIP + +INCFLAGS = -I. 
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here + +# Dependency on src directory +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif +LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +INCFLAGS += -I../../src + +# Compiler-specific googletest build directory (#125 and #738) +ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) + override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) + override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else + override CXXNAME = unknown +endif +###$(info CXXNAME=$(CXXNAME)) +override CXXNAMESUFFIX = _$(CXXNAME) + +# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) +export CXXNAMESUFFIX + +# Dependency on test directory +# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) +# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) +###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation +###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation +TESTDIRCOMMON = ../../../../../test +TESTDIRLOCAL = ../../test +ifneq ($(wildcard $(GTEST_ROOT)),) + TESTDIR = +else ifneq ($(LOCALGTEST),) + TESTDIR=$(TESTDIRLOCAL) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) + TESTDIR = $(TESTDIRCOMMON) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else + TESTDIR = +endif +ifneq 
($(GTEST_ROOT),) + GTESTLIBDIR = $(GTEST_ROOT)/lib64/ + GTESTLIBS = $(GTESTLIBDIR)/libgtest.a + GTESTINC = -I$(GTEST_ROOT)/include +else + GTESTLIBDIR = + GTESTLIBS = + GTESTINC = +endif +###$(info GTEST_ROOT = $(GTEST_ROOT)) +###$(info LOCALGTEST = $(LOCALGTEST)) +###$(info TESTDIR = $(TESTDIR)) + +#------------------------------------------------------------------------------- + #=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) @@ -606,27 +610,25 @@ override RUNTIME = #=== Makefile TARGETS and build rules below #=============================================================================== -cxx_main=$(BUILDDIR)/check.exe -fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(GPUCC),) - gpu_main=$(BUILDDIR)/gcheck.exe - fgpu_main=$(BUILDDIR)/fgcheck.exe +ifeq ($(GPUCC),) + cxx_checkmain=$(BUILDDIR)/check_cpp.exe + cxx_fcheckmain=$(BUILDDIR)/fcheck_cpp.exe + cxx_testmain=$(BUILDDIR)/runTest_cpp.exe else - gpu_main= - fgpu_main= + gpu_checkmain=$(BUILDDIR)/check_$(GPUSUFFIX).exe + gpu_fcheckmain=$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe + gpu_testmain=$(BUILDDIR)/runTest_$(GPUSUFFIX).exe endif -testmain=$(BUILDDIR)/runTest.exe - # Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) # First target (default goal) -ifneq ($(GPUCC),) -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_main) $(fgpu_main) $(if $(GTESTLIBS),$(testmain)) +ifeq ($(GPUCC),) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_checkmain) $(cxx_fcheckmain) $(if $(GTESTLIBS),$(cxx_testmain)) else -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_main) $(fcxx_main) $(if $(GTESTLIBS),$(testmain)) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_checkmain) $(gpu_fcheckmain) $(if $(GTESTLIBS),$(gpu_testmain)) endif # Target (and 
build options): debug @@ -643,30 +645,30 @@ $(BUILDDIR)/.build.$(TAG): @if [ "$(oldtagsb)" != "" ]; then echo "Cannot build for tag=$(TAG) as old builds exist for other tags:"; echo " $(oldtagsb)"; echo "Please run 'make clean' first\nIf 'make clean' is not enough: run 'make clean USEBUILDDIR=1 AVX=$(AVX) FPTYPE=$(FPTYPE)' or 'make cleanall'"; exit 1; fi @touch $(BUILDDIR)/.build.$(TAG) -# Apply special build flags only to CrossSectionKernel[_$(GPUSUFFIX)].o (no fast math, see #117 and #516) +# Apply special build flags only to CrossSectionKernel_.o (no fast math, see #117 and #516) # Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math endif -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o (NVTX in timermap.h, #679) -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) +# Apply special build flags only to check_sa_.o (NVTX in timermap.h, #679) +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o and (Cu|Hip)randRandomNumberKernel[_$(GPUSUFFIX)].o -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(RNDCXXFLAGS) +# Apply special build flags only to check_sa_.o and (Cu|Hip)randRandomNumberKernel_.o +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) 
+$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) ifeq ($(HASCURAND),hasCurand) # curand headers, #679 -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUDA_INC) +$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(CUDA_INC) endif ifeq ($(HASHIPRAND),hasHiprand) # hiprand headers -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(HIP_INC) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(HIP_INC) endif # Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) @@ -679,21 +681,21 @@ endif # Avoid clang warning "overriding '-ffp-contract=fast' option with '-ffp-contract=on'" (#516) # This patch does remove the warning, but I prefer to keep it disabled for the moment... ###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) -###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -Wno-overriding-t-option ###ifneq ($(GPUCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option ###endif ###endif #### Apply special build flags only to CPPProcess.o (-flto) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += -flto +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += -flto #### Apply special build flags only to CPPProcess.o (AVXFLAGS) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += $(AVXFLAGS) +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += $(AVXFLAGS) # Generic target and build rules: objects from C++ compilation # (NB do not include CUDA_INC here! 
add it only for NVTX or curand #679) -$(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -718,8 +720,8 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o -cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) @@ -728,8 +730,8 @@ gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/ endif # Target (and build rules): C++ and CUDA/HIP shared libraries -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge.o -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge_cpp.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) @@ -756,63 +758,63 @@ endif #------------------------------------------------------------------------------- # Target (and build rules): C++ and CUDA/HIP standalone executables -$(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cxx_main): $(BUILDDIR)/check_sa.o 
$(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o - $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o $(RNDLIBFLAGS) +$(cxx_checkmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_checkmain): $(BUILDDIR)/check_sa_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o + $(CXX) -o $@ $(BUILDDIR)/check_sa_cpp.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o $(RNDLIBFLAGS) ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(gpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(gpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_checkmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_checkmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(gpu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(gpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(gpu_main): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o 
+$(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif #------------------------------------------------------------------------------- # Generic target and build rules: objects from Fortran compilation -$(BUILDDIR)/%.o : %.f *.inc +$(BUILDDIR)/%_fortran.o : %.f *.inc @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(FC) -I. -c $< -o $@ # Generic target and build rules: objects from Fortran compilation -###$(BUILDDIR)/%.o : %.f *.inc +###$(BUILDDIR)/%_fortran.o : %.f *.inc ### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi ### @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi ### $(FC) -I. 
-I$(INCDIR) -c $< -o $@ # Target (and build rules): Fortran standalone executables -###$(BUILDDIR)/fcheck_sa.o : $(INCDIR)/fbridge.inc +###$(BUILDDIR)/fcheck_sa_fortran.o : $(INCDIR)/fbridge.inc ifeq ($(UNAME_S),Darwin) -$(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(cxx_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +$(cxx_fcheckmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ else - $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) + $(CXX) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) endif ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fgpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(fgpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_fcheckmain): LIBFLAGS += -lintlc # 
compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_fcheckmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) -$(fgpu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fgpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fgpu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) +$(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else - $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif endif @@ -820,83 +822,94 @@ endif # Target (and build rules): test objects and test executable ifeq ($(GPUCC),) -$(BUILDDIR)/testxxx.o: 
$(GTESTLIBS) -$(BUILDDIR)/testxxx.o: INCFLAGS += $(GTESTINC) -$(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions +$(BUILDDIR)/testxxx_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testxxx_cpp.o: INCFLAGS += $(GTESTINC) +$(BUILDDIR)/testxxx_cpp.o: testxxx_cc_ref.txt +$(cxx_testmain): $(BUILDDIR)/testxxx_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testxxx_cpp.o # Comment out this line to skip the C++ test of xxx functions else $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions +$(gpu_testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions endif ifneq ($(UNAME_S),Darwin) # Disable testmisc on Darwin (workaround for issue #838) ifeq ($(GPUCC),) -$(BUILDDIR)/testmisc.o: $(GTESTLIBS) -$(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests +$(BUILDDIR)/testmisc_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testmisc_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/testmisc_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testmisc_cpp.o # Comment out this line to skip the C++ miscellaneous tests else $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the 
CUDA/HIP miscellaneous tests +$(gpu_testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP miscellaneous tests endif endif ifeq ($(GPUCC),) -$(BUILDDIR)/runTest.o: $(GTESTLIBS) -$(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/runTest.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o +$(BUILDDIR)/runTest_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/runTest_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/runTest_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/runTest_cpp.o else $(BUILDDIR)/runTest_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/runTest_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o endif -$(testmain): $(GTESTLIBS) -$(testmain): INCFLAGS += $(GTESTINC) -$(testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest -###$(testmain): LIBFLAGS += -lgtest_main # no longer necessary 
since we added main() to testxxx.cc +ifeq ($(GPUCC),) +$(cxx_testmain): $(GTESTLIBS) +$(cxx_testmain): INCFLAGS += $(GTESTINC) +$(cxx_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +else +$(gpu_testmain): $(GTESTLIBS) +$(gpu_testmain): INCFLAGS += $(GTESTINC) +$(gpu_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +endif +ifeq ($(GPUCC),) # if at all, OMP is used only in CXX builds (not in GPU builds) ifneq ($(OMPFLAGS),) ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -$(testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) +$(cxx_testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -$(testmain): LIBFLAGS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 +$(cxx_testmain): LIBFLAGS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 ###else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -###$(testmain): LIBFLAGS += ???? # OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) +###$(cxx_testmain): LIBFLAGS += ???? 
# OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) else -$(testmain): LIBFLAGS += -lgomp +$(cxx_testmain): LIBFLAGS += -lgomp +endif endif endif # Test quadmath in testmisc.cc tests for constexpr_math #627 -###$(testmain): LIBFLAGS += -lquadmath +###ifeq ($(GPUCC),) +###$(cxx_testmain): LIBFLAGS += -lquadmath +###else +###$(gpu_testmain): LIBFLAGS += -lquadmath +###endif # Bypass std::filesystem completely to ease portability on LUMI #803 -#ifneq ($(findstring hipcc,$(GPUCC)),) -#$(testmain): LIBFLAGS += -lstdc++fs -#endif +###ifneq ($(findstring hipcc,$(GPUCC)),) +###$(gpu_testmain): LIBFLAGS += -lstdc++fs +###endif -ifeq ($(GPUCC),) # link only runTest.o -$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) +ifeq ($(GPUCC),) # link only runTest_cpp.o +$(cxx_testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest.o and runTest_$(GPUSUFFIX).o) -$(testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) +else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) +$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell 
dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else @@ -1051,51 +1064,50 @@ endif #------------------------------------------------------------------------------- -# Target: check (run the C++ test executable) -# [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -# [FIXME: SHOULD CHANGE THE TARGET NAME "check" THAT HAS NOTHING TO DO WITH "check.exe"] -ifneq ($(GPUCC),) -check: runTest cmpFGcheck -else -check: runTest cmpFcheck -endif +# Target: 'make test' (execute runTest.exe, and compare check.exe with fcheck.exe) +# [NB: THIS IS WHAT IS TESTED IN THE GITHUB CI!] +# [NB: This used to be called 'make check' but the name has been changed as this has nothing to do with 'check.exe'] +test: runTest cmpFcheck -# Target: runTest (run the C++ test executable runTest.exe) +# Target: runTest (run the C++ or CUDA/HIP test executable runTest.exe) runTest: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/runTest.exe +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/runTest_cpp.exe +else + $(RUNTIME) $(BUILDDIR)/runTest_$(GPUSUFFIX).exe +endif -# Target: runCheck (run the C++ standalone executable check.exe, with a small number of events) +# Target: runCheck (run the C++ or CUDA/HIP standalone executable check.exe, with a small number of events) runCheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 - -# Target: runGcheck (run the CUDA/HIP standalone executable gcheck.exe, with a small number of events) -runGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/gcheck.exe -p 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/check_cpp.exe -p 2 32 2 +else + $(RUNTIME) $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +endif -# Target: runFcheck (run the Fortran standalone executable - with C++ MEs - fcheck.exe, with a small number of events) +# Target: runFcheck (run the Fortran standalone executable - with C++ or CUDA/HIP MEs - fcheck.exe, with a small number of events) runFcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 - -# Target: runFGcheck (run the Fortran standalone 
executable - with CUDA/HIP MEs - fgcheck.exe, with a small number of events) -runFGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 +else + $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 +endif -# Target: cmpFcheck (compare ME results from the C++ and Fortran with C++ MEs standalone executables, with a small number of events) +# Target: cmpFcheck (compare ME results from the C++/CUDA/HIP and Fortran with C++/CUDA/HIP MEs standalone executables, with a small number of events) cmpFcheck: all.$(TAG) @echo - @echo "$(BUILDDIR)/check.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi - -# Target: cmpFGcheck (compare ME results from the CUDA/HIP and Fortran with CUDA/HIP MEs standalone executables, with a small number of events) -cmpFGcheck: all.$(TAG) - @echo - @echo "$(BUILDDIR)/gcheck.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fgcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! 
Fortran calculation (F77/GPU) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +ifeq ($(GPUCC),) + @echo "$(BUILDDIR)/check_cpp.exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_cpp.exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_cpp.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +else + @echo "$(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +endif # Target: cuda-memcheck (run the CUDA standalone executable gcheck.exe with a small number of events through cuda-memcheck) cuda-memcheck: all.$(TAG) - $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/gcheck.exe -p 2 32 2 + $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/makefile b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/makefile index 22517d0cc6..6a21300f7a 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/makefile +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/makefile @@ -58,13 +58,15 @@ endif LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias CUDACPP_MAKEFILE=cudacpp.mk -CUDACPP_COMMONLIB=mg5amc_common processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ifeq ($(BACKEND),cuda) +CUDACPP_COMMONLIB=mg5amc_common_cuda CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cuda else ifeq ($(BACKEND),hip) +CUDACPP_COMMONLIB=mg5amc_common_hip CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_hip else 
+CUDACPP_COMMONLIB=mg5amc_common_cpp CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cpp endif diff --git a/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_src.mk index 655d87b749..a66801a71e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_src.mk @@ -98,7 +98,11 @@ endif # NB1: there are no CUDA targets in src as we avoid RDC! # NB2: CUDA includes for curand.h are no longer needed in the C++ code anywhere in src! -MG5AMC_COMMONLIB = mg5amc_common +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif # Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) @@ -126,7 +130,7 @@ $(LIBDIR)/.build.$(TAG): #------------------------------------------------------------------------------- # Generic target and build rules: objects from C++ compilation -$(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -139,22 +143,22 @@ endif #------------------------------------------------------------------------------- -cxx_objects=$(addprefix $(BUILDDIR)/, read_slha.o) -ifneq ($(GPUCC),) - gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_$(GPUSUFFIX).o) +cxx_objects=$(addprefix $(BUILDDIR)/, read_slha_cpp.o) +ifeq ($(GPUCC),) + cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_sm_cpp.o) else - cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_sm.o) + gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_$(GPUSUFFIX).o) endif # Target (and build rules): common (src) library -ifneq ($(GPUCC),) -$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) - @if [ ! 
-d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) -else +ifeq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi $(CXX) -shared -o $@ $(cxx_objects) $(LDFLAGS) +else +$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) + @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi + $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index d2f40f6c13..d226034616 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005821704864501953  +DEBUG: model prefixing takes 0.005830287933349609  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -178,7 +178,7 @@ INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -193,15 +193,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -Generated helas calls for 1 subprocesses (16 diagrams) in 0.040 s -Wrote files for 36 helas calls in 0.158 s +Generated helas calls for 1 subprocesses (16 diagrams) in 0.039 s +Wrote files for 36 helas calls in 0.156 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.342 s +ALOHA: aloha creates 5 routines in 0.344 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -209,7 +209,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.323 s +ALOHA: aloha creates 10 routines in 0.328 s VVV1 VVV1 FFV1 @@ -254,9 +254,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m2.436s -user 0m2.028s -sys 0m0.221s +real 0m2.256s +user 0m2.049s +sys 0m0.205s Code generation completed in 3 seconds ************************************************************ * * diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk index 50ce6bb9c8..89da34009a 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk @@ -54,65 +54,6 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA/HIP - -INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here - -# Dependency on src directory -MG5AMC_COMMONLIB = mg5amc_common -LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -INCFLAGS += -I../../src - -# Compiler-specific googletest build directory (#125 and #738) -ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) - override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) - override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) - override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else - override CXXNAME = unknown -endif -###$(info CXXNAME=$(CXXNAME)) -override CXXNAMESUFFIX = _$(CXXNAME) - -# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) -export CXXNAMESUFFIX - -# Dependency on test directory -# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) -# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) -###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an 
external gtest installation -###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation -TESTDIRCOMMON = ../../../../../test -TESTDIRLOCAL = ../../test -ifneq ($(wildcard $(GTEST_ROOT)),) - TESTDIR = -else ifneq ($(LOCALGTEST),) - TESTDIR=$(TESTDIRLOCAL) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) - TESTDIR = $(TESTDIRCOMMON) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else - TESTDIR = -endif -ifneq ($(GTEST_ROOT),) - GTESTLIBDIR = $(GTEST_ROOT)/lib64/ - GTESTLIBS = $(GTESTLIBDIR)/libgtest.a - GTESTINC = -I$(GTEST_ROOT)/include -else - GTESTLIBDIR = - GTESTLIBS = - GTESTINC = -endif -###$(info GTEST_ROOT = $(GTEST_ROOT)) -###$(info LOCALGTEST = $(LOCALGTEST)) -###$(info TESTDIR = $(TESTDIR)) - -#------------------------------------------------------------------------------- - #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available: eventually use native instead?) @@ -340,6 +281,69 @@ endif #------------------------------------------------------------------------------- +#=== Configure common compiler flags for C++ and CUDA/HIP + +INCFLAGS = -I. 
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here + +# Dependency on src directory +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif +LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +INCFLAGS += -I../../src + +# Compiler-specific googletest build directory (#125 and #738) +ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) + override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) + override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else + override CXXNAME = unknown +endif +###$(info CXXNAME=$(CXXNAME)) +override CXXNAMESUFFIX = _$(CXXNAME) + +# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) +export CXXNAMESUFFIX + +# Dependency on test directory +# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) +# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) +###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation +###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation +TESTDIRCOMMON = ../../../../../test +TESTDIRLOCAL = ../../test +ifneq ($(wildcard $(GTEST_ROOT)),) + TESTDIR = +else ifneq ($(LOCALGTEST),) + TESTDIR=$(TESTDIRLOCAL) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) + TESTDIR = $(TESTDIRCOMMON) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else + TESTDIR = +endif +ifneq 
($(GTEST_ROOT),) + GTESTLIBDIR = $(GTEST_ROOT)/lib64/ + GTESTLIBS = $(GTESTLIBDIR)/libgtest.a + GTESTINC = -I$(GTEST_ROOT)/include +else + GTESTLIBDIR = + GTESTLIBS = + GTESTINC = +endif +###$(info GTEST_ROOT = $(GTEST_ROOT)) +###$(info LOCALGTEST = $(LOCALGTEST)) +###$(info TESTDIR = $(TESTDIR)) + +#------------------------------------------------------------------------------- + #=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) @@ -606,27 +610,25 @@ override RUNTIME = #=== Makefile TARGETS and build rules below #=============================================================================== -cxx_main=$(BUILDDIR)/check.exe -fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(GPUCC),) - gpu_main=$(BUILDDIR)/gcheck.exe - fgpu_main=$(BUILDDIR)/fgcheck.exe +ifeq ($(GPUCC),) + cxx_checkmain=$(BUILDDIR)/check_cpp.exe + cxx_fcheckmain=$(BUILDDIR)/fcheck_cpp.exe + cxx_testmain=$(BUILDDIR)/runTest_cpp.exe else - gpu_main= - fgpu_main= + gpu_checkmain=$(BUILDDIR)/check_$(GPUSUFFIX).exe + gpu_fcheckmain=$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe + gpu_testmain=$(BUILDDIR)/runTest_$(GPUSUFFIX).exe endif -testmain=$(BUILDDIR)/runTest.exe - # Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) # First target (default goal) -ifneq ($(GPUCC),) -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_main) $(fgpu_main) $(if $(GTESTLIBS),$(testmain)) +ifeq ($(GPUCC),) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_checkmain) $(cxx_fcheckmain) $(if $(GTESTLIBS),$(cxx_testmain)) else -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_main) $(fcxx_main) $(if $(GTESTLIBS),$(testmain)) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_checkmain) $(gpu_fcheckmain) $(if $(GTESTLIBS),$(gpu_testmain)) endif # Target (and 
build options): debug @@ -643,30 +645,30 @@ $(BUILDDIR)/.build.$(TAG): @if [ "$(oldtagsb)" != "" ]; then echo "Cannot build for tag=$(TAG) as old builds exist for other tags:"; echo " $(oldtagsb)"; echo "Please run 'make clean' first\nIf 'make clean' is not enough: run 'make clean USEBUILDDIR=1 AVX=$(AVX) FPTYPE=$(FPTYPE)' or 'make cleanall'"; exit 1; fi @touch $(BUILDDIR)/.build.$(TAG) -# Apply special build flags only to CrossSectionKernel[_$(GPUSUFFIX)].o (no fast math, see #117 and #516) +# Apply special build flags only to CrossSectionKernel_.o (no fast math, see #117 and #516) # Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math endif -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o (NVTX in timermap.h, #679) -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) +# Apply special build flags only to check_sa_.o (NVTX in timermap.h, #679) +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o and (Cu|Hip)randRandomNumberKernel[_$(GPUSUFFIX)].o -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(RNDCXXFLAGS) +# Apply special build flags only to check_sa_.o and (Cu|Hip)randRandomNumberKernel_.o +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) 
+$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) ifeq ($(HASCURAND),hasCurand) # curand headers, #679 -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUDA_INC) +$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(CUDA_INC) endif ifeq ($(HASHIPRAND),hasHiprand) # hiprand headers -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(HIP_INC) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(HIP_INC) endif # Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) @@ -679,21 +681,21 @@ endif # Avoid clang warning "overriding '-ffp-contract=fast' option with '-ffp-contract=on'" (#516) # This patch does remove the warning, but I prefer to keep it disabled for the moment... ###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) -###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -Wno-overriding-t-option ###ifneq ($(GPUCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option ###endif ###endif #### Apply special build flags only to CPPProcess.o (-flto) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += -flto +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += -flto #### Apply special build flags only to CPPProcess.o (AVXFLAGS) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += $(AVXFLAGS) +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += $(AVXFLAGS) # Generic target and build rules: objects from C++ compilation # (NB do not include CUDA_INC here! 
add it only for NVTX or curand #679) -$(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -718,8 +720,8 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o -cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) @@ -728,8 +730,8 @@ gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/ endif # Target (and build rules): C++ and CUDA/HIP shared libraries -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge.o -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge_cpp.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) @@ -756,63 +758,63 @@ endif #------------------------------------------------------------------------------- # Target (and build rules): C++ and CUDA/HIP standalone executables -$(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cxx_main): $(BUILDDIR)/check_sa.o 
$(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o - $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o $(RNDLIBFLAGS) +$(cxx_checkmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_checkmain): $(BUILDDIR)/check_sa_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o + $(CXX) -o $@ $(BUILDDIR)/check_sa_cpp.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o $(RNDLIBFLAGS) ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(gpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(gpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_checkmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_checkmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(gpu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(gpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(gpu_main): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o 
+$(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif #------------------------------------------------------------------------------- # Generic target and build rules: objects from Fortran compilation -$(BUILDDIR)/%.o : %.f *.inc +$(BUILDDIR)/%_fortran.o : %.f *.inc @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(FC) -I. -c $< -o $@ # Generic target and build rules: objects from Fortran compilation -###$(BUILDDIR)/%.o : %.f *.inc +###$(BUILDDIR)/%_fortran.o : %.f *.inc ### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi ### @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi ### $(FC) -I. 
-I$(INCDIR) -c $< -o $@ # Target (and build rules): Fortran standalone executables -###$(BUILDDIR)/fcheck_sa.o : $(INCDIR)/fbridge.inc +###$(BUILDDIR)/fcheck_sa_fortran.o : $(INCDIR)/fbridge.inc ifeq ($(UNAME_S),Darwin) -$(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(cxx_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +$(cxx_fcheckmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ else - $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) + $(CXX) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) endif ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fgpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(fgpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_fcheckmain): LIBFLAGS += -lintlc # 
compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_fcheckmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) -$(fgpu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fgpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fgpu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) +$(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else - $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif endif @@ -820,83 +822,94 @@ endif # Target (and build rules): test objects and test executable ifeq ($(GPUCC),) -$(BUILDDIR)/testxxx.o: 
$(GTESTLIBS) -$(BUILDDIR)/testxxx.o: INCFLAGS += $(GTESTINC) -$(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions +$(BUILDDIR)/testxxx_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testxxx_cpp.o: INCFLAGS += $(GTESTINC) +$(BUILDDIR)/testxxx_cpp.o: testxxx_cc_ref.txt +$(cxx_testmain): $(BUILDDIR)/testxxx_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testxxx_cpp.o # Comment out this line to skip the C++ test of xxx functions else $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions +$(gpu_testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions endif ifneq ($(UNAME_S),Darwin) # Disable testmisc on Darwin (workaround for issue #838) ifeq ($(GPUCC),) -$(BUILDDIR)/testmisc.o: $(GTESTLIBS) -$(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests +$(BUILDDIR)/testmisc_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testmisc_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/testmisc_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testmisc_cpp.o # Comment out this line to skip the C++ miscellaneous tests else $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the 
CUDA/HIP miscellaneous tests +$(gpu_testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP miscellaneous tests endif endif ifeq ($(GPUCC),) -$(BUILDDIR)/runTest.o: $(GTESTLIBS) -$(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/runTest.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o +$(BUILDDIR)/runTest_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/runTest_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/runTest_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/runTest_cpp.o else $(BUILDDIR)/runTest_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/runTest_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o endif -$(testmain): $(GTESTLIBS) -$(testmain): INCFLAGS += $(GTESTINC) -$(testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest -###$(testmain): LIBFLAGS += -lgtest_main # no longer necessary 
since we added main() to testxxx.cc +ifeq ($(GPUCC),) +$(cxx_testmain): $(GTESTLIBS) +$(cxx_testmain): INCFLAGS += $(GTESTINC) +$(cxx_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +else +$(gpu_testmain): $(GTESTLIBS) +$(gpu_testmain): INCFLAGS += $(GTESTINC) +$(gpu_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +endif +ifeq ($(GPUCC),) # if at all, OMP is used only in CXX builds (not in GPU builds) ifneq ($(OMPFLAGS),) ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -$(testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) +$(cxx_testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -$(testmain): LIBFLAGS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 +$(cxx_testmain): LIBFLAGS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 ###else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -###$(testmain): LIBFLAGS += ???? # OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) +###$(cxx_testmain): LIBFLAGS += ???? 
# OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) else -$(testmain): LIBFLAGS += -lgomp +$(cxx_testmain): LIBFLAGS += -lgomp +endif endif endif # Test quadmath in testmisc.cc tests for constexpr_math #627 -###$(testmain): LIBFLAGS += -lquadmath +###ifeq ($(GPUCC),) +###$(cxx_testmain): LIBFLAGS += -lquadmath +###else +###$(gpu_testmain): LIBFLAGS += -lquadmath +###endif # Bypass std::filesystem completely to ease portability on LUMI #803 -#ifneq ($(findstring hipcc,$(GPUCC)),) -#$(testmain): LIBFLAGS += -lstdc++fs -#endif +###ifneq ($(findstring hipcc,$(GPUCC)),) +###$(gpu_testmain): LIBFLAGS += -lstdc++fs +###endif -ifeq ($(GPUCC),) # link only runTest.o -$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) +ifeq ($(GPUCC),) # link only runTest_cpp.o +$(cxx_testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest.o and runTest_$(GPUSUFFIX).o) -$(testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) +else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) +$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell 
dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else @@ -1051,51 +1064,50 @@ endif #------------------------------------------------------------------------------- -# Target: check (run the C++ test executable) -# [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -# [FIXME: SHOULD CHANGE THE TARGET NAME "check" THAT HAS NOTHING TO DO WITH "check.exe"] -ifneq ($(GPUCC),) -check: runTest cmpFGcheck -else -check: runTest cmpFcheck -endif +# Target: 'make test' (execute runTest.exe, and compare check.exe with fcheck.exe) +# [NB: THIS IS WHAT IS TESTED IN THE GITHUB CI!] +# [NB: This used to be called 'make check' but the name has been changed as this has nothing to do with 'check.exe'] +test: runTest cmpFcheck -# Target: runTest (run the C++ test executable runTest.exe) +# Target: runTest (run the C++ or CUDA/HIP test executable runTest.exe) runTest: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/runTest.exe +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/runTest_cpp.exe +else + $(RUNTIME) $(BUILDDIR)/runTest_$(GPUSUFFIX).exe +endif -# Target: runCheck (run the C++ standalone executable check.exe, with a small number of events) +# Target: runCheck (run the C++ or CUDA/HIP standalone executable check.exe, with a small number of events) runCheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 - -# Target: runGcheck (run the CUDA/HIP standalone executable gcheck.exe, with a small number of events) -runGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/gcheck.exe -p 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/check_cpp.exe -p 2 32 2 +else + $(RUNTIME) $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +endif -# Target: runFcheck (run the Fortran standalone executable - with C++ MEs - fcheck.exe, with a small number of events) +# Target: runFcheck (run the Fortran standalone executable - with C++ or CUDA/HIP MEs - fcheck.exe, with a small number of events) runFcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 - -# Target: runFGcheck (run the Fortran standalone 
executable - with CUDA/HIP MEs - fgcheck.exe, with a small number of events) -runFGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 +else + $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 +endif -# Target: cmpFcheck (compare ME results from the C++ and Fortran with C++ MEs standalone executables, with a small number of events) +# Target: cmpFcheck (compare ME results from the C++/CUDA/HIP and Fortran with C++/CUDA/HIP MEs standalone executables, with a small number of events) cmpFcheck: all.$(TAG) @echo - @echo "$(BUILDDIR)/check.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi - -# Target: cmpFGcheck (compare ME results from the CUDA/HIP and Fortran with CUDA/HIP MEs standalone executables, with a small number of events) -cmpFGcheck: all.$(TAG) - @echo - @echo "$(BUILDDIR)/gcheck.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fgcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! 
Fortran calculation (F77/GPU) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +ifeq ($(GPUCC),) + @echo "$(BUILDDIR)/check_cpp.exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_cpp.exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_cpp.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +else + @echo "$(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +endif # Target: cuda-memcheck (run the CUDA standalone executable gcheck.exe with a small number of events through cuda-memcheck) cuda-memcheck: all.$(TAG) - $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/gcheck.exe -p 2 32 2 + $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile b/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile index 22517d0cc6..6a21300f7a 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile @@ -58,13 +58,15 @@ endif LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias CUDACPP_MAKEFILE=cudacpp.mk -CUDACPP_COMMONLIB=mg5amc_common processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ifeq ($(BACKEND),cuda) +CUDACPP_COMMONLIB=mg5amc_common_cuda CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cuda else ifeq ($(BACKEND),hip) +CUDACPP_COMMONLIB=mg5amc_common_hip CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_hip else 
+CUDACPP_COMMONLIB=mg5amc_common_cpp CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cpp endif diff --git a/epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk index 655d87b749..a66801a71e 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk @@ -98,7 +98,11 @@ endif # NB1: there are no CUDA targets in src as we avoid RDC! # NB2: CUDA includes for curand.h are no longer needed in the C++ code anywhere in src! -MG5AMC_COMMONLIB = mg5amc_common +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif # Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) @@ -126,7 +130,7 @@ $(LIBDIR)/.build.$(TAG): #------------------------------------------------------------------------------- # Generic target and build rules: objects from C++ compilation -$(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -139,22 +143,22 @@ endif #------------------------------------------------------------------------------- -cxx_objects=$(addprefix $(BUILDDIR)/, read_slha.o) -ifneq ($(GPUCC),) - gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_$(GPUSUFFIX).o) +cxx_objects=$(addprefix $(BUILDDIR)/, read_slha_cpp.o) +ifeq ($(GPUCC),) + cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_sm_cpp.o) else - cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_sm.o) + gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_$(GPUSUFFIX).o) endif # Target (and build rules): common (src) library -ifneq ($(GPUCC),) -$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) - @if [ ! 
-d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) -else +ifeq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi $(CXX) -shared -o $@ $(cxx_objects) $(LDFLAGS) +else +$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) + @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi + $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index de84b379b4..e94069458a 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005735158920288086  +DEBUG: model prefixing takes 0.005761384963989258  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -186,7 +186,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.339 s +ALOHA: aloha creates 5 routines in 0.336 s VVV1 VVV1 FFV1 @@ -206,7 +206,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. 
quit -real 0m1.163s -user 0m0.744s +real 0m0.797s +user 0m0.737s sys 0m0.056s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk index 50ce6bb9c8..89da34009a 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk @@ -54,65 +54,6 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA/HIP - -INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here - -# Dependency on src directory -MG5AMC_COMMONLIB = mg5amc_common -LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -INCFLAGS += -I../../src - -# Compiler-specific googletest build directory (#125 and #738) -ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) - override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) - override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) - override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else - override CXXNAME = unknown -endif -###$(info CXXNAME=$(CXXNAME)) -override CXXNAMESUFFIX = _$(CXXNAME) - -# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) -export CXXNAMESUFFIX - -# Dependency on test directory -# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) -# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) -###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation -###LOCALGTEST = yes# comment this out (or use make 
LOCALGTEST=yes) to build tests using a local gtest installation -TESTDIRCOMMON = ../../../../../test -TESTDIRLOCAL = ../../test -ifneq ($(wildcard $(GTEST_ROOT)),) - TESTDIR = -else ifneq ($(LOCALGTEST),) - TESTDIR=$(TESTDIRLOCAL) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) - TESTDIR = $(TESTDIRCOMMON) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else - TESTDIR = -endif -ifneq ($(GTEST_ROOT),) - GTESTLIBDIR = $(GTEST_ROOT)/lib64/ - GTESTLIBS = $(GTESTLIBDIR)/libgtest.a - GTESTINC = -I$(GTEST_ROOT)/include -else - GTESTLIBDIR = - GTESTLIBS = - GTESTINC = -endif -###$(info GTEST_ROOT = $(GTEST_ROOT)) -###$(info LOCALGTEST = $(LOCALGTEST)) -###$(info TESTDIR = $(TESTDIR)) - -#------------------------------------------------------------------------------- - #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available: eventually use native instead?) @@ -340,6 +281,69 @@ endif #------------------------------------------------------------------------------- +#=== Configure common compiler flags for C++ and CUDA/HIP + +INCFLAGS = -I. 
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here + +# Dependency on src directory +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif +LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +INCFLAGS += -I../../src + +# Compiler-specific googletest build directory (#125 and #738) +ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) + override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) + override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else + override CXXNAME = unknown +endif +###$(info CXXNAME=$(CXXNAME)) +override CXXNAMESUFFIX = _$(CXXNAME) + +# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) +export CXXNAMESUFFIX + +# Dependency on test directory +# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) +# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) +###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation +###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation +TESTDIRCOMMON = ../../../../../test +TESTDIRLOCAL = ../../test +ifneq ($(wildcard $(GTEST_ROOT)),) + TESTDIR = +else ifneq ($(LOCALGTEST),) + TESTDIR=$(TESTDIRLOCAL) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) + TESTDIR = $(TESTDIRCOMMON) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else + TESTDIR = +endif +ifneq 
($(GTEST_ROOT),) + GTESTLIBDIR = $(GTEST_ROOT)/lib64/ + GTESTLIBS = $(GTESTLIBDIR)/libgtest.a + GTESTINC = -I$(GTEST_ROOT)/include +else + GTESTLIBDIR = + GTESTLIBS = + GTESTINC = +endif +###$(info GTEST_ROOT = $(GTEST_ROOT)) +###$(info LOCALGTEST = $(LOCALGTEST)) +###$(info TESTDIR = $(TESTDIR)) + +#------------------------------------------------------------------------------- + #=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) @@ -606,27 +610,25 @@ override RUNTIME = #=== Makefile TARGETS and build rules below #=============================================================================== -cxx_main=$(BUILDDIR)/check.exe -fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(GPUCC),) - gpu_main=$(BUILDDIR)/gcheck.exe - fgpu_main=$(BUILDDIR)/fgcheck.exe +ifeq ($(GPUCC),) + cxx_checkmain=$(BUILDDIR)/check_cpp.exe + cxx_fcheckmain=$(BUILDDIR)/fcheck_cpp.exe + cxx_testmain=$(BUILDDIR)/runTest_cpp.exe else - gpu_main= - fgpu_main= + gpu_checkmain=$(BUILDDIR)/check_$(GPUSUFFIX).exe + gpu_fcheckmain=$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe + gpu_testmain=$(BUILDDIR)/runTest_$(GPUSUFFIX).exe endif -testmain=$(BUILDDIR)/runTest.exe - # Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) # First target (default goal) -ifneq ($(GPUCC),) -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_main) $(fgpu_main) $(if $(GTESTLIBS),$(testmain)) +ifeq ($(GPUCC),) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_checkmain) $(cxx_fcheckmain) $(if $(GTESTLIBS),$(cxx_testmain)) else -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_main) $(fcxx_main) $(if $(GTESTLIBS),$(testmain)) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_checkmain) $(gpu_fcheckmain) $(if $(GTESTLIBS),$(gpu_testmain)) endif # Target (and 
build options): debug @@ -643,30 +645,30 @@ $(BUILDDIR)/.build.$(TAG): @if [ "$(oldtagsb)" != "" ]; then echo "Cannot build for tag=$(TAG) as old builds exist for other tags:"; echo " $(oldtagsb)"; echo "Please run 'make clean' first\nIf 'make clean' is not enough: run 'make clean USEBUILDDIR=1 AVX=$(AVX) FPTYPE=$(FPTYPE)' or 'make cleanall'"; exit 1; fi @touch $(BUILDDIR)/.build.$(TAG) -# Apply special build flags only to CrossSectionKernel[_$(GPUSUFFIX)].o (no fast math, see #117 and #516) +# Apply special build flags only to CrossSectionKernel_.o (no fast math, see #117 and #516) # Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math endif -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o (NVTX in timermap.h, #679) -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) +# Apply special build flags only to check_sa_.o (NVTX in timermap.h, #679) +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o and (Cu|Hip)randRandomNumberKernel[_$(GPUSUFFIX)].o -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(RNDCXXFLAGS) +# Apply special build flags only to check_sa_.o and (Cu|Hip)randRandomNumberKernel_.o +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) 
+$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) ifeq ($(HASCURAND),hasCurand) # curand headers, #679 -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUDA_INC) +$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(CUDA_INC) endif ifeq ($(HASHIPRAND),hasHiprand) # hiprand headers -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(HIP_INC) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(HIP_INC) endif # Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) @@ -679,21 +681,21 @@ endif # Avoid clang warning "overriding '-ffp-contract=fast' option with '-ffp-contract=on'" (#516) # This patch does remove the warning, but I prefer to keep it disabled for the moment... ###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) -###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -Wno-overriding-t-option ###ifneq ($(GPUCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option ###endif ###endif #### Apply special build flags only to CPPProcess.o (-flto) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += -flto +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += -flto #### Apply special build flags only to CPPProcess.o (AVXFLAGS) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += $(AVXFLAGS) +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += $(AVXFLAGS) # Generic target and build rules: objects from C++ compilation # (NB do not include CUDA_INC here! 
add it only for NVTX or curand #679) -$(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -718,8 +720,8 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o -cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) @@ -728,8 +730,8 @@ gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/ endif # Target (and build rules): C++ and CUDA/HIP shared libraries -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge.o -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge_cpp.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) @@ -756,63 +758,63 @@ endif #------------------------------------------------------------------------------- # Target (and build rules): C++ and CUDA/HIP standalone executables -$(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cxx_main): $(BUILDDIR)/check_sa.o 
$(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o - $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o $(RNDLIBFLAGS) +$(cxx_checkmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_checkmain): $(BUILDDIR)/check_sa_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o + $(CXX) -o $@ $(BUILDDIR)/check_sa_cpp.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o $(RNDLIBFLAGS) ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(gpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(gpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_checkmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_checkmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(gpu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(gpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(gpu_main): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o 
+$(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif #------------------------------------------------------------------------------- # Generic target and build rules: objects from Fortran compilation -$(BUILDDIR)/%.o : %.f *.inc +$(BUILDDIR)/%_fortran.o : %.f *.inc @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(FC) -I. -c $< -o $@ # Generic target and build rules: objects from Fortran compilation -###$(BUILDDIR)/%.o : %.f *.inc +###$(BUILDDIR)/%_fortran.o : %.f *.inc ### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi ### @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi ### $(FC) -I. 
-I$(INCDIR) -c $< -o $@ # Target (and build rules): Fortran standalone executables -###$(BUILDDIR)/fcheck_sa.o : $(INCDIR)/fbridge.inc +###$(BUILDDIR)/fcheck_sa_fortran.o : $(INCDIR)/fbridge.inc ifeq ($(UNAME_S),Darwin) -$(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(cxx_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +$(cxx_fcheckmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ else - $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) + $(CXX) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) endif ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fgpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(fgpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_fcheckmain): LIBFLAGS += -lintlc # 
compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_fcheckmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) -$(fgpu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fgpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fgpu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) +$(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else - $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif endif @@ -820,83 +822,94 @@ endif # Target (and build rules): test objects and test executable ifeq ($(GPUCC),) -$(BUILDDIR)/testxxx.o: 
$(GTESTLIBS) -$(BUILDDIR)/testxxx.o: INCFLAGS += $(GTESTINC) -$(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions +$(BUILDDIR)/testxxx_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testxxx_cpp.o: INCFLAGS += $(GTESTINC) +$(BUILDDIR)/testxxx_cpp.o: testxxx_cc_ref.txt +$(cxx_testmain): $(BUILDDIR)/testxxx_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testxxx_cpp.o # Comment out this line to skip the C++ test of xxx functions else $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions +$(gpu_testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions endif ifneq ($(UNAME_S),Darwin) # Disable testmisc on Darwin (workaround for issue #838) ifeq ($(GPUCC),) -$(BUILDDIR)/testmisc.o: $(GTESTLIBS) -$(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests +$(BUILDDIR)/testmisc_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testmisc_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/testmisc_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testmisc_cpp.o # Comment out this line to skip the C++ miscellaneous tests else $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the 
CUDA/HIP miscellaneous tests +$(gpu_testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP miscellaneous tests endif endif ifeq ($(GPUCC),) -$(BUILDDIR)/runTest.o: $(GTESTLIBS) -$(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/runTest.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o +$(BUILDDIR)/runTest_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/runTest_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/runTest_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/runTest_cpp.o else $(BUILDDIR)/runTest_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/runTest_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o endif -$(testmain): $(GTESTLIBS) -$(testmain): INCFLAGS += $(GTESTINC) -$(testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest -###$(testmain): LIBFLAGS += -lgtest_main # no longer necessary 
since we added main() to testxxx.cc +ifeq ($(GPUCC),) +$(cxx_testmain): $(GTESTLIBS) +$(cxx_testmain): INCFLAGS += $(GTESTINC) +$(cxx_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +else +$(gpu_testmain): $(GTESTLIBS) +$(gpu_testmain): INCFLAGS += $(GTESTINC) +$(gpu_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +endif +ifeq ($(GPUCC),) # if at all, OMP is used only in CXX builds (not in GPU builds) ifneq ($(OMPFLAGS),) ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -$(testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) +$(cxx_testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -$(testmain): LIBFLAGS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 +$(cxx_testmain): LIBFLAGS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 ###else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -###$(testmain): LIBFLAGS += ???? # OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) +###$(cxx_testmain): LIBFLAGS += ???? 
# OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) else -$(testmain): LIBFLAGS += -lgomp +$(cxx_testmain): LIBFLAGS += -lgomp +endif endif endif # Test quadmath in testmisc.cc tests for constexpr_math #627 -###$(testmain): LIBFLAGS += -lquadmath +###ifeq ($(GPUCC),) +###$(cxx_testmain): LIBFLAGS += -lquadmath +###else +###$(gpu_testmain): LIBFLAGS += -lquadmath +###endif # Bypass std::filesystem completely to ease portability on LUMI #803 -#ifneq ($(findstring hipcc,$(GPUCC)),) -#$(testmain): LIBFLAGS += -lstdc++fs -#endif +###ifneq ($(findstring hipcc,$(GPUCC)),) +###$(gpu_testmain): LIBFLAGS += -lstdc++fs +###endif -ifeq ($(GPUCC),) # link only runTest.o -$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) +ifeq ($(GPUCC),) # link only runTest_cpp.o +$(cxx_testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest.o and runTest_$(GPUSUFFIX).o) -$(testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) +else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) +$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell 
dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else @@ -1051,51 +1064,50 @@ endif #------------------------------------------------------------------------------- -# Target: check (run the C++ test executable) -# [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -# [FIXME: SHOULD CHANGE THE TARGET NAME "check" THAT HAS NOTHING TO DO WITH "check.exe"] -ifneq ($(GPUCC),) -check: runTest cmpFGcheck -else -check: runTest cmpFcheck -endif +# Target: 'make test' (execute runTest.exe, and compare check.exe with fcheck.exe) +# [NB: THIS IS WHAT IS TESTED IN THE GITHUB CI!] +# [NB: This used to be called 'make check' but the name has been changed as this has nothing to do with 'check.exe'] +test: runTest cmpFcheck -# Target: runTest (run the C++ test executable runTest.exe) +# Target: runTest (run the C++ or CUDA/HIP test executable runTest.exe) runTest: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/runTest.exe +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/runTest_cpp.exe +else + $(RUNTIME) $(BUILDDIR)/runTest_$(GPUSUFFIX).exe +endif -# Target: runCheck (run the C++ standalone executable check.exe, with a small number of events) +# Target: runCheck (run the C++ or CUDA/HIP standalone executable check.exe, with a small number of events) runCheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 - -# Target: runGcheck (run the CUDA/HIP standalone executable gcheck.exe, with a small number of events) -runGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/gcheck.exe -p 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/check_cpp.exe -p 2 32 2 +else + $(RUNTIME) $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +endif -# Target: runFcheck (run the Fortran standalone executable - with C++ MEs - fcheck.exe, with a small number of events) +# Target: runFcheck (run the Fortran standalone executable - with C++ or CUDA/HIP MEs - fcheck.exe, with a small number of events) runFcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 - -# Target: runFGcheck (run the Fortran standalone 
executable - with CUDA/HIP MEs - fgcheck.exe, with a small number of events) -runFGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 +else + $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 +endif -# Target: cmpFcheck (compare ME results from the C++ and Fortran with C++ MEs standalone executables, with a small number of events) +# Target: cmpFcheck (compare ME results from the C++/CUDA/HIP and Fortran with C++/CUDA/HIP MEs standalone executables, with a small number of events) cmpFcheck: all.$(TAG) @echo - @echo "$(BUILDDIR)/check.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi - -# Target: cmpFGcheck (compare ME results from the CUDA/HIP and Fortran with CUDA/HIP MEs standalone executables, with a small number of events) -cmpFGcheck: all.$(TAG) - @echo - @echo "$(BUILDDIR)/gcheck.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fgcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! 
Fortran calculation (F77/GPU) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +ifeq ($(GPUCC),) + @echo "$(BUILDDIR)/check_cpp.exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_cpp.exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_cpp.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +else + @echo "$(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/GPU) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR!
Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +endif # Target: cuda-memcheck (run the CUDA standalone executable gcheck.exe with a small number of events through cuda-memcheck) cuda-memcheck: all.$(TAG) - $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/gcheck.exe -p 2 32 2 + $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk index 655d87b749..a66801a71e 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk @@ -98,7 +98,11 @@ endif # NB1: there are no CUDA targets in src as we avoid RDC! # NB2: CUDA includes for curand.h are no longer needed in the C++ code anywhere in src! 
-MG5AMC_COMMONLIB = mg5amc_common +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif # Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) @@ -126,7 +130,7 @@ $(LIBDIR)/.build.$(TAG): #------------------------------------------------------------------------------- # Generic target and build rules: objects from C++ compilation -$(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -139,22 +143,22 @@ endif #------------------------------------------------------------------------------- -cxx_objects=$(addprefix $(BUILDDIR)/, read_slha.o) -ifneq ($(GPUCC),) - gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_$(GPUSUFFIX).o) +cxx_objects=$(addprefix $(BUILDDIR)/, read_slha_cpp.o) +ifeq ($(GPUCC),) + cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_sm_cpp.o) else - cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_sm.o) + gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_$(GPUSUFFIX).o) endif # Target (and build rules): common (src) library -ifneq ($(GPUCC),) -$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) - @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) -else +ifeq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi $(CXX) -shared -o $@ $(cxx_objects) $(LDFLAGS) +else +$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) + @if [ ! 
-d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi + $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index b30b3f9586..2ce03bcd21 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005646944046020508  +DEBUG: model prefixing takes 0.005746126174926758  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.166 s +1 processes with 123 diagrams generated in 0.164 s Total: 1 processes with 123 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -178,7 +178,7 @@ INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 INFO: Creating files in directory P1_gg_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -193,15 +193,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -Generated helas calls for 1 subprocesses (123 diagrams) in 0.449 s -Wrote files for 222 helas calls in 0.721 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.452 s +Wrote files for 222 helas calls in 0.720 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.345 s +ALOHA: aloha creates 5 routines in 0.343 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -257,10 +257,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m3.460s -user 0m3.145s -sys 0m0.235s -Code generation completed in 3 seconds +real 0m3.664s +user 0m3.149s +sys 0m0.231s +Code generation completed in 4 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk index 50ce6bb9c8..89da34009a 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk @@ -54,65 +54,6 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA/HIP - -INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here - -# Dependency on src directory -MG5AMC_COMMONLIB = mg5amc_common -LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -INCFLAGS += -I../../src - -# Compiler-specific googletest build directory (#125 and #738) -ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) - override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) - override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) - override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else - override CXXNAME = unknown -endif -###$(info CXXNAME=$(CXXNAME)) -override CXXNAMESUFFIX = _$(CXXNAME) - -# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) -export CXXNAMESUFFIX - -# Dependency on test directory -# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) -# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) -###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation -###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation -TESTDIRCOMMON = ../../../../../test -TESTDIRLOCAL = ../../test -ifneq ($(wildcard $(GTEST_ROOT)),) - TESTDIR = -else ifneq ($(LOCALGTEST),) - TESTDIR=$(TESTDIRLOCAL) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) - TESTDIR = $(TESTDIRCOMMON) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else - TESTDIR = -endif -ifneq ($(GTEST_ROOT),) - GTESTLIBDIR = $(GTEST_ROOT)/lib64/ - GTESTLIBS = 
$(GTESTLIBDIR)/libgtest.a - GTESTINC = -I$(GTEST_ROOT)/include -else - GTESTLIBDIR = - GTESTLIBS = - GTESTINC = -endif -###$(info GTEST_ROOT = $(GTEST_ROOT)) -###$(info LOCALGTEST = $(LOCALGTEST)) -###$(info TESTDIR = $(TESTDIR)) - -#------------------------------------------------------------------------------- - #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available: eventually use native instead?) @@ -340,6 +281,69 @@ endif #------------------------------------------------------------------------------- +#=== Configure common compiler flags for C++ and CUDA/HIP + +INCFLAGS = -I. +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here + +# Dependency on src directory +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif +LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +INCFLAGS += -I../../src + +# Compiler-specific googletest build directory (#125 and #738) +ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) + override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) + override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else + override CXXNAME = unknown +endif +###$(info CXXNAME=$(CXXNAME)) +override CXXNAMESUFFIX = _$(CXXNAME) + +# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) +export CXXNAMESUFFIX + +# Dependency on test directory +# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) +# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an 
external or local gtest) +###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation +###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation +TESTDIRCOMMON = ../../../../../test +TESTDIRLOCAL = ../../test +ifneq ($(wildcard $(GTEST_ROOT)),) + TESTDIR = +else ifneq ($(LOCALGTEST),) + TESTDIR=$(TESTDIRLOCAL) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) + TESTDIR = $(TESTDIRCOMMON) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else + TESTDIR = +endif +ifneq ($(GTEST_ROOT),) + GTESTLIBDIR = $(GTEST_ROOT)/lib64/ + GTESTLIBS = $(GTESTLIBDIR)/libgtest.a + GTESTINC = -I$(GTEST_ROOT)/include +else + GTESTLIBDIR = + GTESTLIBS = + GTESTINC = +endif +###$(info GTEST_ROOT = $(GTEST_ROOT)) +###$(info LOCALGTEST = $(LOCALGTEST)) +###$(info TESTDIR = $(TESTDIR)) + +#------------------------------------------------------------------------------- + #=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) @@ -606,27 +610,25 @@ override RUNTIME = #=== Makefile TARGETS and build rules below #=============================================================================== -cxx_main=$(BUILDDIR)/check.exe -fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(GPUCC),) - gpu_main=$(BUILDDIR)/gcheck.exe - fgpu_main=$(BUILDDIR)/fgcheck.exe +ifeq ($(GPUCC),) + cxx_checkmain=$(BUILDDIR)/check_cpp.exe + cxx_fcheckmain=$(BUILDDIR)/fcheck_cpp.exe + cxx_testmain=$(BUILDDIR)/runTest_cpp.exe else - gpu_main= - fgpu_main= + gpu_checkmain=$(BUILDDIR)/check_$(GPUSUFFIX).exe + gpu_fcheckmain=$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe + gpu_testmain=$(BUILDDIR)/runTest_$(GPUSUFFIX).exe endif -testmain=$(BUILDDIR)/runTest.exe - # Explicitly define the default goal (this is not necessary as it is the first target, which is 
implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) # First target (default goal) -ifneq ($(GPUCC),) -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_main) $(fgpu_main) $(if $(GTESTLIBS),$(testmain)) +ifeq ($(GPUCC),) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_checkmain) $(cxx_fcheckmain) $(if $(GTESTLIBS),$(cxx_testmain)) else -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_main) $(fcxx_main) $(if $(GTESTLIBS),$(testmain)) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_checkmain) $(gpu_fcheckmain) $(if $(GTESTLIBS),$(gpu_testmain)) endif # Target (and build options): debug @@ -643,30 +645,30 @@ $(BUILDDIR)/.build.$(TAG): @if [ "$(oldtagsb)" != "" ]; then echo "Cannot build for tag=$(TAG) as old builds exist for other tags:"; echo " $(oldtagsb)"; echo "Please run 'make clean' first\nIf 'make clean' is not enough: run 'make clean USEBUILDDIR=1 AVX=$(AVX) FPTYPE=$(FPTYPE)' or 'make cleanall'"; exit 1; fi @touch $(BUILDDIR)/.build.$(TAG) -# Apply special build flags only to CrossSectionKernel[_$(GPUSUFFIX)].o (no fast math, see #117 and #516) +# Apply special build flags only to CrossSectionKernel_.o (no fast math, see #117 and #516) # Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math endif -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o (NVTX in timermap.h, #679) -$(BUILDDIR)/check_sa.o: CXXFLAGS += 
$(USE_NVTX) $(CUDA_INC) +# Apply special build flags only to check_sa_.o (NVTX in timermap.h, #679) +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o and (Cu|Hip)randRandomNumberKernel[_$(GPUSUFFIX)].o -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(RNDCXXFLAGS) +# Apply special build flags only to check_sa_.o and (Cu|Hip)randRandomNumberKernel_.o +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) ifeq ($(HASCURAND),hasCurand) # curand headers, #679 -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUDA_INC) +$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(CUDA_INC) endif ifeq ($(HASHIPRAND),hasHiprand) # hiprand headers -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(HIP_INC) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(HIP_INC) endif # Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) @@ -679,21 +681,21 @@ endif # Avoid clang warning "overriding '-ffp-contract=fast' option with '-ffp-contract=on'" (#516) # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) -###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -Wno-overriding-t-option ###ifneq ($(GPUCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option ###endif ###endif #### Apply special build flags only to CPPProcess.o (-flto) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += -flto +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += -flto #### Apply special build flags only to CPPProcess.o (AVXFLAGS) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += $(AVXFLAGS) +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += $(AVXFLAGS) # Generic target and build rules: objects from C++ compilation # (NB do not include CUDA_INC here! add it only for NVTX or curand #679) -$(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -718,8 +720,8 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o -cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) @@ -728,8 +730,8 @@ gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/ endif # Target (and build rules): C++ and CUDA/HIP shared libraries -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge.o -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge_cpp.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) @@ -756,63 +758,63 @@ endif #------------------------------------------------------------------------------- # Target (and build rules): C++ and CUDA/HIP standalone executables -$(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o - $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread 
$(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o $(RNDLIBFLAGS) +$(cxx_checkmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_checkmain): $(BUILDDIR)/check_sa_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o + $(CXX) -o $@ $(BUILDDIR)/check_sa_cpp.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o $(RNDLIBFLAGS) ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(gpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(gpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_checkmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_checkmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(gpu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(gpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(gpu_main): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o +$(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) 
$(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif #------------------------------------------------------------------------------- # Generic target and build rules: objects from Fortran compilation -$(BUILDDIR)/%.o : %.f *.inc +$(BUILDDIR)/%_fortran.o : %.f *.inc @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(FC) -I. -c $< -o $@ # Generic target and build rules: objects from Fortran compilation -###$(BUILDDIR)/%.o : %.f *.inc +###$(BUILDDIR)/%_fortran.o : %.f *.inc ### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi ### @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi ### $(FC) -I. -I$(INCDIR) -c $< -o $@ # Target (and build rules): Fortran standalone executables -###$(BUILDDIR)/fcheck_sa.o : $(INCDIR)/fbridge.inc +###$(BUILDDIR)/fcheck_sa_fortran.o : $(INCDIR)/fbridge.inc ifeq ($(UNAME_S),Darwin) -$(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(cxx_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +$(cxx_fcheckmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used 
#802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ else - $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) + $(CXX) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) endif ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fgpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(fgpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_fcheckmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_fcheckmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) -$(fgpu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fgpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fgpu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) +$(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link 
fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else - $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif endif @@ -820,83 +822,94 @@ endif # Target (and build rules): test objects and test executable ifeq ($(GPUCC),) -$(BUILDDIR)/testxxx.o: $(GTESTLIBS) -$(BUILDDIR)/testxxx.o: INCFLAGS += $(GTESTINC) -$(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions +$(BUILDDIR)/testxxx_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testxxx_cpp.o: INCFLAGS += $(GTESTINC) +$(BUILDDIR)/testxxx_cpp.o: testxxx_cc_ref.txt +$(cxx_testmain): $(BUILDDIR)/testxxx_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testxxx_cpp.o # Comment out this line to skip the C++ test of xxx functions else $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions +$(gpu_testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += 
$(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions endif ifneq ($(UNAME_S),Darwin) # Disable testmisc on Darwin (workaround for issue #838) ifeq ($(GPUCC),) -$(BUILDDIR)/testmisc.o: $(GTESTLIBS) -$(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests +$(BUILDDIR)/testmisc_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testmisc_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/testmisc_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testmisc_cpp.o # Comment out this line to skip the C++ miscellaneous tests else $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP miscellaneous tests +$(gpu_testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP miscellaneous tests endif endif ifeq ($(GPUCC),) -$(BUILDDIR)/runTest.o: $(GTESTLIBS) -$(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/runTest.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o +$(BUILDDIR)/runTest_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/runTest_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/runTest_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/runTest_cpp.o else $(BUILDDIR)/runTest_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/runTest_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to 
`__svml_cos4_l9') +$(gpu_testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o endif -$(testmain): $(GTESTLIBS) -$(testmain): INCFLAGS += $(GTESTINC) -$(testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest -###$(testmain): LIBFLAGS += -lgtest_main # no longer necessary since we added main() to testxxx.cc +ifeq ($(GPUCC),) +$(cxx_testmain): $(GTESTLIBS) +$(cxx_testmain): INCFLAGS += $(GTESTINC) +$(cxx_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +else +$(gpu_testmain): $(GTESTLIBS) +$(gpu_testmain): INCFLAGS += $(GTESTINC) +$(gpu_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +endif +ifeq ($(GPUCC),) # if at all, OMP is used only in CXX builds (not in GPU builds) ifneq ($(OMPFLAGS),) ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -$(testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) +$(cxx_testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -$(testmain): LIBFLAGS += -L $(shell 
dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 +$(cxx_testmain): LIBFLAGS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 ###else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -###$(testmain): LIBFLAGS += ???? # OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) +###$(cxx_testmain): LIBFLAGS += ???? # OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) else -$(testmain): LIBFLAGS += -lgomp +$(cxx_testmain): LIBFLAGS += -lgomp +endif endif endif # Test quadmath in testmisc.cc tests for constexpr_math #627 -###$(testmain): LIBFLAGS += -lquadmath +###ifeq ($(GPUCC),) +###$(cxx_testmain): LIBFLAGS += -lquadmath +###else +###$(gpu_testmain): LIBFLAGS += -lquadmath +###endif # Bypass std::filesystem completely to ease portability on LUMI #803 -#ifneq ($(findstring hipcc,$(GPUCC)),) -#$(testmain): LIBFLAGS += -lstdc++fs -#endif +###ifneq ($(findstring hipcc,$(GPUCC)),) +###$(gpu_testmain): LIBFLAGS += -lstdc++fs +###endif -ifeq ($(GPUCC),) # link only runTest.o -$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) +ifeq ($(GPUCC),) # link only runTest_cpp.o +$(cxx_testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest.o and runTest_$(GPUSUFFIX).o) -$(testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) +else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and 
runTest_$(GPUSUFFIX).o) +$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else @@ -1051,51 +1064,50 @@ endif #------------------------------------------------------------------------------- -# Target: check (run the C++ test executable) -# [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -# [FIXME: SHOULD CHANGE THE TARGET NAME "check" THAT HAS NOTHING TO DO WITH "check.exe"] -ifneq ($(GPUCC),) -check: runTest cmpFGcheck -else -check: runTest cmpFcheck -endif +# Target: 'make test' (execute runTest.exe, and compare check.exe with fcheck.exe) +# [NB: THIS IS WHAT IS TESTED IN THE GITHUB CI!] +# [NB: This used to be called 'make check' but the name has been changed as this has nothing to do with 'check.exe'] +test: runTest cmpFcheck -# Target: runTest (run the C++ test executable runTest.exe) +# Target: runTest (run the C++ or CUDA/HIP test executable runTest.exe) runTest: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/runTest.exe +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/runTest_cpp.exe +else + $(RUNTIME) $(BUILDDIR)/runTest_$(GPUSUFFIX).exe +endif -# Target: runCheck (run the C++ standalone executable check.exe, with a small number of events) +# Target: runCheck (run the C++ or CUDA/HIP standalone executable check.exe, with a small number of events) runCheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 - -# Target: runGcheck (run the CUDA/HIP standalone executable gcheck.exe, with a small number of events) -runGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/gcheck.exe -p 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/check_cpp.exe -p 2 32 2 +else + $(RUNTIME) 
$(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +endif -# Target: runFcheck (run the Fortran standalone executable - with C++ MEs - fcheck.exe, with a small number of events) +# Target: runFcheck (run the Fortran standalone executable - with C++ or CUDA/HIP MEs - fcheck.exe, with a small number of events) runFcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 - -# Target: runFGcheck (run the Fortran standalone executable - with CUDA/HIP MEs - fgcheck.exe, with a small number of events) -runFGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 +else + $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 +endif -# Target: cmpFcheck (compare ME results from the C++ and Fortran with C++ MEs standalone executables, with a small number of events) +# Target: cmpFcheck (compare ME results from the C++/CUDA/HIP and Fortran with C++/CUDA/HIP MEs standalone executables, with a small number of events) cmpFcheck: all.$(TAG) @echo - @echo "$(BUILDDIR)/check.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi - -# Target: cmpFGcheck (compare ME results from the CUDA/HIP and Fortran with CUDA/HIP MEs standalone executables, with a small number of events) -cmpFGcheck: all.$(TAG) - @echo - @echo "$(BUILDDIR)/gcheck.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fgcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +ifeq ($(GPUCC),) + @echo "$(BUILDDIR)/check_cpp.exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_cpp.exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_cpp.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +else + @echo "$(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +endif # Target: cuda-memcheck (run the CUDA standalone executable gcheck.exe with a small number of events through cuda-memcheck) cuda-memcheck: all.$(TAG) - $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/gcheck.exe -p 2 32 2 + $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 
#------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile index 22517d0cc6..6a21300f7a 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile @@ -58,13 +58,15 @@ endif LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias CUDACPP_MAKEFILE=cudacpp.mk -CUDACPP_COMMONLIB=mg5amc_common processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ifeq ($(BACKEND),cuda) +CUDACPP_COMMONLIB=mg5amc_common_cuda CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cuda else ifeq ($(BACKEND),hip) +CUDACPP_COMMONLIB=mg5amc_common_hip CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_hip else +CUDACPP_COMMONLIB=mg5amc_common_cpp CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cpp endif diff --git a/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk index 655d87b749..a66801a71e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk @@ -98,7 +98,11 @@ endif # NB1: there are no CUDA targets in src as we avoid RDC! # NB2: CUDA includes for curand.h are no longer needed in the C++ code anywhere in src! -MG5AMC_COMMONLIB = mg5amc_common +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif # Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) @@ -126,7 +130,7 @@ $(LIBDIR)/.build.$(TAG): #------------------------------------------------------------------------------- # Generic target and build rules: objects from C++ compilation -$(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -139,22 +143,22 @@ endif #------------------------------------------------------------------------------- -cxx_objects=$(addprefix $(BUILDDIR)/, read_slha.o) -ifneq ($(GPUCC),) - gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_$(GPUSUFFIX).o) +cxx_objects=$(addprefix $(BUILDDIR)/, read_slha_cpp.o) +ifeq ($(GPUCC),) + cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_sm_cpp.o) else - cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_sm.o) + gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_$(GPUSUFFIX).o) endif # Target (and build rules): common (src) library -ifneq ($(GPUCC),) -$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) - @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) -else +ifeq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi $(CXX) -shared -o $@ $(cxx_objects) $(LDFLAGS) +else +$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) + @if [ ! 
-d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi + $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index 91c34f2ddf..88a6d07d2d 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -186,7 +186,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.334 s +ALOHA: aloha creates 5 routines in 0.333 s VVV1 VVV1 FFV1 @@ -209,7 +209,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. quit -real 0m1.548s -user 0m1.422s -sys 0m0.066s +real 0m1.630s +user 0m1.428s +sys 0m0.058s Code generation completed in 2 seconds diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk index 50ce6bb9c8..89da34009a 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk @@ -54,65 +54,6 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA/HIP - -INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here - -# Dependency on src directory -MG5AMC_COMMONLIB = mg5amc_common -LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -INCFLAGS += -I../../src - -# Compiler-specific googletest build directory (#125 and #738) -ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) - override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) - override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) - override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else - override CXXNAME = unknown -endif -###$(info CXXNAME=$(CXXNAME)) -override CXXNAMESUFFIX = _$(CXXNAME) - -# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) -export CXXNAMESUFFIX - -# Dependency on test directory -# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) -# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) -###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation -###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation -TESTDIRCOMMON = ../../../../../test -TESTDIRLOCAL = ../../test -ifneq ($(wildcard $(GTEST_ROOT)),) - TESTDIR = -else ifneq ($(LOCALGTEST),) - TESTDIR=$(TESTDIRLOCAL) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) - TESTDIR = $(TESTDIRCOMMON) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else - TESTDIR = -endif -ifneq ($(GTEST_ROOT),) - GTESTLIBDIR = $(GTEST_ROOT)/lib64/ - GTESTLIBS = 
$(GTESTLIBDIR)/libgtest.a - GTESTINC = -I$(GTEST_ROOT)/include -else - GTESTLIBDIR = - GTESTLIBS = - GTESTINC = -endif -###$(info GTEST_ROOT = $(GTEST_ROOT)) -###$(info LOCALGTEST = $(LOCALGTEST)) -###$(info TESTDIR = $(TESTDIR)) - -#------------------------------------------------------------------------------- - #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available: eventually use native instead?) @@ -340,6 +281,69 @@ endif #------------------------------------------------------------------------------- +#=== Configure common compiler flags for C++ and CUDA/HIP + +INCFLAGS = -I. +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here + +# Dependency on src directory +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif +LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +INCFLAGS += -I../../src + +# Compiler-specific googletest build directory (#125 and #738) +ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) + override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) + override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else + override CXXNAME = unknown +endif +###$(info CXXNAME=$(CXXNAME)) +override CXXNAMESUFFIX = _$(CXXNAME) + +# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) +export CXXNAMESUFFIX + +# Dependency on test directory +# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) +# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an 
external or local gtest) +###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation +###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation +TESTDIRCOMMON = ../../../../../test +TESTDIRLOCAL = ../../test +ifneq ($(wildcard $(GTEST_ROOT)),) + TESTDIR = +else ifneq ($(LOCALGTEST),) + TESTDIR=$(TESTDIRLOCAL) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) + TESTDIR = $(TESTDIRCOMMON) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else + TESTDIR = +endif +ifneq ($(GTEST_ROOT),) + GTESTLIBDIR = $(GTEST_ROOT)/lib64/ + GTESTLIBS = $(GTESTLIBDIR)/libgtest.a + GTESTINC = -I$(GTEST_ROOT)/include +else + GTESTLIBDIR = + GTESTLIBS = + GTESTINC = +endif +###$(info GTEST_ROOT = $(GTEST_ROOT)) +###$(info LOCALGTEST = $(LOCALGTEST)) +###$(info TESTDIR = $(TESTDIR)) + +#------------------------------------------------------------------------------- + #=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) @@ -606,27 +610,25 @@ override RUNTIME = #=== Makefile TARGETS and build rules below #=============================================================================== -cxx_main=$(BUILDDIR)/check.exe -fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(GPUCC),) - gpu_main=$(BUILDDIR)/gcheck.exe - fgpu_main=$(BUILDDIR)/fgcheck.exe +ifeq ($(GPUCC),) + cxx_checkmain=$(BUILDDIR)/check_cpp.exe + cxx_fcheckmain=$(BUILDDIR)/fcheck_cpp.exe + cxx_testmain=$(BUILDDIR)/runTest_cpp.exe else - gpu_main= - fgpu_main= + gpu_checkmain=$(BUILDDIR)/check_$(GPUSUFFIX).exe + gpu_fcheckmain=$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe + gpu_testmain=$(BUILDDIR)/runTest_$(GPUSUFFIX).exe endif -testmain=$(BUILDDIR)/runTest.exe - # Explicitly define the default goal (this is not necessary as it is the first target, which is 
implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) # First target (default goal) -ifneq ($(GPUCC),) -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_main) $(fgpu_main) $(if $(GTESTLIBS),$(testmain)) +ifeq ($(GPUCC),) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_checkmain) $(cxx_fcheckmain) $(if $(GTESTLIBS),$(cxx_testmain)) else -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_main) $(fcxx_main) $(if $(GTESTLIBS),$(testmain)) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_checkmain) $(gpu_fcheckmain) $(if $(GTESTLIBS),$(gpu_testmain)) endif # Target (and build options): debug @@ -643,30 +645,30 @@ $(BUILDDIR)/.build.$(TAG): @if [ "$(oldtagsb)" != "" ]; then echo "Cannot build for tag=$(TAG) as old builds exist for other tags:"; echo " $(oldtagsb)"; echo "Please run 'make clean' first\nIf 'make clean' is not enough: run 'make clean USEBUILDDIR=1 AVX=$(AVX) FPTYPE=$(FPTYPE)' or 'make cleanall'"; exit 1; fi @touch $(BUILDDIR)/.build.$(TAG) -# Apply special build flags only to CrossSectionKernel[_$(GPUSUFFIX)].o (no fast math, see #117 and #516) +# Apply special build flags only to CrossSectionKernel_.o (no fast math, see #117 and #516) # Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math endif -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o (NVTX in timermap.h, #679) -$(BUILDDIR)/check_sa.o: CXXFLAGS += 
$(USE_NVTX) $(CUDA_INC) +# Apply special build flags only to check_sa_.o (NVTX in timermap.h, #679) +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o and (Cu|Hip)randRandomNumberKernel[_$(GPUSUFFIX)].o -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(RNDCXXFLAGS) +# Apply special build flags only to check_sa_.o and (Cu|Hip)randRandomNumberKernel_.o +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) ifeq ($(HASCURAND),hasCurand) # curand headers, #679 -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUDA_INC) +$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(CUDA_INC) endif ifeq ($(HASHIPRAND),hasHiprand) # hiprand headers -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(HIP_INC) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(HIP_INC) endif # Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) @@ -679,21 +681,21 @@ endif # Avoid clang warning "overriding '-ffp-contract=fast' option with '-ffp-contract=on'" (#516) # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) -###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -Wno-overriding-t-option ###ifneq ($(GPUCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option ###endif ###endif #### Apply special build flags only to CPPProcess.o (-flto) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += -flto +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += -flto #### Apply special build flags only to CPPProcess.o (AVXFLAGS) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += $(AVXFLAGS) +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += $(AVXFLAGS) # Generic target and build rules: objects from C++ compilation # (NB do not include CUDA_INC here! add it only for NVTX or curand #679) -$(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -718,8 +720,8 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o -cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) @@ -728,8 +730,8 @@ gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/ endif # Target (and build rules): C++ and CUDA/HIP shared libraries -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge.o -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge_cpp.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) @@ -756,63 +758,63 @@ endif #------------------------------------------------------------------------------- # Target (and build rules): C++ and CUDA/HIP standalone executables -$(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o - $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread 
$(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o $(RNDLIBFLAGS) +$(cxx_checkmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_checkmain): $(BUILDDIR)/check_sa_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o + $(CXX) -o $@ $(BUILDDIR)/check_sa_cpp.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o $(RNDLIBFLAGS) ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(gpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(gpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_checkmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_checkmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(gpu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(gpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(gpu_main): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o +$(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) 
$(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif #------------------------------------------------------------------------------- # Generic target and build rules: objects from Fortran compilation -$(BUILDDIR)/%.o : %.f *.inc +$(BUILDDIR)/%_fortran.o : %.f *.inc @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(FC) -I. -c $< -o $@ # Generic target and build rules: objects from Fortran compilation -###$(BUILDDIR)/%.o : %.f *.inc +###$(BUILDDIR)/%_fortran.o : %.f *.inc ### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi ### @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi ### $(FC) -I. -I$(INCDIR) -c $< -o $@ # Target (and build rules): Fortran standalone executables -###$(BUILDDIR)/fcheck_sa.o : $(INCDIR)/fbridge.inc +###$(BUILDDIR)/fcheck_sa_fortran.o : $(INCDIR)/fbridge.inc ifeq ($(UNAME_S),Darwin) -$(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(cxx_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +$(cxx_fcheckmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used 
#802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ else - $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) + $(CXX) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) endif ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fgpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(fgpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_fcheckmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_fcheckmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) -$(fgpu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fgpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fgpu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) +$(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link 
fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else - $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif endif @@ -820,83 +822,94 @@ endif # Target (and build rules): test objects and test executable ifeq ($(GPUCC),) -$(BUILDDIR)/testxxx.o: $(GTESTLIBS) -$(BUILDDIR)/testxxx.o: INCFLAGS += $(GTESTINC) -$(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions +$(BUILDDIR)/testxxx_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testxxx_cpp.o: INCFLAGS += $(GTESTINC) +$(BUILDDIR)/testxxx_cpp.o: testxxx_cc_ref.txt +$(cxx_testmain): $(BUILDDIR)/testxxx_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testxxx_cpp.o # Comment out this line to skip the C++ test of xxx functions else $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions +$(gpu_testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += 
$(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions endif ifneq ($(UNAME_S),Darwin) # Disable testmisc on Darwin (workaround for issue #838) ifeq ($(GPUCC),) -$(BUILDDIR)/testmisc.o: $(GTESTLIBS) -$(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests +$(BUILDDIR)/testmisc_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testmisc_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/testmisc_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testmisc_cpp.o # Comment out this line to skip the C++ miscellaneous tests else $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP miscellaneous tests +$(gpu_testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP miscellaneous tests endif endif ifeq ($(GPUCC),) -$(BUILDDIR)/runTest.o: $(GTESTLIBS) -$(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/runTest.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o +$(BUILDDIR)/runTest_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/runTest_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/runTest_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/runTest_cpp.o else $(BUILDDIR)/runTest_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/runTest_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to 
`__svml_cos4_l9') +$(gpu_testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o endif -$(testmain): $(GTESTLIBS) -$(testmain): INCFLAGS += $(GTESTINC) -$(testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest -###$(testmain): LIBFLAGS += -lgtest_main # no longer necessary since we added main() to testxxx.cc +ifeq ($(GPUCC),) +$(cxx_testmain): $(GTESTLIBS) +$(cxx_testmain): INCFLAGS += $(GTESTINC) +$(cxx_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +else +$(gpu_testmain): $(GTESTLIBS) +$(gpu_testmain): INCFLAGS += $(GTESTINC) +$(gpu_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +endif +ifeq ($(GPUCC),) # if at all, OMP is used only in CXX builds (not in GPU builds) ifneq ($(OMPFLAGS),) ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -$(testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) +$(cxx_testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -$(testmain): LIBFLAGS += -L $(shell 
dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 +$(cxx_testmain): LIBFLAGS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 ###else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -###$(testmain): LIBFLAGS += ???? # OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) +###$(cxx_testmain): LIBFLAGS += ???? # OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) else -$(testmain): LIBFLAGS += -lgomp +$(cxx_testmain): LIBFLAGS += -lgomp +endif endif endif # Test quadmath in testmisc.cc tests for constexpr_math #627 -###$(testmain): LIBFLAGS += -lquadmath +###ifeq ($(GPUCC),) +###$(cxx_testmain): LIBFLAGS += -lquadmath +###else +###$(gpu_testmain): LIBFLAGS += -lquadmath +###endif # Bypass std::filesystem completely to ease portability on LUMI #803 -#ifneq ($(findstring hipcc,$(GPUCC)),) -#$(testmain): LIBFLAGS += -lstdc++fs -#endif +###ifneq ($(findstring hipcc,$(GPUCC)),) +###$(gpu_testmain): LIBFLAGS += -lstdc++fs +###endif -ifeq ($(GPUCC),) # link only runTest.o -$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) +ifeq ($(GPUCC),) # link only runTest_cpp.o +$(cxx_testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest.o and runTest_$(GPUSUFFIX).o) -$(testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) +else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and 
runTest_$(GPUSUFFIX).o) +$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else @@ -1051,51 +1064,50 @@ endif #------------------------------------------------------------------------------- -# Target: check (run the C++ test executable) -# [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -# [FIXME: SHOULD CHANGE THE TARGET NAME "check" THAT HAS NOTHING TO DO WITH "check.exe"] -ifneq ($(GPUCC),) -check: runTest cmpFGcheck -else -check: runTest cmpFcheck -endif +# Target: 'make test' (execute runTest.exe, and compare check.exe with fcheck.exe) +# [NB: THIS IS WHAT IS TESTED IN THE GITHUB CI!] +# [NB: This used to be called 'make check' but the name has been changed as this has nothing to do with 'check.exe'] +test: runTest cmpFcheck -# Target: runTest (run the C++ test executable runTest.exe) +# Target: runTest (run the C++ or CUDA/HIP test executable runTest.exe) runTest: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/runTest.exe +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/runTest_cpp.exe +else + $(RUNTIME) $(BUILDDIR)/runTest_$(GPUSUFFIX).exe +endif -# Target: runCheck (run the C++ standalone executable check.exe, with a small number of events) +# Target: runCheck (run the C++ or CUDA/HIP standalone executable check.exe, with a small number of events) runCheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 - -# Target: runGcheck (run the CUDA/HIP standalone executable gcheck.exe, with a small number of events) -runGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/gcheck.exe -p 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/check_cpp.exe -p 2 32 2 +else + $(RUNTIME) 
$(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +endif -# Target: runFcheck (run the Fortran standalone executable - with C++ MEs - fcheck.exe, with a small number of events) +# Target: runFcheck (run the Fortran standalone executable - with C++ or CUDA/HIP MEs - fcheck.exe, with a small number of events) runFcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 - -# Target: runFGcheck (run the Fortran standalone executable - with CUDA/HIP MEs - fgcheck.exe, with a small number of events) -runFGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 +else + $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 +endif -# Target: cmpFcheck (compare ME results from the C++ and Fortran with C++ MEs standalone executables, with a small number of events) +# Target: cmpFcheck (compare ME results from the C++/CUDA/HIP and Fortran with C++/CUDA/HIP MEs standalone executables, with a small number of events) cmpFcheck: all.$(TAG) @echo - @echo "$(BUILDDIR)/check.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi - -# Target: cmpFGcheck (compare ME results from the CUDA/HIP and Fortran with CUDA/HIP MEs standalone executables, with a small number of events) -cmpFGcheck: all.$(TAG) - @echo - @echo "$(BUILDDIR)/gcheck.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fgcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +ifeq ($(GPUCC),) + @echo "$(BUILDDIR)/check_cpp.exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_cpp.exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_cpp.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +else + @echo "$(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +endif # Target: cuda-memcheck (run the CUDA standalone executable gcheck.exe with a small number of events through cuda-memcheck) cuda-memcheck: all.$(TAG) - $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/gcheck.exe -p 2 32 2 + $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 
#------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk index 655d87b749..a66801a71e 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk @@ -98,7 +98,11 @@ endif # NB1: there are no CUDA targets in src as we avoid RDC! # NB2: CUDA includes for curand.h are no longer needed in the C++ code anywhere in src! -MG5AMC_COMMONLIB = mg5amc_common +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif # Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) @@ -126,7 +130,7 @@ $(LIBDIR)/.build.$(TAG): #------------------------------------------------------------------------------- # Generic target and build rules: objects from C++ compilation -$(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -139,22 +143,22 @@ endif #------------------------------------------------------------------------------- -cxx_objects=$(addprefix $(BUILDDIR)/, read_slha.o) -ifneq ($(GPUCC),) - gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_$(GPUSUFFIX).o) +cxx_objects=$(addprefix $(BUILDDIR)/, read_slha_cpp.o) +ifeq ($(GPUCC),) + cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_sm_cpp.o) else - cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_sm.o) + gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_$(GPUSUFFIX).o) endif # Target (and build rules): common (src) library -ifneq ($(GPUCC),) -$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) - @if [ ! 
-d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) -else +ifeq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi $(CXX) -shared -o $@ $(cxx_objects) $(LDFLAGS) +else +$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) + @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi + $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index ebb3d7f3f9..2588d0cd2e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005715847015380859  +DEBUG: model prefixing takes 0.005673885345458984  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.959 s +1 processes with 1240 diagrams generated in 1.945 s Total: 1 processes with 1240 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -180,7 +180,7 @@ INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] INFO: Color-Flow passed to 1630 term in 8s. Introduce 3030 contraction DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -195,15 +195,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -Generated helas calls for 1 subprocesses (1240 diagrams) in 6.869 s -Wrote files for 2281 helas calls in 19.196 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.855 s +Wrote files for 2281 helas calls in 19.401 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.329 s +ALOHA: aloha creates 5 routines in 0.330 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -211,7 +211,7 @@ ALOHA: aloha creates FFV1 routines 
ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.323 s +ALOHA: aloha creates 10 routines in 0.322 s VVV1 VVV1 FFV1 @@ -259,9 +259,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m30.424s -user 0m29.841s -sys 0m0.390s +real 0m30.499s +user 0m29.983s +sys 0m0.419s Code generation completed in 31 seconds ************************************************************ * * diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk index 50ce6bb9c8..89da34009a 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk @@ -54,65 +54,6 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA/HIP - -INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here - -# Dependency on src directory -MG5AMC_COMMONLIB = mg5amc_common -LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -INCFLAGS += -I../../src - -# Compiler-specific googletest build directory (#125 and #738) -ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) - override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) - override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) - override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else - override CXXNAME = unknown -endif -###$(info CXXNAME=$(CXXNAME)) -override CXXNAMESUFFIX = _$(CXXNAME) - -# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) -export CXXNAMESUFFIX - -# Dependency on test directory -# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) -# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) -###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation -###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation -TESTDIRCOMMON = ../../../../../test -TESTDIRLOCAL = ../../test -ifneq ($(wildcard $(GTEST_ROOT)),) - TESTDIR = -else ifneq ($(LOCALGTEST),) - TESTDIR=$(TESTDIRLOCAL) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) - TESTDIR = $(TESTDIRCOMMON) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else - TESTDIR = -endif -ifneq ($(GTEST_ROOT),) - GTESTLIBDIR = $(GTEST_ROOT)/lib64/ - GTESTLIBS = 
$(GTESTLIBDIR)/libgtest.a - GTESTINC = -I$(GTEST_ROOT)/include -else - GTESTLIBDIR = - GTESTLIBS = - GTESTINC = -endif -###$(info GTEST_ROOT = $(GTEST_ROOT)) -###$(info LOCALGTEST = $(LOCALGTEST)) -###$(info TESTDIR = $(TESTDIR)) - -#------------------------------------------------------------------------------- - #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available: eventually use native instead?) @@ -340,6 +281,69 @@ endif #------------------------------------------------------------------------------- +#=== Configure common compiler flags for C++ and CUDA/HIP + +INCFLAGS = -I. +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here + +# Dependency on src directory +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif +LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +INCFLAGS += -I../../src + +# Compiler-specific googletest build directory (#125 and #738) +ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) + override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) + override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else + override CXXNAME = unknown +endif +###$(info CXXNAME=$(CXXNAME)) +override CXXNAMESUFFIX = _$(CXXNAME) + +# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) +export CXXNAMESUFFIX + +# Dependency on test directory +# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) +# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an 
external or local gtest) +###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation +###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation +TESTDIRCOMMON = ../../../../../test +TESTDIRLOCAL = ../../test +ifneq ($(wildcard $(GTEST_ROOT)),) + TESTDIR = +else ifneq ($(LOCALGTEST),) + TESTDIR=$(TESTDIRLOCAL) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) + TESTDIR = $(TESTDIRCOMMON) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else + TESTDIR = +endif +ifneq ($(GTEST_ROOT),) + GTESTLIBDIR = $(GTEST_ROOT)/lib64/ + GTESTLIBS = $(GTESTLIBDIR)/libgtest.a + GTESTINC = -I$(GTEST_ROOT)/include +else + GTESTLIBDIR = + GTESTLIBS = + GTESTINC = +endif +###$(info GTEST_ROOT = $(GTEST_ROOT)) +###$(info LOCALGTEST = $(LOCALGTEST)) +###$(info TESTDIR = $(TESTDIR)) + +#------------------------------------------------------------------------------- + #=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) @@ -606,27 +610,25 @@ override RUNTIME = #=== Makefile TARGETS and build rules below #=============================================================================== -cxx_main=$(BUILDDIR)/check.exe -fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(GPUCC),) - gpu_main=$(BUILDDIR)/gcheck.exe - fgpu_main=$(BUILDDIR)/fgcheck.exe +ifeq ($(GPUCC),) + cxx_checkmain=$(BUILDDIR)/check_cpp.exe + cxx_fcheckmain=$(BUILDDIR)/fcheck_cpp.exe + cxx_testmain=$(BUILDDIR)/runTest_cpp.exe else - gpu_main= - fgpu_main= + gpu_checkmain=$(BUILDDIR)/check_$(GPUSUFFIX).exe + gpu_fcheckmain=$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe + gpu_testmain=$(BUILDDIR)/runTest_$(GPUSUFFIX).exe endif -testmain=$(BUILDDIR)/runTest.exe - # Explicitly define the default goal (this is not necessary as it is the first target, which is 
implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) # First target (default goal) -ifneq ($(GPUCC),) -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_main) $(fgpu_main) $(if $(GTESTLIBS),$(testmain)) +ifeq ($(GPUCC),) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_checkmain) $(cxx_fcheckmain) $(if $(GTESTLIBS),$(cxx_testmain)) else -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_main) $(fcxx_main) $(if $(GTESTLIBS),$(testmain)) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_checkmain) $(gpu_fcheckmain) $(if $(GTESTLIBS),$(gpu_testmain)) endif # Target (and build options): debug @@ -643,30 +645,30 @@ $(BUILDDIR)/.build.$(TAG): @if [ "$(oldtagsb)" != "" ]; then echo "Cannot build for tag=$(TAG) as old builds exist for other tags:"; echo " $(oldtagsb)"; echo "Please run 'make clean' first\nIf 'make clean' is not enough: run 'make clean USEBUILDDIR=1 AVX=$(AVX) FPTYPE=$(FPTYPE)' or 'make cleanall'"; exit 1; fi @touch $(BUILDDIR)/.build.$(TAG) -# Apply special build flags only to CrossSectionKernel[_$(GPUSUFFIX)].o (no fast math, see #117 and #516) +# Apply special build flags only to CrossSectionKernel_.o (no fast math, see #117 and #516) # Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math endif -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o (NVTX in timermap.h, #679) -$(BUILDDIR)/check_sa.o: CXXFLAGS += 
$(USE_NVTX) $(CUDA_INC) +# Apply special build flags only to check_sa_.o (NVTX in timermap.h, #679) +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o and (Cu|Hip)randRandomNumberKernel[_$(GPUSUFFIX)].o -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(RNDCXXFLAGS) +# Apply special build flags only to check_sa_.o and (Cu|Hip)randRandomNumberKernel_.o +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) ifeq ($(HASCURAND),hasCurand) # curand headers, #679 -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUDA_INC) +$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(CUDA_INC) endif ifeq ($(HASHIPRAND),hasHiprand) # hiprand headers -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(HIP_INC) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(HIP_INC) endif # Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) @@ -679,21 +681,21 @@ endif # Avoid clang warning "overriding '-ffp-contract=fast' option with '-ffp-contract=on'" (#516) # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) -###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -Wno-overriding-t-option ###ifneq ($(GPUCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option ###endif ###endif #### Apply special build flags only to CPPProcess.o (-flto) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += -flto +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += -flto #### Apply special build flags only to CPPProcess.o (AVXFLAGS) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += $(AVXFLAGS) +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += $(AVXFLAGS) # Generic target and build rules: objects from C++ compilation # (NB do not include CUDA_INC here! add it only for NVTX or curand #679) -$(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -718,8 +720,8 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o -cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) @@ -728,8 +730,8 @@ gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/ endif # Target (and build rules): C++ and CUDA/HIP shared libraries -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge.o -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge_cpp.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) @@ -756,63 +758,63 @@ endif #------------------------------------------------------------------------------- # Target (and build rules): C++ and CUDA/HIP standalone executables -$(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o - $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread 
$(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o $(RNDLIBFLAGS) +$(cxx_checkmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_checkmain): $(BUILDDIR)/check_sa_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o + $(CXX) -o $@ $(BUILDDIR)/check_sa_cpp.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o $(RNDLIBFLAGS) ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(gpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(gpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_checkmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_checkmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(gpu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(gpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(gpu_main): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o +$(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) 
$(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif #------------------------------------------------------------------------------- # Generic target and build rules: objects from Fortran compilation -$(BUILDDIR)/%.o : %.f *.inc +$(BUILDDIR)/%_fortran.o : %.f *.inc @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(FC) -I. -c $< -o $@ # Generic target and build rules: objects from Fortran compilation -###$(BUILDDIR)/%.o : %.f *.inc +###$(BUILDDIR)/%_fortran.o : %.f *.inc ### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi ### @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi ### $(FC) -I. -I$(INCDIR) -c $< -o $@ # Target (and build rules): Fortran standalone executables -###$(BUILDDIR)/fcheck_sa.o : $(INCDIR)/fbridge.inc +###$(BUILDDIR)/fcheck_sa_fortran.o : $(INCDIR)/fbridge.inc ifeq ($(UNAME_S),Darwin) -$(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(cxx_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +$(cxx_fcheckmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used 
#802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ else - $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) + $(CXX) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) endif ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fgpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(fgpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_fcheckmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_fcheckmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) -$(fgpu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fgpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fgpu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) +$(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link 
fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else - $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif endif @@ -820,83 +822,94 @@ endif # Target (and build rules): test objects and test executable ifeq ($(GPUCC),) -$(BUILDDIR)/testxxx.o: $(GTESTLIBS) -$(BUILDDIR)/testxxx.o: INCFLAGS += $(GTESTINC) -$(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions +$(BUILDDIR)/testxxx_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testxxx_cpp.o: INCFLAGS += $(GTESTINC) +$(BUILDDIR)/testxxx_cpp.o: testxxx_cc_ref.txt +$(cxx_testmain): $(BUILDDIR)/testxxx_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testxxx_cpp.o # Comment out this line to skip the C++ test of xxx functions else $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions +$(gpu_testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += 
$(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions endif ifneq ($(UNAME_S),Darwin) # Disable testmisc on Darwin (workaround for issue #838) ifeq ($(GPUCC),) -$(BUILDDIR)/testmisc.o: $(GTESTLIBS) -$(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests +$(BUILDDIR)/testmisc_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testmisc_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/testmisc_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testmisc_cpp.o # Comment out this line to skip the C++ miscellaneous tests else $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP miscellaneous tests +$(gpu_testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP miscellaneous tests endif endif ifeq ($(GPUCC),) -$(BUILDDIR)/runTest.o: $(GTESTLIBS) -$(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/runTest.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o +$(BUILDDIR)/runTest_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/runTest_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/runTest_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/runTest_cpp.o else $(BUILDDIR)/runTest_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/runTest_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to 
`__svml_cos4_l9') +$(gpu_testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o endif -$(testmain): $(GTESTLIBS) -$(testmain): INCFLAGS += $(GTESTINC) -$(testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest -###$(testmain): LIBFLAGS += -lgtest_main # no longer necessary since we added main() to testxxx.cc +ifeq ($(GPUCC),) +$(cxx_testmain): $(GTESTLIBS) +$(cxx_testmain): INCFLAGS += $(GTESTINC) +$(cxx_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +else +$(gpu_testmain): $(GTESTLIBS) +$(gpu_testmain): INCFLAGS += $(GTESTINC) +$(gpu_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +endif +ifeq ($(GPUCC),) # if at all, OMP is used only in CXX builds (not in GPU builds) ifneq ($(OMPFLAGS),) ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -$(testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) +$(cxx_testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -$(testmain): LIBFLAGS += -L $(shell 
dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 +$(cxx_testmain): LIBFLAGS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 ###else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -###$(testmain): LIBFLAGS += ???? # OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) +###$(cxx_testmain): LIBFLAGS += ???? # OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) else -$(testmain): LIBFLAGS += -lgomp +$(cxx_testmain): LIBFLAGS += -lgomp +endif endif endif # Test quadmath in testmisc.cc tests for constexpr_math #627 -###$(testmain): LIBFLAGS += -lquadmath +###ifeq ($(GPUCC),) +###$(cxx_testmain): LIBFLAGS += -lquadmath +###else +###$(gpu_testmain): LIBFLAGS += -lquadmath +###endif # Bypass std::filesystem completely to ease portability on LUMI #803 -#ifneq ($(findstring hipcc,$(GPUCC)),) -#$(testmain): LIBFLAGS += -lstdc++fs -#endif +###ifneq ($(findstring hipcc,$(GPUCC)),) +###$(gpu_testmain): LIBFLAGS += -lstdc++fs +###endif -ifeq ($(GPUCC),) # link only runTest.o -$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) +ifeq ($(GPUCC),) # link only runTest_cpp.o +$(cxx_testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest.o and runTest_$(GPUSUFFIX).o) -$(testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) +else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and 
runTest_$(GPUSUFFIX).o) +$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else @@ -1051,51 +1064,50 @@ endif #------------------------------------------------------------------------------- -# Target: check (run the C++ test executable) -# [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -# [FIXME: SHOULD CHANGE THE TARGET NAME "check" THAT HAS NOTHING TO DO WITH "check.exe"] -ifneq ($(GPUCC),) -check: runTest cmpFGcheck -else -check: runTest cmpFcheck -endif +# Target: 'make test' (execute runTest.exe, and compare check.exe with fcheck.exe) +# [NB: THIS IS WHAT IS TESTED IN THE GITHUB CI!] +# [NB: This used to be called 'make check' but the name has been changed as this has nothing to do with 'check.exe'] +test: runTest cmpFcheck -# Target: runTest (run the C++ test executable runTest.exe) +# Target: runTest (run the C++ or CUDA/HIP test executable runTest.exe) runTest: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/runTest.exe +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/runTest_cpp.exe +else + $(RUNTIME) $(BUILDDIR)/runTest_$(GPUSUFFIX).exe +endif -# Target: runCheck (run the C++ standalone executable check.exe, with a small number of events) +# Target: runCheck (run the C++ or CUDA/HIP standalone executable check.exe, with a small number of events) runCheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 - -# Target: runGcheck (run the CUDA/HIP standalone executable gcheck.exe, with a small number of events) -runGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/gcheck.exe -p 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/check_cpp.exe -p 2 32 2 +else + $(RUNTIME) 
$(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +endif -# Target: runFcheck (run the Fortran standalone executable - with C++ MEs - fcheck.exe, with a small number of events) +# Target: runFcheck (run the Fortran standalone executable - with C++ or CUDA/HIP MEs - fcheck.exe, with a small number of events) runFcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 - -# Target: runFGcheck (run the Fortran standalone executable - with CUDA/HIP MEs - fgcheck.exe, with a small number of events) -runFGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 +else + $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 +endif -# Target: cmpFcheck (compare ME results from the C++ and Fortran with C++ MEs standalone executables, with a small number of events) +# Target: cmpFcheck (compare ME results from the C++/CUDA/HIP and Fortran with C++/CUDA/HIP MEs standalone executables, with a small number of events) cmpFcheck: all.$(TAG) @echo - @echo "$(BUILDDIR)/check.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi - -# Target: cmpFGcheck (compare ME results from the CUDA/HIP and Fortran with CUDA/HIP MEs standalone executables, with a small number of events) -cmpFGcheck: all.$(TAG) - @echo - @echo "$(BUILDDIR)/gcheck.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fgcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +ifeq ($(GPUCC),) + @echo "$(BUILDDIR)/check_cpp.exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_cpp.exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_cpp.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +else + @echo "$(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +endif # Target: cuda-memcheck (run the CUDA standalone executable gcheck.exe with a small number of events through cuda-memcheck) cuda-memcheck: all.$(TAG) - $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/gcheck.exe -p 2 32 2 + $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 
#------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile index 22517d0cc6..6a21300f7a 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile @@ -58,13 +58,15 @@ endif LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias CUDACPP_MAKEFILE=cudacpp.mk -CUDACPP_COMMONLIB=mg5amc_common processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ifeq ($(BACKEND),cuda) +CUDACPP_COMMONLIB=mg5amc_common_cuda CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cuda else ifeq ($(BACKEND),hip) +CUDACPP_COMMONLIB=mg5amc_common_hip CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_hip else +CUDACPP_COMMONLIB=mg5amc_common_cpp CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cpp endif diff --git a/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_src.mk index 655d87b749..a66801a71e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_src.mk @@ -98,7 +98,11 @@ endif # NB1: there are no CUDA targets in src as we avoid RDC! # NB2: CUDA includes for curand.h are no longer needed in the C++ code anywhere in src! -MG5AMC_COMMONLIB = mg5amc_common +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif # Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) @@ -126,7 +130,7 @@ $(LIBDIR)/.build.$(TAG): #------------------------------------------------------------------------------- # Generic target and build rules: objects from C++ compilation -$(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -139,22 +143,22 @@ endif #------------------------------------------------------------------------------- -cxx_objects=$(addprefix $(BUILDDIR)/, read_slha.o) -ifneq ($(GPUCC),) - gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_$(GPUSUFFIX).o) +cxx_objects=$(addprefix $(BUILDDIR)/, read_slha_cpp.o) +ifeq ($(GPUCC),) + cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_sm_cpp.o) else - cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_sm.o) + gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_$(GPUSUFFIX).o) endif # Target (and build rules): common (src) library -ifneq ($(GPUCC),) -$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) - @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) -else +ifeq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi $(CXX) -shared -o $@ $(cxx_objects) $(LDFLAGS) +else +$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) + @if [ ! 
-d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi + $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index 102c039911..6b5074a2c1 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005713939666748047  +DEBUG: model prefixing takes 0.005708456039428711  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.956 s +1 processes with 1240 diagrams generated in 1.997 s Total: 1 processes with 1240 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttggg Load PLUGIN.CUDACPP_OUTPUT @@ -178,7 +178,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. 
-Generated helas calls for 1 subprocesses (1240 diagrams) in 6.867 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.860 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -186,7 +186,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.368 s +ALOHA: aloha creates 5 routines in 0.365 s VVV1 VVV1 FFV1 @@ -209,7 +209,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. quit -real 0m14.024s -user 0m13.379s -sys 0m0.108s +real 0m13.606s +user 0m13.435s +sys 0m0.117s Code generation completed in 14 seconds diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk index 50ce6bb9c8..89da34009a 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk @@ -54,65 +54,6 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA/HIP - -INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here - -# Dependency on src directory -MG5AMC_COMMONLIB = mg5amc_common -LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -INCFLAGS += -I../../src - -# Compiler-specific googletest build directory (#125 and #738) -ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) - override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) - override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) - override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else - override CXXNAME = unknown -endif -###$(info CXXNAME=$(CXXNAME)) -override CXXNAMESUFFIX = _$(CXXNAME) - -# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) -export CXXNAMESUFFIX - -# Dependency on test directory -# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) -# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) -###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation -###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation -TESTDIRCOMMON = ../../../../../test -TESTDIRLOCAL = ../../test -ifneq ($(wildcard $(GTEST_ROOT)),) - TESTDIR = -else ifneq ($(LOCALGTEST),) - TESTDIR=$(TESTDIRLOCAL) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) - TESTDIR = $(TESTDIRCOMMON) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else - TESTDIR = -endif -ifneq ($(GTEST_ROOT),) - GTESTLIBDIR = $(GTEST_ROOT)/lib64/ - GTESTLIBS = 
$(GTESTLIBDIR)/libgtest.a - GTESTINC = -I$(GTEST_ROOT)/include -else - GTESTLIBDIR = - GTESTLIBS = - GTESTINC = -endif -###$(info GTEST_ROOT = $(GTEST_ROOT)) -###$(info LOCALGTEST = $(LOCALGTEST)) -###$(info TESTDIR = $(TESTDIR)) - -#------------------------------------------------------------------------------- - #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available: eventually use native instead?) @@ -340,6 +281,69 @@ endif #------------------------------------------------------------------------------- +#=== Configure common compiler flags for C++ and CUDA/HIP + +INCFLAGS = -I. +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here + +# Dependency on src directory +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif +LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +INCFLAGS += -I../../src + +# Compiler-specific googletest build directory (#125 and #738) +ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) + override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) + override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else + override CXXNAME = unknown +endif +###$(info CXXNAME=$(CXXNAME)) +override CXXNAMESUFFIX = _$(CXXNAME) + +# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) +export CXXNAMESUFFIX + +# Dependency on test directory +# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) +# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an 
external or local gtest) +###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation +###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation +TESTDIRCOMMON = ../../../../../test +TESTDIRLOCAL = ../../test +ifneq ($(wildcard $(GTEST_ROOT)),) + TESTDIR = +else ifneq ($(LOCALGTEST),) + TESTDIR=$(TESTDIRLOCAL) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) + TESTDIR = $(TESTDIRCOMMON) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else + TESTDIR = +endif +ifneq ($(GTEST_ROOT),) + GTESTLIBDIR = $(GTEST_ROOT)/lib64/ + GTESTLIBS = $(GTESTLIBDIR)/libgtest.a + GTESTINC = -I$(GTEST_ROOT)/include +else + GTESTLIBDIR = + GTESTLIBS = + GTESTINC = +endif +###$(info GTEST_ROOT = $(GTEST_ROOT)) +###$(info LOCALGTEST = $(LOCALGTEST)) +###$(info TESTDIR = $(TESTDIR)) + +#------------------------------------------------------------------------------- + #=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) @@ -606,27 +610,25 @@ override RUNTIME = #=== Makefile TARGETS and build rules below #=============================================================================== -cxx_main=$(BUILDDIR)/check.exe -fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(GPUCC),) - gpu_main=$(BUILDDIR)/gcheck.exe - fgpu_main=$(BUILDDIR)/fgcheck.exe +ifeq ($(GPUCC),) + cxx_checkmain=$(BUILDDIR)/check_cpp.exe + cxx_fcheckmain=$(BUILDDIR)/fcheck_cpp.exe + cxx_testmain=$(BUILDDIR)/runTest_cpp.exe else - gpu_main= - fgpu_main= + gpu_checkmain=$(BUILDDIR)/check_$(GPUSUFFIX).exe + gpu_fcheckmain=$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe + gpu_testmain=$(BUILDDIR)/runTest_$(GPUSUFFIX).exe endif -testmain=$(BUILDDIR)/runTest.exe - # Explicitly define the default goal (this is not necessary as it is the first target, which is 
implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) # First target (default goal) -ifneq ($(GPUCC),) -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_main) $(fgpu_main) $(if $(GTESTLIBS),$(testmain)) +ifeq ($(GPUCC),) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_checkmain) $(cxx_fcheckmain) $(if $(GTESTLIBS),$(cxx_testmain)) else -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_main) $(fcxx_main) $(if $(GTESTLIBS),$(testmain)) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_checkmain) $(gpu_fcheckmain) $(if $(GTESTLIBS),$(gpu_testmain)) endif # Target (and build options): debug @@ -643,30 +645,30 @@ $(BUILDDIR)/.build.$(TAG): @if [ "$(oldtagsb)" != "" ]; then echo "Cannot build for tag=$(TAG) as old builds exist for other tags:"; echo " $(oldtagsb)"; echo "Please run 'make clean' first\nIf 'make clean' is not enough: run 'make clean USEBUILDDIR=1 AVX=$(AVX) FPTYPE=$(FPTYPE)' or 'make cleanall'"; exit 1; fi @touch $(BUILDDIR)/.build.$(TAG) -# Apply special build flags only to CrossSectionKernel[_$(GPUSUFFIX)].o (no fast math, see #117 and #516) +# Apply special build flags only to CrossSectionKernel_.o (no fast math, see #117 and #516) # Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math endif -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o (NVTX in timermap.h, #679) -$(BUILDDIR)/check_sa.o: CXXFLAGS += 
$(USE_NVTX) $(CUDA_INC) +# Apply special build flags only to check_sa_.o (NVTX in timermap.h, #679) +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o and (Cu|Hip)randRandomNumberKernel[_$(GPUSUFFIX)].o -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(RNDCXXFLAGS) +# Apply special build flags only to check_sa_.o and (Cu|Hip)randRandomNumberKernel_.o +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) ifeq ($(HASCURAND),hasCurand) # curand headers, #679 -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUDA_INC) +$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(CUDA_INC) endif ifeq ($(HASHIPRAND),hasHiprand) # hiprand headers -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(HIP_INC) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(HIP_INC) endif # Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) @@ -679,21 +681,21 @@ endif # Avoid clang warning "overriding '-ffp-contract=fast' option with '-ffp-contract=on'" (#516) # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) -###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -Wno-overriding-t-option ###ifneq ($(GPUCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option ###endif ###endif #### Apply special build flags only to CPPProcess.o (-flto) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += -flto +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += -flto #### Apply special build flags only to CPPProcess.o (AVXFLAGS) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += $(AVXFLAGS) +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += $(AVXFLAGS) # Generic target and build rules: objects from C++ compilation # (NB do not include CUDA_INC here! add it only for NVTX or curand #679) -$(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -718,8 +720,8 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o -cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) @@ -728,8 +730,8 @@ gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/ endif # Target (and build rules): C++ and CUDA/HIP shared libraries -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge.o -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge_cpp.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) @@ -756,63 +758,63 @@ endif #------------------------------------------------------------------------------- # Target (and build rules): C++ and CUDA/HIP standalone executables -$(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o - $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread 
$(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o $(RNDLIBFLAGS) +$(cxx_checkmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_checkmain): $(BUILDDIR)/check_sa_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o + $(CXX) -o $@ $(BUILDDIR)/check_sa_cpp.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o $(RNDLIBFLAGS) ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(gpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(gpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_checkmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_checkmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(gpu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(gpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(gpu_main): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o +$(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) 
$(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif #------------------------------------------------------------------------------- # Generic target and build rules: objects from Fortran compilation -$(BUILDDIR)/%.o : %.f *.inc +$(BUILDDIR)/%_fortran.o : %.f *.inc @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(FC) -I. -c $< -o $@ # Generic target and build rules: objects from Fortran compilation -###$(BUILDDIR)/%.o : %.f *.inc +###$(BUILDDIR)/%_fortran.o : %.f *.inc ### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi ### @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi ### $(FC) -I. -I$(INCDIR) -c $< -o $@ # Target (and build rules): Fortran standalone executables -###$(BUILDDIR)/fcheck_sa.o : $(INCDIR)/fbridge.inc +###$(BUILDDIR)/fcheck_sa_fortran.o : $(INCDIR)/fbridge.inc ifeq ($(UNAME_S),Darwin) -$(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(cxx_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +$(cxx_fcheckmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used 
#802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ else - $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) + $(CXX) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) endif ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fgpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(fgpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_fcheckmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_fcheckmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) -$(fgpu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fgpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fgpu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) +$(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link 
fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else - $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif endif @@ -820,83 +822,94 @@ endif # Target (and build rules): test objects and test executable ifeq ($(GPUCC),) -$(BUILDDIR)/testxxx.o: $(GTESTLIBS) -$(BUILDDIR)/testxxx.o: INCFLAGS += $(GTESTINC) -$(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions +$(BUILDDIR)/testxxx_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testxxx_cpp.o: INCFLAGS += $(GTESTINC) +$(BUILDDIR)/testxxx_cpp.o: testxxx_cc_ref.txt +$(cxx_testmain): $(BUILDDIR)/testxxx_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testxxx_cpp.o # Comment out this line to skip the C++ test of xxx functions else $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions +$(gpu_testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += 
$(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions endif ifneq ($(UNAME_S),Darwin) # Disable testmisc on Darwin (workaround for issue #838) ifeq ($(GPUCC),) -$(BUILDDIR)/testmisc.o: $(GTESTLIBS) -$(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests +$(BUILDDIR)/testmisc_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testmisc_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/testmisc_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testmisc_cpp.o # Comment out this line to skip the C++ miscellaneous tests else $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP miscellaneous tests +$(gpu_testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP miscellaneous tests endif endif ifeq ($(GPUCC),) -$(BUILDDIR)/runTest.o: $(GTESTLIBS) -$(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/runTest.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o +$(BUILDDIR)/runTest_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/runTest_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/runTest_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/runTest_cpp.o else $(BUILDDIR)/runTest_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/runTest_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to 
`__svml_cos4_l9') +$(gpu_testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o endif -$(testmain): $(GTESTLIBS) -$(testmain): INCFLAGS += $(GTESTINC) -$(testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest -###$(testmain): LIBFLAGS += -lgtest_main # no longer necessary since we added main() to testxxx.cc +ifeq ($(GPUCC),) +$(cxx_testmain): $(GTESTLIBS) +$(cxx_testmain): INCFLAGS += $(GTESTINC) +$(cxx_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +else +$(gpu_testmain): $(GTESTLIBS) +$(gpu_testmain): INCFLAGS += $(GTESTINC) +$(gpu_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +endif +ifeq ($(GPUCC),) # if at all, OMP is used only in CXX builds (not in GPU builds) ifneq ($(OMPFLAGS),) ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -$(testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) +$(cxx_testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -$(testmain): LIBFLAGS += -L $(shell 
dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 +$(cxx_testmain): LIBFLAGS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 ###else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -###$(testmain): LIBFLAGS += ???? # OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) +###$(cxx_testmain): LIBFLAGS += ???? # OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) else -$(testmain): LIBFLAGS += -lgomp +$(cxx_testmain): LIBFLAGS += -lgomp +endif endif endif # Test quadmath in testmisc.cc tests for constexpr_math #627 -###$(testmain): LIBFLAGS += -lquadmath +###ifeq ($(GPUCC),) +###$(cxx_testmain): LIBFLAGS += -lquadmath +###else +###$(gpu_testmain): LIBFLAGS += -lquadmath +###endif # Bypass std::filesystem completely to ease portability on LUMI #803 -#ifneq ($(findstring hipcc,$(GPUCC)),) -#$(testmain): LIBFLAGS += -lstdc++fs -#endif +###ifneq ($(findstring hipcc,$(GPUCC)),) +###$(gpu_testmain): LIBFLAGS += -lstdc++fs +###endif -ifeq ($(GPUCC),) # link only runTest.o -$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) +ifeq ($(GPUCC),) # link only runTest_cpp.o +$(cxx_testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest.o and runTest_$(GPUSUFFIX).o) -$(testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) +else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and 
runTest_$(GPUSUFFIX).o) +$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else @@ -1051,51 +1064,50 @@ endif #------------------------------------------------------------------------------- -# Target: check (run the C++ test executable) -# [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -# [FIXME: SHOULD CHANGE THE TARGET NAME "check" THAT HAS NOTHING TO DO WITH "check.exe"] -ifneq ($(GPUCC),) -check: runTest cmpFGcheck -else -check: runTest cmpFcheck -endif +# Target: 'make test' (execute runTest.exe, and compare check.exe with fcheck.exe) +# [NB: THIS IS WHAT IS TESTED IN THE GITHUB CI!] +# [NB: This used to be called 'make check' but the name has been changed as this has nothing to do with 'check.exe'] +test: runTest cmpFcheck -# Target: runTest (run the C++ test executable runTest.exe) +# Target: runTest (run the C++ or CUDA/HIP test executable runTest.exe) runTest: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/runTest.exe +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/runTest_cpp.exe +else + $(RUNTIME) $(BUILDDIR)/runTest_$(GPUSUFFIX).exe +endif -# Target: runCheck (run the C++ standalone executable check.exe, with a small number of events) +# Target: runCheck (run the C++ or CUDA/HIP standalone executable check.exe, with a small number of events) runCheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 - -# Target: runGcheck (run the CUDA/HIP standalone executable gcheck.exe, with a small number of events) -runGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/gcheck.exe -p 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/check_cpp.exe -p 2 32 2 +else + $(RUNTIME) 
$(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +endif -# Target: runFcheck (run the Fortran standalone executable - with C++ MEs - fcheck.exe, with a small number of events) +# Target: runFcheck (run the Fortran standalone executable - with C++ or CUDA/HIP MEs - fcheck.exe, with a small number of events) runFcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 - -# Target: runFGcheck (run the Fortran standalone executable - with CUDA/HIP MEs - fgcheck.exe, with a small number of events) -runFGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 +else + $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 +endif -# Target: cmpFcheck (compare ME results from the C++ and Fortran with C++ MEs standalone executables, with a small number of events) +# Target: cmpFcheck (compare ME results from the C++/CUDA/HIP and Fortran with C++/CUDA/HIP MEs standalone executables, with a small number of events) cmpFcheck: all.$(TAG) @echo - @echo "$(BUILDDIR)/check.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi - -# Target: cmpFGcheck (compare ME results from the CUDA/HIP and Fortran with CUDA/HIP MEs standalone executables, with a small number of events) -cmpFGcheck: all.$(TAG) - @echo - @echo "$(BUILDDIR)/gcheck.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fgcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +ifeq ($(GPUCC),) + @echo "$(BUILDDIR)/check_cpp.exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_cpp.exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_cpp.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +else + @echo "$(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/GPU) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +endif # Target: cuda-memcheck (run the CUDA standalone executable gcheck.exe with a small number of events through cuda-memcheck) cuda-memcheck: all.$(TAG) - $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/gcheck.exe -p 2 32 2 + $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2
#------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk index 655d87b749..a66801a71e 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk @@ -98,7 +98,11 @@ endif # NB1: there are no CUDA targets in src as we avoid RDC! # NB2: CUDA includes for curand.h are no longer needed in the C++ code anywhere in src! -MG5AMC_COMMONLIB = mg5amc_common +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif # Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) @@ -126,7 +130,7 @@ $(LIBDIR)/.build.$(TAG): #------------------------------------------------------------------------------- # Generic target and build rules: objects from C++ compilation -$(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -139,22 +143,22 @@ endif #------------------------------------------------------------------------------- -cxx_objects=$(addprefix $(BUILDDIR)/, read_slha.o) -ifneq ($(GPUCC),) - gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_$(GPUSUFFIX).o) +cxx_objects=$(addprefix $(BUILDDIR)/, read_slha_cpp.o) +ifeq ($(GPUCC),) + cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_sm_cpp.o) else - cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_sm.o) + gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_$(GPUSUFFIX).o) endif # Target (and build rules): common (src) library -ifneq ($(GPUCC),) -$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) - @if [ ! 
-d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) -else +ifeq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi $(CXX) -shared -o $@ $(cxx_objects) $(LDFLAGS) +else +$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) + @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi + $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt index 39a6a70bdb..60a7e04bfe 100644 --- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005742073059082031  +DEBUG: model prefixing takes 0.005821704864501953  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,7 +170,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. 
-8 processes with 40 diagrams generated in 0.081 s +8 processes with 40 diagrams generated in 0.083 s Total: 8 processes with 40 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -201,7 +201,7 @@ INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -218,7 +218,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -233,17 +233,17 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux -Generated helas calls for 2 subprocesses (10 diagrams) in 0.032 s -Wrote files for 32 helas calls in 0.228 s +Generated helas calls for 2 subprocesses (10 diagrams) in 0.033 s +Wrote files for 32 helas calls in 0.236 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.152 s +ALOHA: aloha creates 2 routines in 0.156 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 4 routines in 0.139 s +ALOHA: aloha creates 4 routines in 0.141 s FFV1 FFV1 FFV1 @@ -296,9 +296,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.997s -user 0m1.767s -sys 0m0.219s +real 0m2.028s +user 0m1.809s +sys 0m0.218s Code generation completed in 2 seconds ************************************************************ * * diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk index 50ce6bb9c8..89da34009a 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk @@ -54,65 +54,6 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA/HIP - -INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here - -# Dependency on src directory -MG5AMC_COMMONLIB = mg5amc_common -LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -INCFLAGS += -I../../src - -# Compiler-specific googletest build directory (#125 and #738) -ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) - override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) - override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) - override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else - override CXXNAME = unknown -endif -###$(info CXXNAME=$(CXXNAME)) -override CXXNAMESUFFIX = _$(CXXNAME) - -# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) -export CXXNAMESUFFIX - -# Dependency on test directory -# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) -# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) -###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation -###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation -TESTDIRCOMMON = ../../../../../test -TESTDIRLOCAL = ../../test -ifneq ($(wildcard $(GTEST_ROOT)),) - TESTDIR = -else ifneq ($(LOCALGTEST),) - TESTDIR=$(TESTDIRLOCAL) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) - TESTDIR = $(TESTDIRCOMMON) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else - TESTDIR = -endif -ifneq ($(GTEST_ROOT),) - GTESTLIBDIR = $(GTEST_ROOT)/lib64/ - GTESTLIBS = 
$(GTESTLIBDIR)/libgtest.a - GTESTINC = -I$(GTEST_ROOT)/include -else - GTESTLIBDIR = - GTESTLIBS = - GTESTINC = -endif -###$(info GTEST_ROOT = $(GTEST_ROOT)) -###$(info LOCALGTEST = $(LOCALGTEST)) -###$(info TESTDIR = $(TESTDIR)) - -#------------------------------------------------------------------------------- - #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available: eventually use native instead?) @@ -340,6 +281,69 @@ endif #------------------------------------------------------------------------------- +#=== Configure common compiler flags for C++ and CUDA/HIP + +INCFLAGS = -I. +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here + +# Dependency on src directory +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif +LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +INCFLAGS += -I../../src + +# Compiler-specific googletest build directory (#125 and #738) +ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) + override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) + override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else + override CXXNAME = unknown +endif +###$(info CXXNAME=$(CXXNAME)) +override CXXNAMESUFFIX = _$(CXXNAME) + +# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) +export CXXNAMESUFFIX + +# Dependency on test directory +# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) +# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an 
external or local gtest) +###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation +###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation +TESTDIRCOMMON = ../../../../../test +TESTDIRLOCAL = ../../test +ifneq ($(wildcard $(GTEST_ROOT)),) + TESTDIR = +else ifneq ($(LOCALGTEST),) + TESTDIR=$(TESTDIRLOCAL) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) + TESTDIR = $(TESTDIRCOMMON) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else + TESTDIR = +endif +ifneq ($(GTEST_ROOT),) + GTESTLIBDIR = $(GTEST_ROOT)/lib64/ + GTESTLIBS = $(GTESTLIBDIR)/libgtest.a + GTESTINC = -I$(GTEST_ROOT)/include +else + GTESTLIBDIR = + GTESTLIBS = + GTESTINC = +endif +###$(info GTEST_ROOT = $(GTEST_ROOT)) +###$(info LOCALGTEST = $(LOCALGTEST)) +###$(info TESTDIR = $(TESTDIR)) + +#------------------------------------------------------------------------------- + #=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) @@ -606,27 +610,25 @@ override RUNTIME = #=== Makefile TARGETS and build rules below #=============================================================================== -cxx_main=$(BUILDDIR)/check.exe -fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(GPUCC),) - gpu_main=$(BUILDDIR)/gcheck.exe - fgpu_main=$(BUILDDIR)/fgcheck.exe +ifeq ($(GPUCC),) + cxx_checkmain=$(BUILDDIR)/check_cpp.exe + cxx_fcheckmain=$(BUILDDIR)/fcheck_cpp.exe + cxx_testmain=$(BUILDDIR)/runTest_cpp.exe else - gpu_main= - fgpu_main= + gpu_checkmain=$(BUILDDIR)/check_$(GPUSUFFIX).exe + gpu_fcheckmain=$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe + gpu_testmain=$(BUILDDIR)/runTest_$(GPUSUFFIX).exe endif -testmain=$(BUILDDIR)/runTest.exe - # Explicitly define the default goal (this is not necessary as it is the first target, which is 
implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) # First target (default goal) -ifneq ($(GPUCC),) -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_main) $(fgpu_main) $(if $(GTESTLIBS),$(testmain)) +ifeq ($(GPUCC),) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_checkmain) $(cxx_fcheckmain) $(if $(GTESTLIBS),$(cxx_testmain)) else -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_main) $(fcxx_main) $(if $(GTESTLIBS),$(testmain)) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_checkmain) $(gpu_fcheckmain) $(if $(GTESTLIBS),$(gpu_testmain)) endif # Target (and build options): debug @@ -643,30 +645,30 @@ $(BUILDDIR)/.build.$(TAG): @if [ "$(oldtagsb)" != "" ]; then echo "Cannot build for tag=$(TAG) as old builds exist for other tags:"; echo " $(oldtagsb)"; echo "Please run 'make clean' first\nIf 'make clean' is not enough: run 'make clean USEBUILDDIR=1 AVX=$(AVX) FPTYPE=$(FPTYPE)' or 'make cleanall'"; exit 1; fi @touch $(BUILDDIR)/.build.$(TAG) -# Apply special build flags only to CrossSectionKernel[_$(GPUSUFFIX)].o (no fast math, see #117 and #516) +# Apply special build flags only to CrossSectionKernel_.o (no fast math, see #117 and #516) # Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math endif -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o (NVTX in timermap.h, #679) -$(BUILDDIR)/check_sa.o: CXXFLAGS += 
$(USE_NVTX) $(CUDA_INC) +# Apply special build flags only to check_sa_.o (NVTX in timermap.h, #679) +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o and (Cu|Hip)randRandomNumberKernel[_$(GPUSUFFIX)].o -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(RNDCXXFLAGS) +# Apply special build flags only to check_sa_.o and (Cu|Hip)randRandomNumberKernel_.o +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) ifeq ($(HASCURAND),hasCurand) # curand headers, #679 -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUDA_INC) +$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(CUDA_INC) endif ifeq ($(HASHIPRAND),hasHiprand) # hiprand headers -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(HIP_INC) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(HIP_INC) endif # Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) @@ -679,21 +681,21 @@ endif # Avoid clang warning "overriding '-ffp-contract=fast' option with '-ffp-contract=on'" (#516) # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) -###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -Wno-overriding-t-option ###ifneq ($(GPUCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option ###endif ###endif #### Apply special build flags only to CPPProcess.o (-flto) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += -flto +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += -flto #### Apply special build flags only to CPPProcess.o (AVXFLAGS) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += $(AVXFLAGS) +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += $(AVXFLAGS) # Generic target and build rules: objects from C++ compilation # (NB do not include CUDA_INC here! add it only for NVTX or curand #679) -$(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -718,8 +720,8 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o -cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) @@ -728,8 +730,8 @@ gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/ endif # Target (and build rules): C++ and CUDA/HIP shared libraries -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge.o -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge_cpp.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) @@ -756,63 +758,63 @@ endif #------------------------------------------------------------------------------- # Target (and build rules): C++ and CUDA/HIP standalone executables -$(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o - $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread 
$(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o $(RNDLIBFLAGS) +$(cxx_checkmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_checkmain): $(BUILDDIR)/check_sa_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o + $(CXX) -o $@ $(BUILDDIR)/check_sa_cpp.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o $(RNDLIBFLAGS) ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(gpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(gpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_checkmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_checkmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(gpu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(gpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(gpu_main): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o +$(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) 
$(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif #------------------------------------------------------------------------------- # Generic target and build rules: objects from Fortran compilation -$(BUILDDIR)/%.o : %.f *.inc +$(BUILDDIR)/%_fortran.o : %.f *.inc @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(FC) -I. -c $< -o $@ # Generic target and build rules: objects from Fortran compilation -###$(BUILDDIR)/%.o : %.f *.inc +###$(BUILDDIR)/%_fortran.o : %.f *.inc ### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi ### @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi ### $(FC) -I. -I$(INCDIR) -c $< -o $@ # Target (and build rules): Fortran standalone executables -###$(BUILDDIR)/fcheck_sa.o : $(INCDIR)/fbridge.inc +###$(BUILDDIR)/fcheck_sa_fortran.o : $(INCDIR)/fbridge.inc ifeq ($(UNAME_S),Darwin) -$(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(cxx_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +$(cxx_fcheckmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used 
#802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ else - $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) + $(CXX) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) endif ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fgpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(fgpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_fcheckmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_fcheckmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) -$(fgpu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fgpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fgpu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) +$(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link 
fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else - $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif endif @@ -820,83 +822,94 @@ endif # Target (and build rules): test objects and test executable ifeq ($(GPUCC),) -$(BUILDDIR)/testxxx.o: $(GTESTLIBS) -$(BUILDDIR)/testxxx.o: INCFLAGS += $(GTESTINC) -$(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions +$(BUILDDIR)/testxxx_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testxxx_cpp.o: INCFLAGS += $(GTESTINC) +$(BUILDDIR)/testxxx_cpp.o: testxxx_cc_ref.txt +$(cxx_testmain): $(BUILDDIR)/testxxx_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testxxx_cpp.o # Comment out this line to skip the C++ test of xxx functions else $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions +$(gpu_testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += 
$(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions endif ifneq ($(UNAME_S),Darwin) # Disable testmisc on Darwin (workaround for issue #838) ifeq ($(GPUCC),) -$(BUILDDIR)/testmisc.o: $(GTESTLIBS) -$(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests +$(BUILDDIR)/testmisc_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testmisc_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/testmisc_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testmisc_cpp.o # Comment out this line to skip the C++ miscellaneous tests else $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP miscellaneous tests +$(gpu_testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP miscellaneous tests endif endif ifeq ($(GPUCC),) -$(BUILDDIR)/runTest.o: $(GTESTLIBS) -$(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/runTest.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o +$(BUILDDIR)/runTest_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/runTest_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/runTest_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/runTest_cpp.o else $(BUILDDIR)/runTest_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/runTest_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to 
`__svml_cos4_l9') +$(gpu_testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o endif -$(testmain): $(GTESTLIBS) -$(testmain): INCFLAGS += $(GTESTINC) -$(testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest -###$(testmain): LIBFLAGS += -lgtest_main # no longer necessary since we added main() to testxxx.cc +ifeq ($(GPUCC),) +$(cxx_testmain): $(GTESTLIBS) +$(cxx_testmain): INCFLAGS += $(GTESTINC) +$(cxx_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +else +$(gpu_testmain): $(GTESTLIBS) +$(gpu_testmain): INCFLAGS += $(GTESTINC) +$(gpu_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +endif +ifeq ($(GPUCC),) # if at all, OMP is used only in CXX builds (not in GPU builds) ifneq ($(OMPFLAGS),) ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -$(testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) +$(cxx_testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -$(testmain): LIBFLAGS += -L $(shell 
dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 +$(cxx_testmain): LIBFLAGS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 ###else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -###$(testmain): LIBFLAGS += ???? # OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) +###$(cxx_testmain): LIBFLAGS += ???? # OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) else -$(testmain): LIBFLAGS += -lgomp +$(cxx_testmain): LIBFLAGS += -lgomp +endif endif endif # Test quadmath in testmisc.cc tests for constexpr_math #627 -###$(testmain): LIBFLAGS += -lquadmath +###ifeq ($(GPUCC),) +###$(cxx_testmain): LIBFLAGS += -lquadmath +###else +###$(gpu_testmain): LIBFLAGS += -lquadmath +###endif # Bypass std::filesystem completely to ease portability on LUMI #803 -#ifneq ($(findstring hipcc,$(GPUCC)),) -#$(testmain): LIBFLAGS += -lstdc++fs -#endif +###ifneq ($(findstring hipcc,$(GPUCC)),) +###$(gpu_testmain): LIBFLAGS += -lstdc++fs +###endif -ifeq ($(GPUCC),) # link only runTest.o -$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) +ifeq ($(GPUCC),) # link only runTest_cpp.o +$(cxx_testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest.o and runTest_$(GPUSUFFIX).o) -$(testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) +else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and 
runTest_$(GPUSUFFIX).o) +$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else @@ -1051,51 +1064,50 @@ endif #------------------------------------------------------------------------------- -# Target: check (run the C++ test executable) -# [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -# [FIXME: SHOULD CHANGE THE TARGET NAME "check" THAT HAS NOTHING TO DO WITH "check.exe"] -ifneq ($(GPUCC),) -check: runTest cmpFGcheck -else -check: runTest cmpFcheck -endif +# Target: 'make test' (execute runTest.exe, and compare check.exe with fcheck.exe) +# [NB: THIS IS WHAT IS TESTED IN THE GITHUB CI!] +# [NB: This used to be called 'make check' but the name has been changed as this has nothing to do with 'check.exe'] +test: runTest cmpFcheck -# Target: runTest (run the C++ test executable runTest.exe) +# Target: runTest (run the C++ or CUDA/HIP test executable runTest.exe) runTest: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/runTest.exe +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/runTest_cpp.exe +else + $(RUNTIME) $(BUILDDIR)/runTest_$(GPUSUFFIX).exe +endif -# Target: runCheck (run the C++ standalone executable check.exe, with a small number of events) +# Target: runCheck (run the C++ or CUDA/HIP standalone executable check.exe, with a small number of events) runCheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 - -# Target: runGcheck (run the CUDA/HIP standalone executable gcheck.exe, with a small number of events) -runGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/gcheck.exe -p 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/check_cpp.exe -p 2 32 2 +else + $(RUNTIME) 
$(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +endif -# Target: runFcheck (run the Fortran standalone executable - with C++ MEs - fcheck.exe, with a small number of events) +# Target: runFcheck (run the Fortran standalone executable - with C++ or CUDA/HIP MEs - fcheck.exe, with a small number of events) runFcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 - -# Target: runFGcheck (run the Fortran standalone executable - with CUDA/HIP MEs - fgcheck.exe, with a small number of events) -runFGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 +else + $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 +endif -# Target: cmpFcheck (compare ME results from the C++ and Fortran with C++ MEs standalone executables, with a small number of events) +# Target: cmpFcheck (compare ME results from the C++/CUDA/HIP and Fortran with C++/CUDA/HIP MEs standalone executables, with a small number of events) cmpFcheck: all.$(TAG) @echo - @echo "$(BUILDDIR)/check.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi - -# Target: cmpFGcheck (compare ME results from the CUDA/HIP and Fortran with CUDA/HIP MEs standalone executables, with a small number of events) -cmpFGcheck: all.$(TAG) - @echo - @echo "$(BUILDDIR)/gcheck.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fgcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +ifeq ($(GPUCC),) + @echo "$(BUILDDIR)/check_cpp.exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_cpp.exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_cpp.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +else + @echo "$(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +endif # Target: cuda-memcheck (run the CUDA standalone executable gcheck.exe with a small number of events through cuda-memcheck) cuda-memcheck: all.$(TAG) - $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/gcheck.exe -p 2 32 2 + $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 
#------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/makefile b/epochX/cudacpp/gq_ttq.mad/SubProcesses/makefile index 22517d0cc6..6a21300f7a 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/makefile +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/makefile @@ -58,13 +58,15 @@ endif LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias CUDACPP_MAKEFILE=cudacpp.mk -CUDACPP_COMMONLIB=mg5amc_common processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ifeq ($(BACKEND),cuda) +CUDACPP_COMMONLIB=mg5amc_common_cuda CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cuda else ifeq ($(BACKEND),hip) +CUDACPP_COMMONLIB=mg5amc_common_hip CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_hip else +CUDACPP_COMMONLIB=mg5amc_common_cpp CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cpp endif diff --git a/epochX/cudacpp/gq_ttq.mad/src/cudacpp_src.mk b/epochX/cudacpp/gq_ttq.mad/src/cudacpp_src.mk index 655d87b749..a66801a71e 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gq_ttq.mad/src/cudacpp_src.mk @@ -98,7 +98,11 @@ endif # NB1: there are no CUDA targets in src as we avoid RDC! # NB2: CUDA includes for curand.h are no longer needed in the C++ code anywhere in src! -MG5AMC_COMMONLIB = mg5amc_common +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif # Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) @@ -126,7 +130,7 @@ $(LIBDIR)/.build.$(TAG): #------------------------------------------------------------------------------- # Generic target and build rules: objects from C++ compilation -$(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -139,22 +143,22 @@ endif #------------------------------------------------------------------------------- -cxx_objects=$(addprefix $(BUILDDIR)/, read_slha.o) -ifneq ($(GPUCC),) - gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_$(GPUSUFFIX).o) +cxx_objects=$(addprefix $(BUILDDIR)/, read_slha_cpp.o) +ifeq ($(GPUCC),) + cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_sm_cpp.o) else - cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_sm.o) + gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_$(GPUSUFFIX).o) endif # Target (and build rules): common (src) library -ifneq ($(GPUCC),) -$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) - @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) -else +ifeq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi $(CXX) -shared -o $@ $(cxx_objects) $(LDFLAGS) +else +$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) + @if [ ! 
-d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi + $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt index 87d22b379f..c9b53c9d92 100644 --- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0056705474853515625  +DEBUG: model prefixing takes 0.0056972503662109375  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -215,7 +215,7 @@ Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.149 s +ALOHA: aloha creates 2 routines in 0.151 s FFV1 FFV1 FFV1 @@ -231,7 +231,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. 
quit -real 0m0.666s -user 0m0.617s -sys 0m0.044s +real 0m0.671s +user 0m0.612s +sys 0m0.054s Code generation completed in 0 seconds diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk index 50ce6bb9c8..89da34009a 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk @@ -54,65 +54,6 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA/HIP - -INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here - -# Dependency on src directory -MG5AMC_COMMONLIB = mg5amc_common -LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -INCFLAGS += -I../../src - -# Compiler-specific googletest build directory (#125 and #738) -ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) - override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) - override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) - override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else - override CXXNAME = unknown -endif -###$(info CXXNAME=$(CXXNAME)) -override CXXNAMESUFFIX = _$(CXXNAME) - -# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) -export CXXNAMESUFFIX - -# Dependency on test directory -# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) -# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) -###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation -###LOCALGTEST = yes# comment this out 
(or use make LOCALGTEST=yes) to build tests using a local gtest installation -TESTDIRCOMMON = ../../../../../test -TESTDIRLOCAL = ../../test -ifneq ($(wildcard $(GTEST_ROOT)),) - TESTDIR = -else ifneq ($(LOCALGTEST),) - TESTDIR=$(TESTDIRLOCAL) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) - TESTDIR = $(TESTDIRCOMMON) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else - TESTDIR = -endif -ifneq ($(GTEST_ROOT),) - GTESTLIBDIR = $(GTEST_ROOT)/lib64/ - GTESTLIBS = $(GTESTLIBDIR)/libgtest.a - GTESTINC = -I$(GTEST_ROOT)/include -else - GTESTLIBDIR = - GTESTLIBS = - GTESTINC = -endif -###$(info GTEST_ROOT = $(GTEST_ROOT)) -###$(info LOCALGTEST = $(LOCALGTEST)) -###$(info TESTDIR = $(TESTDIR)) - -#------------------------------------------------------------------------------- - #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available: eventually use native instead?) @@ -340,6 +281,69 @@ endif #------------------------------------------------------------------------------- +#=== Configure common compiler flags for C++ and CUDA/HIP + +INCFLAGS = -I. 
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here + +# Dependency on src directory +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif +LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +INCFLAGS += -I../../src + +# Compiler-specific googletest build directory (#125 and #738) +ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) + override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) + override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else + override CXXNAME = unknown +endif +###$(info CXXNAME=$(CXXNAME)) +override CXXNAMESUFFIX = _$(CXXNAME) + +# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) +export CXXNAMESUFFIX + +# Dependency on test directory +# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) +# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) +###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation +###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation +TESTDIRCOMMON = ../../../../../test +TESTDIRLOCAL = ../../test +ifneq ($(wildcard $(GTEST_ROOT)),) + TESTDIR = +else ifneq ($(LOCALGTEST),) + TESTDIR=$(TESTDIRLOCAL) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) + TESTDIR = $(TESTDIRCOMMON) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else + TESTDIR = +endif +ifneq 
($(GTEST_ROOT),) + GTESTLIBDIR = $(GTEST_ROOT)/lib64/ + GTESTLIBS = $(GTESTLIBDIR)/libgtest.a + GTESTINC = -I$(GTEST_ROOT)/include +else + GTESTLIBDIR = + GTESTLIBS = + GTESTINC = +endif +###$(info GTEST_ROOT = $(GTEST_ROOT)) +###$(info LOCALGTEST = $(LOCALGTEST)) +###$(info TESTDIR = $(TESTDIR)) + +#------------------------------------------------------------------------------- + #=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) @@ -606,27 +610,25 @@ override RUNTIME = #=== Makefile TARGETS and build rules below #=============================================================================== -cxx_main=$(BUILDDIR)/check.exe -fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(GPUCC),) - gpu_main=$(BUILDDIR)/gcheck.exe - fgpu_main=$(BUILDDIR)/fgcheck.exe +ifeq ($(GPUCC),) + cxx_checkmain=$(BUILDDIR)/check_cpp.exe + cxx_fcheckmain=$(BUILDDIR)/fcheck_cpp.exe + cxx_testmain=$(BUILDDIR)/runTest_cpp.exe else - gpu_main= - fgpu_main= + gpu_checkmain=$(BUILDDIR)/check_$(GPUSUFFIX).exe + gpu_fcheckmain=$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe + gpu_testmain=$(BUILDDIR)/runTest_$(GPUSUFFIX).exe endif -testmain=$(BUILDDIR)/runTest.exe - # Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) # First target (default goal) -ifneq ($(GPUCC),) -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_main) $(fgpu_main) $(if $(GTESTLIBS),$(testmain)) +ifeq ($(GPUCC),) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_checkmain) $(cxx_fcheckmain) $(if $(GTESTLIBS),$(cxx_testmain)) else -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_main) $(fcxx_main) $(if $(GTESTLIBS),$(testmain)) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_checkmain) $(gpu_fcheckmain) $(if $(GTESTLIBS),$(gpu_testmain)) endif # Target (and 
build options): debug @@ -643,30 +645,30 @@ $(BUILDDIR)/.build.$(TAG): @if [ "$(oldtagsb)" != "" ]; then echo "Cannot build for tag=$(TAG) as old builds exist for other tags:"; echo " $(oldtagsb)"; echo "Please run 'make clean' first\nIf 'make clean' is not enough: run 'make clean USEBUILDDIR=1 AVX=$(AVX) FPTYPE=$(FPTYPE)' or 'make cleanall'"; exit 1; fi @touch $(BUILDDIR)/.build.$(TAG) -# Apply special build flags only to CrossSectionKernel[_$(GPUSUFFIX)].o (no fast math, see #117 and #516) +# Apply special build flags only to CrossSectionKernel_.o (no fast math, see #117 and #516) # Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math endif -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o (NVTX in timermap.h, #679) -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) +# Apply special build flags only to check_sa_.o (NVTX in timermap.h, #679) +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o and (Cu|Hip)randRandomNumberKernel[_$(GPUSUFFIX)].o -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(RNDCXXFLAGS) +# Apply special build flags only to check_sa_.o and (Cu|Hip)randRandomNumberKernel_.o +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) 
+$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) ifeq ($(HASCURAND),hasCurand) # curand headers, #679 -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUDA_INC) +$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(CUDA_INC) endif ifeq ($(HASHIPRAND),hasHiprand) # hiprand headers -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(HIP_INC) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(HIP_INC) endif # Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) @@ -679,21 +681,21 @@ endif # Avoid clang warning "overriding '-ffp-contract=fast' option with '-ffp-contract=on'" (#516) # This patch does remove the warning, but I prefer to keep it disabled for the moment... ###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) -###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -Wno-overriding-t-option ###ifneq ($(GPUCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option ###endif ###endif #### Apply special build flags only to CPPProcess.o (-flto) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += -flto +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += -flto #### Apply special build flags only to CPPProcess.o (AVXFLAGS) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += $(AVXFLAGS) +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += $(AVXFLAGS) # Generic target and build rules: objects from C++ compilation # (NB do not include CUDA_INC here! 
add it only for NVTX or curand #679) -$(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -718,8 +720,8 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o -cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) @@ -728,8 +730,8 @@ gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/ endif # Target (and build rules): C++ and CUDA/HIP shared libraries -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge.o -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge_cpp.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) @@ -756,63 +758,63 @@ endif #------------------------------------------------------------------------------- # Target (and build rules): C++ and CUDA/HIP standalone executables -$(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cxx_main): $(BUILDDIR)/check_sa.o 
$(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o - $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o $(RNDLIBFLAGS) +$(cxx_checkmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_checkmain): $(BUILDDIR)/check_sa_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o + $(CXX) -o $@ $(BUILDDIR)/check_sa_cpp.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o $(RNDLIBFLAGS) ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(gpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(gpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_checkmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_checkmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(gpu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(gpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(gpu_main): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o 
+$(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif #------------------------------------------------------------------------------- # Generic target and build rules: objects from Fortran compilation -$(BUILDDIR)/%.o : %.f *.inc +$(BUILDDIR)/%_fortran.o : %.f *.inc @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(FC) -I. -c $< -o $@ # Generic target and build rules: objects from Fortran compilation -###$(BUILDDIR)/%.o : %.f *.inc +###$(BUILDDIR)/%_fortran.o : %.f *.inc ### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi ### @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi ### $(FC) -I. 
-I$(INCDIR) -c $< -o $@ # Target (and build rules): Fortran standalone executables -###$(BUILDDIR)/fcheck_sa.o : $(INCDIR)/fbridge.inc +###$(BUILDDIR)/fcheck_sa_fortran.o : $(INCDIR)/fbridge.inc ifeq ($(UNAME_S),Darwin) -$(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(cxx_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +$(cxx_fcheckmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ else - $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) + $(CXX) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) endif ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fgpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(fgpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_fcheckmain): LIBFLAGS += -lintlc # 
compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_fcheckmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) -$(fgpu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fgpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fgpu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) +$(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else - $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif endif @@ -820,83 +822,94 @@ endif # Target (and build rules): test objects and test executable ifeq ($(GPUCC),) -$(BUILDDIR)/testxxx.o: 
$(GTESTLIBS) -$(BUILDDIR)/testxxx.o: INCFLAGS += $(GTESTINC) -$(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions +$(BUILDDIR)/testxxx_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testxxx_cpp.o: INCFLAGS += $(GTESTINC) +$(BUILDDIR)/testxxx_cpp.o: testxxx_cc_ref.txt +$(cxx_testmain): $(BUILDDIR)/testxxx_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testxxx_cpp.o # Comment out this line to skip the C++ test of xxx functions else $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions +$(gpu_testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions endif ifneq ($(UNAME_S),Darwin) # Disable testmisc on Darwin (workaround for issue #838) ifeq ($(GPUCC),) -$(BUILDDIR)/testmisc.o: $(GTESTLIBS) -$(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests +$(BUILDDIR)/testmisc_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testmisc_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/testmisc_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testmisc_cpp.o # Comment out this line to skip the C++ miscellaneous tests else $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the 
CUDA/HIP miscellaneous tests +$(gpu_testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP miscellaneous tests endif endif ifeq ($(GPUCC),) -$(BUILDDIR)/runTest.o: $(GTESTLIBS) -$(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/runTest.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o +$(BUILDDIR)/runTest_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/runTest_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/runTest_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/runTest_cpp.o else $(BUILDDIR)/runTest_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/runTest_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o endif -$(testmain): $(GTESTLIBS) -$(testmain): INCFLAGS += $(GTESTINC) -$(testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest -###$(testmain): LIBFLAGS += -lgtest_main # no longer necessary 
since we added main() to testxxx.cc +ifeq ($(GPUCC),) +$(cxx_testmain): $(GTESTLIBS) +$(cxx_testmain): INCFLAGS += $(GTESTINC) +$(cxx_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +else +$(gpu_testmain): $(GTESTLIBS) +$(gpu_testmain): INCFLAGS += $(GTESTINC) +$(gpu_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +endif +ifeq ($(GPUCC),) # if at all, OMP is used only in CXX builds (not in GPU builds) ifneq ($(OMPFLAGS),) ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -$(testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) +$(cxx_testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -$(testmain): LIBFLAGS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 +$(cxx_testmain): LIBFLAGS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 ###else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -###$(testmain): LIBFLAGS += ???? # OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) +###$(cxx_testmain): LIBFLAGS += ???? 
# OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) else -$(testmain): LIBFLAGS += -lgomp +$(cxx_testmain): LIBFLAGS += -lgomp +endif endif endif # Test quadmath in testmisc.cc tests for constexpr_math #627 -###$(testmain): LIBFLAGS += -lquadmath +###ifeq ($(GPUCC),) +###$(cxx_testmain): LIBFLAGS += -lquadmath +###else +###$(gpu_testmain): LIBFLAGS += -lquadmath +###endif # Bypass std::filesystem completely to ease portability on LUMI #803 -#ifneq ($(findstring hipcc,$(GPUCC)),) -#$(testmain): LIBFLAGS += -lstdc++fs -#endif +###ifneq ($(findstring hipcc,$(GPUCC)),) +###$(gpu_testmain): LIBFLAGS += -lstdc++fs +###endif -ifeq ($(GPUCC),) # link only runTest.o -$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) +ifeq ($(GPUCC),) # link only runTest_cpp.o +$(cxx_testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest.o and runTest_$(GPUSUFFIX).o) -$(testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) +else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) +$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell 
dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else @@ -1051,51 +1064,50 @@ endif #------------------------------------------------------------------------------- -# Target: check (run the C++ test executable) -# [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -# [FIXME: SHOULD CHANGE THE TARGET NAME "check" THAT HAS NOTHING TO DO WITH "check.exe"] -ifneq ($(GPUCC),) -check: runTest cmpFGcheck -else -check: runTest cmpFcheck -endif +# Target: 'make test' (execute runTest.exe, and compare check.exe with fcheck.exe) +# [NB: THIS IS WHAT IS TESTED IN THE GITHUB CI!] +# [NB: This used to be called 'make check' but the name has been changed as this has nothing to do with 'check.exe'] +test: runTest cmpFcheck -# Target: runTest (run the C++ test executable runTest.exe) +# Target: runTest (run the C++ or CUDA/HIP test executable runTest.exe) runTest: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/runTest.exe +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/runTest_cpp.exe +else + $(RUNTIME) $(BUILDDIR)/runTest_$(GPUSUFFIX).exe +endif -# Target: runCheck (run the C++ standalone executable check.exe, with a small number of events) +# Target: runCheck (run the C++ or CUDA/HIP standalone executable check.exe, with a small number of events) runCheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 - -# Target: runGcheck (run the CUDA/HIP standalone executable gcheck.exe, with a small number of events) -runGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/gcheck.exe -p 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/check_cpp.exe -p 2 32 2 +else + $(RUNTIME) $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +endif -# Target: runFcheck (run the Fortran standalone executable - with C++ MEs - fcheck.exe, with a small number of events) +# Target: runFcheck (run the Fortran standalone executable - with C++ or CUDA/HIP MEs - fcheck.exe, with a small number of events) runFcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 - -# Target: runFGcheck (run the Fortran standalone 
executable - with CUDA/HIP MEs - fgcheck.exe, with a small number of events) -runFGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 +else + $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 +endif -# Target: cmpFcheck (compare ME results from the C++ and Fortran with C++ MEs standalone executables, with a small number of events) +# Target: cmpFcheck (compare ME results from the C++/CUDA/HIP and Fortran with C++/CUDA/HIP MEs standalone executables, with a small number of events) cmpFcheck: all.$(TAG) @echo - @echo "$(BUILDDIR)/check.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi - -# Target: cmpFGcheck (compare ME results from the CUDA/HIP and Fortran with CUDA/HIP MEs standalone executables, with a small number of events) -cmpFGcheck: all.$(TAG) - @echo - @echo "$(BUILDDIR)/gcheck.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fgcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! 
Fortran calculation (F77/GPU) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +ifeq ($(GPUCC),) + @echo "$(BUILDDIR)/check_cpp.exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_cpp.exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_cpp.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +else + @echo "$(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +endif # Target: cuda-memcheck (run the CUDA standalone executable gcheck.exe with a small number of events through cuda-memcheck) cuda-memcheck: all.$(TAG) - $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/gcheck.exe -p 2 32 2 + $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gq_ttq.sa/src/cudacpp_src.mk b/epochX/cudacpp/gq_ttq.sa/src/cudacpp_src.mk index 655d87b749..a66801a71e 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gq_ttq.sa/src/cudacpp_src.mk @@ -98,7 +98,11 @@ endif # NB1: there are no CUDA targets in src as we avoid RDC! # NB2: CUDA includes for curand.h are no longer needed in the C++ code anywhere in src! 
-MG5AMC_COMMONLIB = mg5amc_common +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif # Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) @@ -126,7 +130,7 @@ $(LIBDIR)/.build.$(TAG): #------------------------------------------------------------------------------- # Generic target and build rules: objects from C++ compilation -$(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -139,22 +143,22 @@ endif #------------------------------------------------------------------------------- -cxx_objects=$(addprefix $(BUILDDIR)/, read_slha.o) -ifneq ($(GPUCC),) - gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_$(GPUSUFFIX).o) +cxx_objects=$(addprefix $(BUILDDIR)/, read_slha_cpp.o) +ifeq ($(GPUCC),) + cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_sm_cpp.o) else - cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_sm.o) + gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_$(GPUSUFFIX).o) endif # Target (and build rules): common (src) library -ifneq ($(GPUCC),) -$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) - @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) -else +ifeq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi $(CXX) -shared -o $@ $(cxx_objects) $(LDFLAGS) +else +$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) + @if [ ! 
-d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi + $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt b/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt index 451b95c9d8..703d24d998 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt +++ b/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt @@ -150,7 +150,7 @@ INFO: Generating Helas calls for process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Processing color information for process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Creating files in directory P1_gg_bbx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -166,20 +166,20 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Finding symmetric diagrams for subprocess group gg_bbx Generated helas calls for 1 subprocesses (4 diagrams) in 0.009 s -Wrote files for 12 helas calls in 0.109 s +Wrote files for 12 helas calls in 0.110 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 4 routines in 0.272 s +ALOHA: aloha creates 4 routines in 0.277 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 8 routines in 0.255 s +ALOHA: aloha creates 8 routines in 0.262 s VVS3 VVV1 FFV1 @@ -217,9 +217,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.056s -user 0m1.780s -sys 0m0.210s +real 0m2.061s +user 0m1.786s +sys 0m0.232s Code generation completed in 2 seconds ************************************************************ * * diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk index 50ce6bb9c8..89da34009a 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk @@ -54,65 +54,6 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA/HIP - -INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here - -# Dependency on src directory -MG5AMC_COMMONLIB = mg5amc_common -LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -INCFLAGS += -I../../src - -# Compiler-specific googletest build directory (#125 and #738) -ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) - override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) - override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) - override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else - override CXXNAME = unknown -endif -###$(info CXXNAME=$(CXXNAME)) -override CXXNAMESUFFIX = _$(CXXNAME) - -# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) -export CXXNAMESUFFIX - -# Dependency on test directory -# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) -# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) -###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation -###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation -TESTDIRCOMMON = ../../../../../test -TESTDIRLOCAL = ../../test -ifneq ($(wildcard $(GTEST_ROOT)),) - TESTDIR = -else ifneq ($(LOCALGTEST),) - TESTDIR=$(TESTDIRLOCAL) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) - TESTDIR = $(TESTDIRCOMMON) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else - TESTDIR = -endif -ifneq ($(GTEST_ROOT),) - GTESTLIBDIR = $(GTEST_ROOT)/lib64/ - GTESTLIBS = 
$(GTESTLIBDIR)/libgtest.a - GTESTINC = -I$(GTEST_ROOT)/include -else - GTESTLIBDIR = - GTESTLIBS = - GTESTINC = -endif -###$(info GTEST_ROOT = $(GTEST_ROOT)) -###$(info LOCALGTEST = $(LOCALGTEST)) -###$(info TESTDIR = $(TESTDIR)) - -#------------------------------------------------------------------------------- - #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available: eventually use native instead?) @@ -340,6 +281,69 @@ endif #------------------------------------------------------------------------------- +#=== Configure common compiler flags for C++ and CUDA/HIP + +INCFLAGS = -I. +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here + +# Dependency on src directory +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif +LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +INCFLAGS += -I../../src + +# Compiler-specific googletest build directory (#125 and #738) +ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) + override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) + override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else + override CXXNAME = unknown +endif +###$(info CXXNAME=$(CXXNAME)) +override CXXNAMESUFFIX = _$(CXXNAME) + +# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) +export CXXNAMESUFFIX + +# Dependency on test directory +# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) +# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an 
external or local gtest) +###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation +###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation +TESTDIRCOMMON = ../../../../../test +TESTDIRLOCAL = ../../test +ifneq ($(wildcard $(GTEST_ROOT)),) + TESTDIR = +else ifneq ($(LOCALGTEST),) + TESTDIR=$(TESTDIRLOCAL) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) + TESTDIR = $(TESTDIRCOMMON) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else + TESTDIR = +endif +ifneq ($(GTEST_ROOT),) + GTESTLIBDIR = $(GTEST_ROOT)/lib64/ + GTESTLIBS = $(GTESTLIBDIR)/libgtest.a + GTESTINC = -I$(GTEST_ROOT)/include +else + GTESTLIBDIR = + GTESTLIBS = + GTESTINC = +endif +###$(info GTEST_ROOT = $(GTEST_ROOT)) +###$(info LOCALGTEST = $(LOCALGTEST)) +###$(info TESTDIR = $(TESTDIR)) + +#------------------------------------------------------------------------------- + #=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) @@ -606,27 +610,25 @@ override RUNTIME = #=== Makefile TARGETS and build rules below #=============================================================================== -cxx_main=$(BUILDDIR)/check.exe -fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(GPUCC),) - gpu_main=$(BUILDDIR)/gcheck.exe - fgpu_main=$(BUILDDIR)/fgcheck.exe +ifeq ($(GPUCC),) + cxx_checkmain=$(BUILDDIR)/check_cpp.exe + cxx_fcheckmain=$(BUILDDIR)/fcheck_cpp.exe + cxx_testmain=$(BUILDDIR)/runTest_cpp.exe else - gpu_main= - fgpu_main= + gpu_checkmain=$(BUILDDIR)/check_$(GPUSUFFIX).exe + gpu_fcheckmain=$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe + gpu_testmain=$(BUILDDIR)/runTest_$(GPUSUFFIX).exe endif -testmain=$(BUILDDIR)/runTest.exe - # Explicitly define the default goal (this is not necessary as it is the first target, which is 
implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) # First target (default goal) -ifneq ($(GPUCC),) -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_main) $(fgpu_main) $(if $(GTESTLIBS),$(testmain)) +ifeq ($(GPUCC),) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_checkmain) $(cxx_fcheckmain) $(if $(GTESTLIBS),$(cxx_testmain)) else -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_main) $(fcxx_main) $(if $(GTESTLIBS),$(testmain)) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_checkmain) $(gpu_fcheckmain) $(if $(GTESTLIBS),$(gpu_testmain)) endif # Target (and build options): debug @@ -643,30 +645,30 @@ $(BUILDDIR)/.build.$(TAG): @if [ "$(oldtagsb)" != "" ]; then echo "Cannot build for tag=$(TAG) as old builds exist for other tags:"; echo " $(oldtagsb)"; echo "Please run 'make clean' first\nIf 'make clean' is not enough: run 'make clean USEBUILDDIR=1 AVX=$(AVX) FPTYPE=$(FPTYPE)' or 'make cleanall'"; exit 1; fi @touch $(BUILDDIR)/.build.$(TAG) -# Apply special build flags only to CrossSectionKernel[_$(GPUSUFFIX)].o (no fast math, see #117 and #516) +# Apply special build flags only to CrossSectionKernel_<cpp|$(GPUSUFFIX)>.o (no fast math, see #117 and #516) # Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math endif -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o (NVTX in timermap.h, #679) -$(BUILDDIR)/check_sa.o: CXXFLAGS += 
$(USE_NVTX) $(CUDA_INC) +# Apply special build flags only to check_sa_<cpp|$(GPUSUFFIX)>.o (NVTX in timermap.h, #679) +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o and (Cu|Hip)randRandomNumberKernel[_$(GPUSUFFIX)].o -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(RNDCXXFLAGS) +# Apply special build flags only to check_sa_<cpp|$(GPUSUFFIX)>.o and (Cu|Hip)randRandomNumberKernel_<cpp|$(GPUSUFFIX)>.o +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) ifeq ($(HASCURAND),hasCurand) # curand headers, #679 -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUDA_INC) +$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(CUDA_INC) endif ifeq ($(HASHIPRAND),hasHiprand) # hiprand headers -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(HIP_INC) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(HIP_INC) endif # Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) @@ -679,21 +681,21 @@ endif # Avoid clang warning "overriding '-ffp-contract=fast' option with '-ffp-contract=on'" (#516) # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) -###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -Wno-overriding-t-option ###ifneq ($(GPUCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option ###endif ###endif #### Apply special build flags only to CPPProcess.o (-flto) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += -flto +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += -flto #### Apply special build flags only to CPPProcess.o (AVXFLAGS) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += $(AVXFLAGS) +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += $(AVXFLAGS) # Generic target and build rules: objects from C++ compilation # (NB do not include CUDA_INC here! add it only for NVTX or curand #679) -$(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -718,8 +720,8 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o -cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) @@ -728,8 +730,8 @@ gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/ endif # Target (and build rules): C++ and CUDA/HIP shared libraries -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge.o -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge_cpp.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) @@ -756,63 +758,63 @@ endif #------------------------------------------------------------------------------- # Target (and build rules): C++ and CUDA/HIP standalone executables -$(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o - $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread 
$(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o $(RNDLIBFLAGS) +$(cxx_checkmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_checkmain): $(BUILDDIR)/check_sa_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o + $(CXX) -o $@ $(BUILDDIR)/check_sa_cpp.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o $(RNDLIBFLAGS) ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(gpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(gpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_checkmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_checkmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(gpu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(gpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(gpu_main): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o +$(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) 
$(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif #------------------------------------------------------------------------------- # Generic target and build rules: objects from Fortran compilation -$(BUILDDIR)/%.o : %.f *.inc +$(BUILDDIR)/%_fortran.o : %.f *.inc @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(FC) -I. -c $< -o $@ # Generic target and build rules: objects from Fortran compilation -###$(BUILDDIR)/%.o : %.f *.inc +###$(BUILDDIR)/%_fortran.o : %.f *.inc ### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi ### @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi ### $(FC) -I. -I$(INCDIR) -c $< -o $@ # Target (and build rules): Fortran standalone executables -###$(BUILDDIR)/fcheck_sa.o : $(INCDIR)/fbridge.inc +###$(BUILDDIR)/fcheck_sa_fortran.o : $(INCDIR)/fbridge.inc ifeq ($(UNAME_S),Darwin) -$(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(cxx_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +$(cxx_fcheckmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used 
#802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ else - $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) + $(CXX) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) endif ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fgpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(fgpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_fcheckmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_fcheckmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) -$(fgpu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fgpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fgpu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) +$(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link 
fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else - $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif endif @@ -820,83 +822,94 @@ endif # Target (and build rules): test objects and test executable ifeq ($(GPUCC),) -$(BUILDDIR)/testxxx.o: $(GTESTLIBS) -$(BUILDDIR)/testxxx.o: INCFLAGS += $(GTESTINC) -$(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions +$(BUILDDIR)/testxxx_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testxxx_cpp.o: INCFLAGS += $(GTESTINC) +$(BUILDDIR)/testxxx_cpp.o: testxxx_cc_ref.txt +$(cxx_testmain): $(BUILDDIR)/testxxx_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testxxx_cpp.o # Comment out this line to skip the C++ test of xxx functions else $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions +$(gpu_testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += 
$(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions endif ifneq ($(UNAME_S),Darwin) # Disable testmisc on Darwin (workaround for issue #838) ifeq ($(GPUCC),) -$(BUILDDIR)/testmisc.o: $(GTESTLIBS) -$(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests +$(BUILDDIR)/testmisc_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testmisc_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/testmisc_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testmisc_cpp.o # Comment out this line to skip the C++ miscellaneous tests else $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP miscellaneous tests +$(gpu_testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP miscellaneous tests endif endif ifeq ($(GPUCC),) -$(BUILDDIR)/runTest.o: $(GTESTLIBS) -$(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/runTest.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o +$(BUILDDIR)/runTest_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/runTest_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/runTest_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/runTest_cpp.o else $(BUILDDIR)/runTest_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/runTest_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to 
`__svml_cos4_l9') +$(gpu_testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o endif -$(testmain): $(GTESTLIBS) -$(testmain): INCFLAGS += $(GTESTINC) -$(testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest -###$(testmain): LIBFLAGS += -lgtest_main # no longer necessary since we added main() to testxxx.cc +ifeq ($(GPUCC),) +$(cxx_testmain): $(GTESTLIBS) +$(cxx_testmain): INCFLAGS += $(GTESTINC) +$(cxx_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +else +$(gpu_testmain): $(GTESTLIBS) +$(gpu_testmain): INCFLAGS += $(GTESTINC) +$(gpu_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +endif +ifeq ($(GPUCC),) # if at all, OMP is used only in CXX builds (not in GPU builds) ifneq ($(OMPFLAGS),) ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -$(testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) +$(cxx_testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -$(testmain): LIBFLAGS += -L $(shell 
dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 +$(cxx_testmain): LIBFLAGS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 ###else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -###$(testmain): LIBFLAGS += ???? # OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) +###$(cxx_testmain): LIBFLAGS += ???? # OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) else -$(testmain): LIBFLAGS += -lgomp +$(cxx_testmain): LIBFLAGS += -lgomp +endif endif endif # Test quadmath in testmisc.cc tests for constexpr_math #627 -###$(testmain): LIBFLAGS += -lquadmath +###ifeq ($(GPUCC),) +###$(cxx_testmain): LIBFLAGS += -lquadmath +###else +###$(gpu_testmain): LIBFLAGS += -lquadmath +###endif # Bypass std::filesystem completely to ease portability on LUMI #803 -#ifneq ($(findstring hipcc,$(GPUCC)),) -#$(testmain): LIBFLAGS += -lstdc++fs -#endif +###ifneq ($(findstring hipcc,$(GPUCC)),) +###$(gpu_testmain): LIBFLAGS += -lstdc++fs +###endif -ifeq ($(GPUCC),) # link only runTest.o -$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) +ifeq ($(GPUCC),) # link only runTest_cpp.o +$(cxx_testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest.o and runTest_$(GPUSUFFIX).o) -$(testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) +else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and 
runTest_$(GPUSUFFIX).o) +$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else @@ -1051,51 +1064,50 @@ endif #------------------------------------------------------------------------------- -# Target: check (run the C++ test executable) -# [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -# [FIXME: SHOULD CHANGE THE TARGET NAME "check" THAT HAS NOTHING TO DO WITH "check.exe"] -ifneq ($(GPUCC),) -check: runTest cmpFGcheck -else -check: runTest cmpFcheck -endif +# Target: 'make test' (execute runTest.exe, and compare check.exe with fcheck.exe) +# [NB: THIS IS WHAT IS TESTED IN THE GITHUB CI!] +# [NB: This used to be called 'make check' but the name has been changed as this has nothing to do with 'check.exe'] +test: runTest cmpFcheck -# Target: runTest (run the C++ test executable runTest.exe) +# Target: runTest (run the C++ or CUDA/HIP test executable runTest.exe) runTest: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/runTest.exe +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/runTest_cpp.exe +else + $(RUNTIME) $(BUILDDIR)/runTest_$(GPUSUFFIX).exe +endif -# Target: runCheck (run the C++ standalone executable check.exe, with a small number of events) +# Target: runCheck (run the C++ or CUDA/HIP standalone executable check.exe, with a small number of events) runCheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 - -# Target: runGcheck (run the CUDA/HIP standalone executable gcheck.exe, with a small number of events) -runGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/gcheck.exe -p 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/check_cpp.exe -p 2 32 2 +else + $(RUNTIME) 
$(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +endif -# Target: runFcheck (run the Fortran standalone executable - with C++ MEs - fcheck.exe, with a small number of events) +# Target: runFcheck (run the Fortran standalone executable - with C++ or CUDA/HIP MEs - fcheck.exe, with a small number of events) runFcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 - -# Target: runFGcheck (run the Fortran standalone executable - with CUDA/HIP MEs - fgcheck.exe, with a small number of events) -runFGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 +else + $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 +endif -# Target: cmpFcheck (compare ME results from the C++ and Fortran with C++ MEs standalone executables, with a small number of events) +# Target: cmpFcheck (compare ME results from the C++/CUDA/HIP and Fortran with C++/CUDA/HIP MEs standalone executables, with a small number of events) cmpFcheck: all.$(TAG) @echo - @echo "$(BUILDDIR)/check.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi - -# Target: cmpFGcheck (compare ME results from the CUDA/HIP and Fortran with CUDA/HIP MEs standalone executables, with a small number of events) -cmpFGcheck: all.$(TAG) - @echo - @echo "$(BUILDDIR)/gcheck.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fgcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +ifeq ($(GPUCC),) + @echo "$(BUILDDIR)/check_cpp.exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_cpp.exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_cpp.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +else + @echo "$(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +endif # Target: cuda-memcheck (run the CUDA standalone executable gcheck.exe with a small number of events through cuda-memcheck) cuda-memcheck: all.$(TAG) - $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/gcheck.exe -p 2 32 2 + $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 
#------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/makefile b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/makefile index 22517d0cc6..6a21300f7a 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/makefile +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/makefile @@ -58,13 +58,15 @@ endif LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias CUDACPP_MAKEFILE=cudacpp.mk -CUDACPP_COMMONLIB=mg5amc_common processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ifeq ($(BACKEND),cuda) +CUDACPP_COMMONLIB=mg5amc_common_cuda CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cuda else ifeq ($(BACKEND),hip) +CUDACPP_COMMONLIB=mg5amc_common_hip CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_hip else +CUDACPP_COMMONLIB=mg5amc_common_cpp CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cpp endif diff --git a/epochX/cudacpp/heft_gg_bb.mad/src/cudacpp_src.mk b/epochX/cudacpp/heft_gg_bb.mad/src/cudacpp_src.mk index 8ab23b4e83..73e8ad5d8c 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/heft_gg_bb.mad/src/cudacpp_src.mk @@ -98,7 +98,11 @@ endif # NB1: there are no CUDA targets in src as we avoid RDC! # NB2: CUDA includes for curand.h are no longer needed in the C++ code anywhere in src! 
-MG5AMC_COMMONLIB = mg5amc_common +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif # Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) @@ -126,7 +130,7 @@ $(LIBDIR)/.build.$(TAG): #------------------------------------------------------------------------------- # Generic target and build rules: objects from C++ compilation -$(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -139,22 +143,22 @@ endif #------------------------------------------------------------------------------- -cxx_objects=$(addprefix $(BUILDDIR)/, read_slha.o) -ifneq ($(GPUCC),) - gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_heft_$(GPUSUFFIX).o) +cxx_objects=$(addprefix $(BUILDDIR)/, read_slha_cpp.o) +ifeq ($(GPUCC),) + cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_heft_cpp.o) else - cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_heft.o) + gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_heft_$(GPUSUFFIX).o) endif # Target (and build rules): common (src) library -ifneq ($(GPUCC),) -$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) - @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) -else +ifeq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi $(CXX) -shared -o $@ $(cxx_objects) $(LDFLAGS) +else +$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) + @if [ ! 
-d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi + $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt b/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt index 6fcf07675b..599dc14f9e 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt +++ b/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt @@ -66,7 +66,7 @@ INFO: load particles INFO: load vertices WARNING: coupling GC_13=-(complex(0,1)*GH) has direct dependence in aS but has QCD order set to 0. Automatic computation of scale uncertainty can be wrong for such model.  WARNING: coupling GC_16=(complex(0,1)*Gphi)/8. has direct dependence in aS but has QCD order set to 0. Automatic computation of scale uncertainty can be wrong for such model.  -DEBUG: model prefixing takes 0.005918025970458984  +DEBUG: model prefixing takes 0.0062215328216552734  INFO: Restrict model heft with file models/heft/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: s u w+ at order: QED=1  @@ -162,7 +162,7 @@ ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 4 routines in 0.273 s +ALOHA: aloha creates 4 routines in 0.274 s VVS3 VVV1 FFV1 @@ -179,7 +179,7 @@ INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. 
quit -real 0m0.707s -user 0m0.609s -sys 0m0.063s +real 0m0.684s +user 0m0.625s +sys 0m0.055s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk index 50ce6bb9c8..89da34009a 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk @@ -54,65 +54,6 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA/HIP - -INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here - -# Dependency on src directory -MG5AMC_COMMONLIB = mg5amc_common -LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -INCFLAGS += -I../../src - -# Compiler-specific googletest build directory (#125 and #738) -ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) - override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) - override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) - override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else - override CXXNAME = unknown -endif -###$(info CXXNAME=$(CXXNAME)) -override CXXNAMESUFFIX = _$(CXXNAME) - -# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) -export CXXNAMESUFFIX - -# Dependency on test directory -# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) -# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) -###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation -###LOCALGTEST = yes# 
comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation -TESTDIRCOMMON = ../../../../../test -TESTDIRLOCAL = ../../test -ifneq ($(wildcard $(GTEST_ROOT)),) - TESTDIR = -else ifneq ($(LOCALGTEST),) - TESTDIR=$(TESTDIRLOCAL) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) - TESTDIR = $(TESTDIRCOMMON) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else - TESTDIR = -endif -ifneq ($(GTEST_ROOT),) - GTESTLIBDIR = $(GTEST_ROOT)/lib64/ - GTESTLIBS = $(GTESTLIBDIR)/libgtest.a - GTESTINC = -I$(GTEST_ROOT)/include -else - GTESTLIBDIR = - GTESTLIBS = - GTESTINC = -endif -###$(info GTEST_ROOT = $(GTEST_ROOT)) -###$(info LOCALGTEST = $(LOCALGTEST)) -###$(info TESTDIR = $(TESTDIR)) - -#------------------------------------------------------------------------------- - #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available: eventually use native instead?) @@ -340,6 +281,69 @@ endif #------------------------------------------------------------------------------- +#=== Configure common compiler flags for C++ and CUDA/HIP + +INCFLAGS = -I. 
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here + +# Dependency on src directory +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif +LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +INCFLAGS += -I../../src + +# Compiler-specific googletest build directory (#125 and #738) +ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) + override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) + override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else + override CXXNAME = unknown +endif +###$(info CXXNAME=$(CXXNAME)) +override CXXNAMESUFFIX = _$(CXXNAME) + +# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) +export CXXNAMESUFFIX + +# Dependency on test directory +# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) +# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) +###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation +###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation +TESTDIRCOMMON = ../../../../../test +TESTDIRLOCAL = ../../test +ifneq ($(wildcard $(GTEST_ROOT)),) + TESTDIR = +else ifneq ($(LOCALGTEST),) + TESTDIR=$(TESTDIRLOCAL) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) + TESTDIR = $(TESTDIRCOMMON) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else + TESTDIR = +endif +ifneq 
($(GTEST_ROOT),) + GTESTLIBDIR = $(GTEST_ROOT)/lib64/ + GTESTLIBS = $(GTESTLIBDIR)/libgtest.a + GTESTINC = -I$(GTEST_ROOT)/include +else + GTESTLIBDIR = + GTESTLIBS = + GTESTINC = +endif +###$(info GTEST_ROOT = $(GTEST_ROOT)) +###$(info LOCALGTEST = $(LOCALGTEST)) +###$(info TESTDIR = $(TESTDIR)) + +#------------------------------------------------------------------------------- + #=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) @@ -606,27 +610,25 @@ override RUNTIME = #=== Makefile TARGETS and build rules below #=============================================================================== -cxx_main=$(BUILDDIR)/check.exe -fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(GPUCC),) - gpu_main=$(BUILDDIR)/gcheck.exe - fgpu_main=$(BUILDDIR)/fgcheck.exe +ifeq ($(GPUCC),) + cxx_checkmain=$(BUILDDIR)/check_cpp.exe + cxx_fcheckmain=$(BUILDDIR)/fcheck_cpp.exe + cxx_testmain=$(BUILDDIR)/runTest_cpp.exe else - gpu_main= - fgpu_main= + gpu_checkmain=$(BUILDDIR)/check_$(GPUSUFFIX).exe + gpu_fcheckmain=$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe + gpu_testmain=$(BUILDDIR)/runTest_$(GPUSUFFIX).exe endif -testmain=$(BUILDDIR)/runTest.exe - # Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) # First target (default goal) -ifneq ($(GPUCC),) -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_main) $(fgpu_main) $(if $(GTESTLIBS),$(testmain)) +ifeq ($(GPUCC),) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_checkmain) $(cxx_fcheckmain) $(if $(GTESTLIBS),$(cxx_testmain)) else -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_main) $(fcxx_main) $(if $(GTESTLIBS),$(testmain)) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_checkmain) $(gpu_fcheckmain) $(if $(GTESTLIBS),$(gpu_testmain)) endif # Target (and 
build options): debug @@ -643,30 +645,30 @@ $(BUILDDIR)/.build.$(TAG): @if [ "$(oldtagsb)" != "" ]; then echo "Cannot build for tag=$(TAG) as old builds exist for other tags:"; echo " $(oldtagsb)"; echo "Please run 'make clean' first\nIf 'make clean' is not enough: run 'make clean USEBUILDDIR=1 AVX=$(AVX) FPTYPE=$(FPTYPE)' or 'make cleanall'"; exit 1; fi @touch $(BUILDDIR)/.build.$(TAG) -# Apply special build flags only to CrossSectionKernel[_$(GPUSUFFIX)].o (no fast math, see #117 and #516) +# Apply special build flags only to CrossSectionKernel_.o (no fast math, see #117 and #516) # Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math endif -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o (NVTX in timermap.h, #679) -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) +# Apply special build flags only to check_sa_.o (NVTX in timermap.h, #679) +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o and (Cu|Hip)randRandomNumberKernel[_$(GPUSUFFIX)].o -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(RNDCXXFLAGS) +# Apply special build flags only to check_sa_.o and (Cu|Hip)randRandomNumberKernel_.o +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) 
+$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) ifeq ($(HASCURAND),hasCurand) # curand headers, #679 -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUDA_INC) +$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(CUDA_INC) endif ifeq ($(HASHIPRAND),hasHiprand) # hiprand headers -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(HIP_INC) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(HIP_INC) endif # Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) @@ -679,21 +681,21 @@ endif # Avoid clang warning "overriding '-ffp-contract=fast' option with '-ffp-contract=on'" (#516) # This patch does remove the warning, but I prefer to keep it disabled for the moment... ###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) -###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -Wno-overriding-t-option ###ifneq ($(GPUCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option ###endif ###endif #### Apply special build flags only to CPPProcess.o (-flto) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += -flto +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += -flto #### Apply special build flags only to CPPProcess.o (AVXFLAGS) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += $(AVXFLAGS) +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += $(AVXFLAGS) # Generic target and build rules: objects from C++ compilation # (NB do not include CUDA_INC here! 
add it only for NVTX or curand #679) -$(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -718,8 +720,8 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o -cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) @@ -728,8 +730,8 @@ gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/ endif # Target (and build rules): C++ and CUDA/HIP shared libraries -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge.o -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge_cpp.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) @@ -756,63 +758,63 @@ endif #------------------------------------------------------------------------------- # Target (and build rules): C++ and CUDA/HIP standalone executables -$(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cxx_main): $(BUILDDIR)/check_sa.o 
$(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o - $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o $(RNDLIBFLAGS) +$(cxx_checkmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_checkmain): $(BUILDDIR)/check_sa_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o + $(CXX) -o $@ $(BUILDDIR)/check_sa_cpp.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o $(RNDLIBFLAGS) ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(gpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(gpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_checkmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_checkmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(gpu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(gpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(gpu_main): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o 
+$(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif #------------------------------------------------------------------------------- # Generic target and build rules: objects from Fortran compilation -$(BUILDDIR)/%.o : %.f *.inc +$(BUILDDIR)/%_fortran.o : %.f *.inc @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(FC) -I. -c $< -o $@ # Generic target and build rules: objects from Fortran compilation -###$(BUILDDIR)/%.o : %.f *.inc +###$(BUILDDIR)/%_fortran.o : %.f *.inc ### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi ### @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi ### $(FC) -I. 
-I$(INCDIR) -c $< -o $@ # Target (and build rules): Fortran standalone executables -###$(BUILDDIR)/fcheck_sa.o : $(INCDIR)/fbridge.inc +###$(BUILDDIR)/fcheck_sa_fortran.o : $(INCDIR)/fbridge.inc ifeq ($(UNAME_S),Darwin) -$(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(cxx_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +$(cxx_fcheckmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ else - $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) + $(CXX) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) endif ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fgpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(fgpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_fcheckmain): LIBFLAGS += -lintlc # 
compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_fcheckmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) -$(fgpu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fgpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fgpu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) +$(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else - $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif endif @@ -820,83 +822,94 @@ endif # Target (and build rules): test objects and test executable ifeq ($(GPUCC),) -$(BUILDDIR)/testxxx.o: 
$(GTESTLIBS) -$(BUILDDIR)/testxxx.o: INCFLAGS += $(GTESTINC) -$(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions +$(BUILDDIR)/testxxx_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testxxx_cpp.o: INCFLAGS += $(GTESTINC) +$(BUILDDIR)/testxxx_cpp.o: testxxx_cc_ref.txt +$(cxx_testmain): $(BUILDDIR)/testxxx_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testxxx_cpp.o # Comment out this line to skip the C++ test of xxx functions else $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions +$(gpu_testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions endif ifneq ($(UNAME_S),Darwin) # Disable testmisc on Darwin (workaround for issue #838) ifeq ($(GPUCC),) -$(BUILDDIR)/testmisc.o: $(GTESTLIBS) -$(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests +$(BUILDDIR)/testmisc_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testmisc_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/testmisc_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testmisc_cpp.o # Comment out this line to skip the C++ miscellaneous tests else $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the 
CUDA/HIP miscellaneous tests +$(gpu_testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP miscellaneous tests endif endif ifeq ($(GPUCC),) -$(BUILDDIR)/runTest.o: $(GTESTLIBS) -$(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/runTest.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o +$(BUILDDIR)/runTest_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/runTest_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/runTest_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/runTest_cpp.o else $(BUILDDIR)/runTest_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/runTest_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o endif -$(testmain): $(GTESTLIBS) -$(testmain): INCFLAGS += $(GTESTINC) -$(testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest -###$(testmain): LIBFLAGS += -lgtest_main # no longer necessary 
since we added main() to testxxx.cc +ifeq ($(GPUCC),) +$(cxx_testmain): $(GTESTLIBS) +$(cxx_testmain): INCFLAGS += $(GTESTINC) +$(cxx_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +else +$(gpu_testmain): $(GTESTLIBS) +$(gpu_testmain): INCFLAGS += $(GTESTINC) +$(gpu_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +endif +ifeq ($(GPUCC),) # if at all, OMP is used only in CXX builds (not in GPU builds) ifneq ($(OMPFLAGS),) ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -$(testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) +$(cxx_testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -$(testmain): LIBFLAGS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 +$(cxx_testmain): LIBFLAGS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 ###else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -###$(testmain): LIBFLAGS += ???? # OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) +###$(cxx_testmain): LIBFLAGS += ???? 
# OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) else -$(testmain): LIBFLAGS += -lgomp +$(cxx_testmain): LIBFLAGS += -lgomp +endif endif endif # Test quadmath in testmisc.cc tests for constexpr_math #627 -###$(testmain): LIBFLAGS += -lquadmath +###ifeq ($(GPUCC),) +###$(cxx_testmain): LIBFLAGS += -lquadmath +###else +###$(gpu_testmain): LIBFLAGS += -lquadmath +###endif # Bypass std::filesystem completely to ease portability on LUMI #803 -#ifneq ($(findstring hipcc,$(GPUCC)),) -#$(testmain): LIBFLAGS += -lstdc++fs -#endif +###ifneq ($(findstring hipcc,$(GPUCC)),) +###$(gpu_testmain): LIBFLAGS += -lstdc++fs +###endif -ifeq ($(GPUCC),) # link only runTest.o -$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) +ifeq ($(GPUCC),) # link only runTest_cpp.o +$(cxx_testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest.o and runTest_$(GPUSUFFIX).o) -$(testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) +else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) +$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell 
dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else @@ -1051,51 +1064,50 @@ endif #------------------------------------------------------------------------------- -# Target: check (run the C++ test executable) -# [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -# [FIXME: SHOULD CHANGE THE TARGET NAME "check" THAT HAS NOTHING TO DO WITH "check.exe"] -ifneq ($(GPUCC),) -check: runTest cmpFGcheck -else -check: runTest cmpFcheck -endif +# Target: 'make test' (execute runTest.exe, and compare check.exe with fcheck.exe) +# [NB: THIS IS WHAT IS TESTED IN THE GITHUB CI!] +# [NB: This used to be called 'make check' but the name has been changed as this has nothing to do with 'check.exe'] +test: runTest cmpFcheck -# Target: runTest (run the C++ test executable runTest.exe) +# Target: runTest (run the C++ or CUDA/HIP test executable runTest.exe) runTest: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/runTest.exe +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/runTest_cpp.exe +else + $(RUNTIME) $(BUILDDIR)/runTest_$(GPUSUFFIX).exe +endif -# Target: runCheck (run the C++ standalone executable check.exe, with a small number of events) +# Target: runCheck (run the C++ or CUDA/HIP standalone executable check.exe, with a small number of events) runCheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 - -# Target: runGcheck (run the CUDA/HIP standalone executable gcheck.exe, with a small number of events) -runGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/gcheck.exe -p 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/check_cpp.exe -p 2 32 2 +else + $(RUNTIME) $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +endif -# Target: runFcheck (run the Fortran standalone executable - with C++ MEs - fcheck.exe, with a small number of events) +# Target: runFcheck (run the Fortran standalone executable - with C++ or CUDA/HIP MEs - fcheck.exe, with a small number of events) runFcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 - -# Target: runFGcheck (run the Fortran standalone 
executable - with CUDA/HIP MEs - fgcheck.exe, with a small number of events) -runFGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 +else + $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 +endif -# Target: cmpFcheck (compare ME results from the C++ and Fortran with C++ MEs standalone executables, with a small number of events) +# Target: cmpFcheck (compare ME results from the C++/CUDA/HIP and Fortran with C++/CUDA/HIP MEs standalone executables, with a small number of events) cmpFcheck: all.$(TAG) @echo - @echo "$(BUILDDIR)/check.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi - -# Target: cmpFGcheck (compare ME results from the CUDA/HIP and Fortran with CUDA/HIP MEs standalone executables, with a small number of events) -cmpFGcheck: all.$(TAG) - @echo - @echo "$(BUILDDIR)/gcheck.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fgcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! 
Fortran calculation (F77/GPU) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +ifeq ($(GPUCC),) + @echo "$(BUILDDIR)/check_cpp.exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_cpp.exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_cpp.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +else + @echo "$(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +endif # Target: cuda-memcheck (run the CUDA standalone executable gcheck.exe with a small number of events through cuda-memcheck) cuda-memcheck: all.$(TAG) - $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/gcheck.exe -p 2 32 2 + $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/heft_gg_bb.sa/src/cudacpp_src.mk b/epochX/cudacpp/heft_gg_bb.sa/src/cudacpp_src.mk index 8ab23b4e83..73e8ad5d8c 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/heft_gg_bb.sa/src/cudacpp_src.mk @@ -98,7 +98,11 @@ endif # NB1: there are no CUDA targets in src as we avoid RDC! # NB2: CUDA includes for curand.h are no longer needed in the C++ code anywhere in src! 
-MG5AMC_COMMONLIB = mg5amc_common +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif # Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) @@ -126,7 +130,7 @@ $(LIBDIR)/.build.$(TAG): #------------------------------------------------------------------------------- # Generic target and build rules: objects from C++ compilation -$(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -139,22 +143,22 @@ endif #------------------------------------------------------------------------------- -cxx_objects=$(addprefix $(BUILDDIR)/, read_slha.o) -ifneq ($(GPUCC),) - gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_heft_$(GPUSUFFIX).o) +cxx_objects=$(addprefix $(BUILDDIR)/, read_slha_cpp.o) +ifeq ($(GPUCC),) + cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_heft_cpp.o) else - cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_heft.o) + gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_heft_$(GPUSUFFIX).o) endif # Target (and build rules): common (src) library -ifneq ($(GPUCC),) -$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) - @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) -else +ifeq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi $(CXX) -shared -o $@ $(cxx_objects) $(LDFLAGS) +else +$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) + @if [ ! 
-d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi + $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt index 5a9d08cd3b..cc5cbed2bb 100644 --- a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt +++ b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define j = p INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005757570266723633  +DEBUG: model prefixing takes 0.005604982376098633  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -212,7 +212,7 @@ INFO: Process d~ g > t t~ d~ added to mirror process g d~ > t t~ d~ INFO: Process d~ d > t t~ g added to mirror process d d~ > t t~ g INFO: Process s~ g > t t~ s~ added to mirror process g s~ > t t~ s~ INFO: Process s~ s > t t~ g added to mirror process s s~ > t t~ g -13 processes with 76 diagrams generated in 0.143 s +13 processes with 76 diagrams generated in 0.142 s Total: 18 processes with 83 diagrams add process p p > t t~ j j @2 INFO: Checking for minimal orders which gives processes. @@ -378,7 +378,7 @@ INFO: Process s~ u~ > t t~ u~ s~ added to mirror process u~ s~ > t t~ u~ s~ INFO: Process s~ c~ > t t~ c~ s~ added to mirror process c~ s~ > t t~ c~ s~ INFO: Process s~ d~ > t t~ d~ s~ added to mirror process d~ s~ > t t~ d~ s~ INFO: Crossed process found for s~ s~ > t t~ s~ s~, reuse diagrams. 
-65 processes with 1119 diagrams generated in 1.919 s +65 processes with 1119 diagrams generated in 1.906 s Total: 83 processes with 1202 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -500,7 +500,7 @@ INFO: Combined process d d~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED INFO: Combined process s s~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED<=2 INFO: Creating files in directory P2_gg_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -517,7 +517,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg INFO: Creating files in directory P2_gg_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -534,7 +534,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxuux INFO: Creating files in directory P2_gu_ttxgu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -551,7 +551,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ g u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gu_ttxgu INFO: Creating files in directory P2_gux_ttxgux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -568,7 +568,7 @@ INFO: Generating Feynman diagrams for Process: g u~ > t t~ g u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gux_ttxgux INFO: Creating files in directory P2_uux_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -585,7 +585,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxgg INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -602,7 +602,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P2_uu_ttxuu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -619,7 +619,7 @@ INFO: Generating Feynman diagrams for Process: u u > t t~ u u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uu_ttxuu INFO: Creating files in directory P2_uux_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -636,7 +636,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxuux INFO: Creating files in directory P2_uxux_ttxuxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -653,7 +653,7 @@ INFO: Generating Feynman diagrams for Process: u~ u~ > t t~ u~ u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxux_ttxuxux INFO: Creating files in directory P2_uc_ttxuc DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -670,7 +670,7 @@ INFO: Generating Feynman diagrams for Process: u c > t t~ u c WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uc_ttxuc INFO: Creating files in directory P2_uux_ttxccx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -687,7 +687,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ c c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxccx INFO: Creating files in directory P2_ucx_ttxucx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -704,7 +704,7 @@ INFO: Generating Feynman diagrams for Process: u c~ > t t~ u c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group ucx_ttxucx INFO: Creating files in directory P2_uxcx_ttxuxcx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -721,7 +721,7 @@ INFO: Generating Feynman diagrams for Process: u~ c~ > t t~ u~ c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxcx_ttxuxcx INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -738,7 +738,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -755,7 +755,7 @@ INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux INFO: Creating files in directory P1_uux_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -772,7 +772,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group uux_ttxg INFO: Creating files in directory P0_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -789,7 +789,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group gg_ttx INFO: Creating files in directory P0_uux_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -804,15 +804,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group uux_ttx -Generated helas calls for 18 subprocesses (372 diagrams) in 1.344 s -Wrote files for 810 helas calls in 3.818 s +Generated helas calls for 18 subprocesses (372 diagrams) in 1.352 s +Wrote files for 810 helas calls in 3.380 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.347 s +ALOHA: aloha creates 5 routines in 0.352 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -820,7 +820,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.324 s +ALOHA: aloha creates 10 routines in 0.325 s VVV1 VVV1 FFV1 @@ -1030,9 +1030,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m9.573s -user 0m8.691s -sys 0m0.428s +real 0m9.708s +user 0m8.687s +sys 0m0.423s Code generation completed in 10 seconds ************************************************************ * * diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk index 50ce6bb9c8..89da34009a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk @@ -54,65 +54,6 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA/HIP - -INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here - -# Dependency on src directory -MG5AMC_COMMONLIB = mg5amc_common -LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -INCFLAGS += -I../../src - -# Compiler-specific googletest build directory (#125 and #738) -ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) - override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) - override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) - override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else - override CXXNAME = unknown -endif -###$(info CXXNAME=$(CXXNAME)) -override CXXNAMESUFFIX = _$(CXXNAME) - -# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) -export CXXNAMESUFFIX - -# Dependency on test directory -# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) -# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) -###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# 
example of an external gtest installation -###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation -TESTDIRCOMMON = ../../../../../test -TESTDIRLOCAL = ../../test -ifneq ($(wildcard $(GTEST_ROOT)),) - TESTDIR = -else ifneq ($(LOCALGTEST),) - TESTDIR=$(TESTDIRLOCAL) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) - TESTDIR = $(TESTDIRCOMMON) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else - TESTDIR = -endif -ifneq ($(GTEST_ROOT),) - GTESTLIBDIR = $(GTEST_ROOT)/lib64/ - GTESTLIBS = $(GTESTLIBDIR)/libgtest.a - GTESTINC = -I$(GTEST_ROOT)/include -else - GTESTLIBDIR = - GTESTLIBS = - GTESTINC = -endif -###$(info GTEST_ROOT = $(GTEST_ROOT)) -###$(info LOCALGTEST = $(LOCALGTEST)) -###$(info TESTDIR = $(TESTDIR)) - -#------------------------------------------------------------------------------- - #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available: eventually use native instead?) @@ -340,6 +281,69 @@ endif #------------------------------------------------------------------------------- +#=== Configure common compiler flags for C++ and CUDA/HIP + +INCFLAGS = -I. 
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here + +# Dependency on src directory +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif +LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +INCFLAGS += -I../../src + +# Compiler-specific googletest build directory (#125 and #738) +ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) + override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) + override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else + override CXXNAME = unknown +endif +###$(info CXXNAME=$(CXXNAME)) +override CXXNAMESUFFIX = _$(CXXNAME) + +# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) +export CXXNAMESUFFIX + +# Dependency on test directory +# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) +# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) +###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation +###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation +TESTDIRCOMMON = ../../../../../test +TESTDIRLOCAL = ../../test +ifneq ($(wildcard $(GTEST_ROOT)),) + TESTDIR = +else ifneq ($(LOCALGTEST),) + TESTDIR=$(TESTDIRLOCAL) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) + TESTDIR = $(TESTDIRCOMMON) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else + TESTDIR = +endif +ifneq 
($(GTEST_ROOT),) + GTESTLIBDIR = $(GTEST_ROOT)/lib64/ + GTESTLIBS = $(GTESTLIBDIR)/libgtest.a + GTESTINC = -I$(GTEST_ROOT)/include +else + GTESTLIBDIR = + GTESTLIBS = + GTESTINC = +endif +###$(info GTEST_ROOT = $(GTEST_ROOT)) +###$(info LOCALGTEST = $(LOCALGTEST)) +###$(info TESTDIR = $(TESTDIR)) + +#------------------------------------------------------------------------------- + #=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) @@ -606,27 +610,25 @@ override RUNTIME = #=== Makefile TARGETS and build rules below #=============================================================================== -cxx_main=$(BUILDDIR)/check.exe -fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(GPUCC),) - gpu_main=$(BUILDDIR)/gcheck.exe - fgpu_main=$(BUILDDIR)/fgcheck.exe +ifeq ($(GPUCC),) + cxx_checkmain=$(BUILDDIR)/check_cpp.exe + cxx_fcheckmain=$(BUILDDIR)/fcheck_cpp.exe + cxx_testmain=$(BUILDDIR)/runTest_cpp.exe else - gpu_main= - fgpu_main= + gpu_checkmain=$(BUILDDIR)/check_$(GPUSUFFIX).exe + gpu_fcheckmain=$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe + gpu_testmain=$(BUILDDIR)/runTest_$(GPUSUFFIX).exe endif -testmain=$(BUILDDIR)/runTest.exe - # Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) # First target (default goal) -ifneq ($(GPUCC),) -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_main) $(fgpu_main) $(if $(GTESTLIBS),$(testmain)) +ifeq ($(GPUCC),) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_checkmain) $(cxx_fcheckmain) $(if $(GTESTLIBS),$(cxx_testmain)) else -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_main) $(fcxx_main) $(if $(GTESTLIBS),$(testmain)) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_checkmain) $(gpu_fcheckmain) $(if $(GTESTLIBS),$(gpu_testmain)) endif # Target (and 
build options): debug @@ -643,30 +645,30 @@ $(BUILDDIR)/.build.$(TAG): @if [ "$(oldtagsb)" != "" ]; then echo "Cannot build for tag=$(TAG) as old builds exist for other tags:"; echo " $(oldtagsb)"; echo "Please run 'make clean' first\nIf 'make clean' is not enough: run 'make clean USEBUILDDIR=1 AVX=$(AVX) FPTYPE=$(FPTYPE)' or 'make cleanall'"; exit 1; fi @touch $(BUILDDIR)/.build.$(TAG) -# Apply special build flags only to CrossSectionKernel[_$(GPUSUFFIX)].o (no fast math, see #117 and #516) +# Apply special build flags only to CrossSectionKernel_.o (no fast math, see #117 and #516) # Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math endif -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o (NVTX in timermap.h, #679) -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) +# Apply special build flags only to check_sa_.o (NVTX in timermap.h, #679) +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o and (Cu|Hip)randRandomNumberKernel[_$(GPUSUFFIX)].o -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(RNDCXXFLAGS) +# Apply special build flags only to check_sa_.o and (Cu|Hip)randRandomNumberKernel_.o +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) 
+$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) ifeq ($(HASCURAND),hasCurand) # curand headers, #679 -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUDA_INC) +$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(CUDA_INC) endif ifeq ($(HASHIPRAND),hasHiprand) # hiprand headers -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(HIP_INC) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(HIP_INC) endif # Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) @@ -679,21 +681,21 @@ endif # Avoid clang warning "overriding '-ffp-contract=fast' option with '-ffp-contract=on'" (#516) # This patch does remove the warning, but I prefer to keep it disabled for the moment... ###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) -###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -Wno-overriding-t-option ###ifneq ($(GPUCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option ###endif ###endif #### Apply special build flags only to CPPProcess.o (-flto) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += -flto +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += -flto #### Apply special build flags only to CPPProcess.o (AVXFLAGS) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += $(AVXFLAGS) +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += $(AVXFLAGS) # Generic target and build rules: objects from C++ compilation # (NB do not include CUDA_INC here! 
add it only for NVTX or curand #679) -$(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -718,8 +720,8 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o -cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) @@ -728,8 +730,8 @@ gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/ endif # Target (and build rules): C++ and CUDA/HIP shared libraries -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge.o -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge_cpp.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) @@ -756,63 +758,63 @@ endif #------------------------------------------------------------------------------- # Target (and build rules): C++ and CUDA/HIP standalone executables -$(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cxx_main): $(BUILDDIR)/check_sa.o 
$(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o - $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o $(RNDLIBFLAGS) +$(cxx_checkmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_checkmain): $(BUILDDIR)/check_sa_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o + $(CXX) -o $@ $(BUILDDIR)/check_sa_cpp.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o $(RNDLIBFLAGS) ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(gpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(gpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_checkmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_checkmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(gpu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(gpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(gpu_main): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o 
+$(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif #------------------------------------------------------------------------------- # Generic target and build rules: objects from Fortran compilation -$(BUILDDIR)/%.o : %.f *.inc +$(BUILDDIR)/%_fortran.o : %.f *.inc @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(FC) -I. -c $< -o $@ # Generic target and build rules: objects from Fortran compilation -###$(BUILDDIR)/%.o : %.f *.inc +###$(BUILDDIR)/%_fortran.o : %.f *.inc ### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi ### @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi ### $(FC) -I. 
-I$(INCDIR) -c $< -o $@ # Target (and build rules): Fortran standalone executables -###$(BUILDDIR)/fcheck_sa.o : $(INCDIR)/fbridge.inc +###$(BUILDDIR)/fcheck_sa_fortran.o : $(INCDIR)/fbridge.inc ifeq ($(UNAME_S),Darwin) -$(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(cxx_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +$(cxx_fcheckmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ else - $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) + $(CXX) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) endif ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fgpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(fgpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_fcheckmain): LIBFLAGS += -lintlc # 
compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_fcheckmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) -$(fgpu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fgpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fgpu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) +$(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else - $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif endif @@ -820,83 +822,94 @@ endif # Target (and build rules): test objects and test executable ifeq ($(GPUCC),) -$(BUILDDIR)/testxxx.o: 
$(GTESTLIBS) -$(BUILDDIR)/testxxx.o: INCFLAGS += $(GTESTINC) -$(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions +$(BUILDDIR)/testxxx_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testxxx_cpp.o: INCFLAGS += $(GTESTINC) +$(BUILDDIR)/testxxx_cpp.o: testxxx_cc_ref.txt +$(cxx_testmain): $(BUILDDIR)/testxxx_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testxxx_cpp.o # Comment out this line to skip the C++ test of xxx functions else $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions +$(gpu_testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions endif ifneq ($(UNAME_S),Darwin) # Disable testmisc on Darwin (workaround for issue #838) ifeq ($(GPUCC),) -$(BUILDDIR)/testmisc.o: $(GTESTLIBS) -$(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests +$(BUILDDIR)/testmisc_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testmisc_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/testmisc_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testmisc_cpp.o # Comment out this line to skip the C++ miscellaneous tests else $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the 
CUDA/HIP miscellaneous tests +$(gpu_testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP miscellaneous tests endif endif ifeq ($(GPUCC),) -$(BUILDDIR)/runTest.o: $(GTESTLIBS) -$(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/runTest.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o +$(BUILDDIR)/runTest_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/runTest_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/runTest_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/runTest_cpp.o else $(BUILDDIR)/runTest_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/runTest_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o endif -$(testmain): $(GTESTLIBS) -$(testmain): INCFLAGS += $(GTESTINC) -$(testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest -###$(testmain): LIBFLAGS += -lgtest_main # no longer necessary 
since we added main() to testxxx.cc +ifeq ($(GPUCC),) +$(cxx_testmain): $(GTESTLIBS) +$(cxx_testmain): INCFLAGS += $(GTESTINC) +$(cxx_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +else +$(gpu_testmain): $(GTESTLIBS) +$(gpu_testmain): INCFLAGS += $(GTESTINC) +$(gpu_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +endif +ifeq ($(GPUCC),) # if at all, OMP is used only in CXX builds (not in GPU builds) ifneq ($(OMPFLAGS),) ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -$(testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) +$(cxx_testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -$(testmain): LIBFLAGS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 +$(cxx_testmain): LIBFLAGS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 ###else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -###$(testmain): LIBFLAGS += ???? # OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) +###$(cxx_testmain): LIBFLAGS += ???? 
# OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) else -$(testmain): LIBFLAGS += -lgomp +$(cxx_testmain): LIBFLAGS += -lgomp +endif endif endif # Test quadmath in testmisc.cc tests for constexpr_math #627 -###$(testmain): LIBFLAGS += -lquadmath +###ifeq ($(GPUCC),) +###$(cxx_testmain): LIBFLAGS += -lquadmath +###else +###$(gpu_testmain): LIBFLAGS += -lquadmath +###endif # Bypass std::filesystem completely to ease portability on LUMI #803 -#ifneq ($(findstring hipcc,$(GPUCC)),) -#$(testmain): LIBFLAGS += -lstdc++fs -#endif +###ifneq ($(findstring hipcc,$(GPUCC)),) +###$(gpu_testmain): LIBFLAGS += -lstdc++fs +###endif -ifeq ($(GPUCC),) # link only runTest.o -$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) +ifeq ($(GPUCC),) # link only runTest_cpp.o +$(cxx_testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest.o and runTest_$(GPUSUFFIX).o) -$(testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) +else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) +$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell 
dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else @@ -1051,51 +1064,50 @@ endif #------------------------------------------------------------------------------- -# Target: check (run the C++ test executable) -# [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -# [FIXME: SHOULD CHANGE THE TARGET NAME "check" THAT HAS NOTHING TO DO WITH "check.exe"] -ifneq ($(GPUCC),) -check: runTest cmpFGcheck -else -check: runTest cmpFcheck -endif +# Target: 'make test' (execute runTest.exe, and compare check.exe with fcheck.exe) +# [NB: THIS IS WHAT IS TESTED IN THE GITHUB CI!] +# [NB: This used to be called 'make check' but the name has been changed as this has nothing to do with 'check.exe'] +test: runTest cmpFcheck -# Target: runTest (run the C++ test executable runTest.exe) +# Target: runTest (run the C++ or CUDA/HIP test executable runTest.exe) runTest: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/runTest.exe +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/runTest_cpp.exe +else + $(RUNTIME) $(BUILDDIR)/runTest_$(GPUSUFFIX).exe +endif -# Target: runCheck (run the C++ standalone executable check.exe, with a small number of events) +# Target: runCheck (run the C++ or CUDA/HIP standalone executable check.exe, with a small number of events) runCheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 - -# Target: runGcheck (run the CUDA/HIP standalone executable gcheck.exe, with a small number of events) -runGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/gcheck.exe -p 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/check_cpp.exe -p 2 32 2 +else + $(RUNTIME) $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +endif -# Target: runFcheck (run the Fortran standalone executable - with C++ MEs - fcheck.exe, with a small number of events) +# Target: runFcheck (run the Fortran standalone executable - with C++ or CUDA/HIP MEs - fcheck.exe, with a small number of events) runFcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 - -# Target: runFGcheck (run the Fortran standalone 
executable - with CUDA/HIP MEs - fgcheck.exe, with a small number of events) -runFGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 +else + $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 +endif -# Target: cmpFcheck (compare ME results from the C++ and Fortran with C++ MEs standalone executables, with a small number of events) +# Target: cmpFcheck (compare ME results from the C++/CUDA/HIP and Fortran with C++/CUDA/HIP MEs standalone executables, with a small number of events) cmpFcheck: all.$(TAG) @echo - @echo "$(BUILDDIR)/check.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi - -# Target: cmpFGcheck (compare ME results from the CUDA/HIP and Fortran with CUDA/HIP MEs standalone executables, with a small number of events) -cmpFGcheck: all.$(TAG) - @echo - @echo "$(BUILDDIR)/gcheck.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fgcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! 
Fortran calculation (F77/GPU) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +ifeq ($(GPUCC),) + @echo "$(BUILDDIR)/check_cpp.exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_cpp.exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_cpp.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +else + @echo "$(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +endif # Target: cuda-memcheck (run the CUDA standalone executable gcheck.exe with a small number of events through cuda-memcheck) cuda-memcheck: all.$(TAG) - $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/gcheck.exe -p 2 32 2 + $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/makefile b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/makefile index 22517d0cc6..6a21300f7a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/makefile +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/makefile @@ -58,13 +58,15 @@ endif LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias CUDACPP_MAKEFILE=cudacpp.mk -CUDACPP_COMMONLIB=mg5amc_common processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ifeq ($(BACKEND),cuda) +CUDACPP_COMMONLIB=mg5amc_common_cuda CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cuda else ifeq ($(BACKEND),hip) +CUDACPP_COMMONLIB=mg5amc_common_hip CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_hip else 
+CUDACPP_COMMONLIB=mg5amc_common_cpp CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cpp endif diff --git a/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_src.mk b/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_src.mk index 655d87b749..a66801a71e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_src.mk @@ -98,7 +98,11 @@ endif # NB1: there are no CUDA targets in src as we avoid RDC! # NB2: CUDA includes for curand.h are no longer needed in the C++ code anywhere in src! -MG5AMC_COMMONLIB = mg5amc_common +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif # Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) @@ -126,7 +130,7 @@ $(LIBDIR)/.build.$(TAG): #------------------------------------------------------------------------------- # Generic target and build rules: objects from C++ compilation -$(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -139,22 +143,22 @@ endif #------------------------------------------------------------------------------- -cxx_objects=$(addprefix $(BUILDDIR)/, read_slha.o) -ifneq ($(GPUCC),) - gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_$(GPUSUFFIX).o) +cxx_objects=$(addprefix $(BUILDDIR)/, read_slha_cpp.o) +ifeq ($(GPUCC),) + cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_sm_cpp.o) else - cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_sm.o) + gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_$(GPUSUFFIX).o) endif # Target (and build rules): common (src) library -ifneq ($(GPUCC),) -$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) - @if [ ! 
-d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) -else +ifeq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi $(CXX) -shared -o $@ $(cxx_objects) $(LDFLAGS) +else +$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) + @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi + $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt b/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt index fc3bde1247..70288a15e0 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt +++ b/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt @@ -77,7 +77,7 @@ INFO: load vertices DEBUG: MG5 converter defines FFFF26 to Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjP(-5,1)*ProjP(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjP(-5,3)*ProjP(-3,1) + Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjM(-5,1)*ProjM(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjM(-5,3)*ProjM(-3,1)  DEBUG: MG5 converter defines FFFF27 to ProjP(2,1)*ProjP(4,3) + ProjM(2,1)*ProjM(4,3)  DEBUG: MG5 converter defines FFFF112 to ProjM(2,3)*ProjM(4,1) + ProjP(2,3)*ProjP(4,1)  -DEBUG: model prefixing takes 0.1441042423248291  +DEBUG: model prefixing takes 0.14343023300170898  INFO: Change particles name to pass to MG5 convention Defined multiparticle p = g u c d s u~ c~ d~ s~ Defined multiparticle j = g u c d s u~ c~ d~ s~ @@ -92,7 +92,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED+99*SMHLOOP+99*NP+99*NPshifts+99*NPprop+99*NPcpv+NPcbb+NPcbB+NPcbBB+NPcbd1+NPcbd8+NPcbe+NPcbG+NPcbH+NPcbj1+NPcbj8+NPcbl+NPcbu1+NPcbu8+NPcbW+NPcdB+NPcdd1+NPcdd8+NPcdG+NPcdH+NPcdW+NPceB+NPced+NPcee+NPceH+NPceu+NPceW+NPcG+NPcGtil+NPcH+NPcHB+NPcHbox+NPcHbq+NPcHBtil+NPcHd+NPcHDD+NPcHe+NPcHG+NPcHGtil+NPcHj1+NPcHj3+NPcHl1+NPcHl3+NPcHQ1+NPcHQ3+NPcHt+NPcHtb+NPcHu+NPcHud+NPcHW+NPcHWB+NPcHWBtil+NPcHWtil+NPcjd1+NPcjd8+NPcje+NPcjj11+NPcjj18+NPcjj31+NPcjj38+NPcjQbd1+NPcjQbd8+NPcjQtu1+NPcjQtu8+NPcjtQd1+NPcjtQd8+NPcju1+NPcju8+NPcjujd1+NPcjujd11+NPcjujd8+NPcjujd81+NPcjuQb1+NPcjuQb8+NPcld+NPcle+NPclebQ+NPcledj+NPcleju1+NPcleju3+NPcleQt1+NPcleQt3+NPclj1+NPclj3+NPcll+NPcll1+NPclu+NPcQb1+NPcQb8+NPcQd1+NPcQd8+NPcQe+NPcQj11+NPcQj18+NPcQj31+NPcQj38+NPcQl1+NPcQl3+NPcQQ1+NPcQQ8+NPcQt1+NPcQt8+NPcQtjd1+NPcQtjd8+NPcQtQb1+NPcQtQb8+NPcQu1+NPcQu8+NPcQujb1+NPcQujb8+NPctB+NPctb1+NPctb8+NPctd1+NPctd8+NPcte+NPctG+NPctH+NPctj1+NPctj8+NPctl+NPctt+NPctu1+NPctu8+NPctW+NPcuB+NPcud1+NPcud8+NPcuG+NPcuH+NPcutbd1+NPcutbd8+NPcuu1+NPcuu8+NPcuW+NPcW+NPcWtil+NPQjujb8 INFO: Trying process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Process has 72 diagrams -1 processes with 72 diagrams generated in 3.863 s +1 processes with 72 diagrams generated in 3.871 s Total: 1 processes with 72 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_smeft_gg_tttt --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -115,7 +115,7 @@ INFO: Generating Helas calls for process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ t t~ @1 INFO: Creating files in directory P1_gg_ttxttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -130,15 +130,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxttx -Generated helas calls for 1 subprocesses (72 diagrams) in 0.199 s -Wrote files for 119 helas calls in 0.434 s +Generated helas calls for 1 subprocesses (72 diagrams) in 0.195 s +Wrote files for 119 helas calls in 0.431 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 5 routines in 0.326 s +ALOHA: aloha creates 5 routines in 0.328 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines @@ -146,7 +146,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 10 routines in 0.340 s +ALOHA: aloha creates 10 routines in 0.342 s VVV5 VVV5 FFV1 @@ -191,9 +191,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m7.076s -user 0m6.791s -sys 0m0.267s +real 0m7.071s +user 0m6.803s +sys 0m0.249s Code generation completed in 7 seconds ************************************************************ * * diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk index 50ce6bb9c8..89da34009a 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk @@ -54,65 +54,6 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA/HIP - -INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here - -# Dependency on src directory -MG5AMC_COMMONLIB = mg5amc_common -LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -INCFLAGS += -I../../src - -# Compiler-specific googletest build directory (#125 and #738) -ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) - override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) - override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) - override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else - override CXXNAME = unknown -endif -###$(info CXXNAME=$(CXXNAME)) -override CXXNAMESUFFIX = _$(CXXNAME) - -# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) -export CXXNAMESUFFIX - -# Dependency on test directory -# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) -# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) -###GTEST_ROOT = 
/cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation -###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation -TESTDIRCOMMON = ../../../../../test -TESTDIRLOCAL = ../../test -ifneq ($(wildcard $(GTEST_ROOT)),) - TESTDIR = -else ifneq ($(LOCALGTEST),) - TESTDIR=$(TESTDIRLOCAL) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) - TESTDIR = $(TESTDIRCOMMON) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else - TESTDIR = -endif -ifneq ($(GTEST_ROOT),) - GTESTLIBDIR = $(GTEST_ROOT)/lib64/ - GTESTLIBS = $(GTESTLIBDIR)/libgtest.a - GTESTINC = -I$(GTEST_ROOT)/include -else - GTESTLIBDIR = - GTESTLIBS = - GTESTINC = -endif -###$(info GTEST_ROOT = $(GTEST_ROOT)) -###$(info LOCALGTEST = $(LOCALGTEST)) -###$(info TESTDIR = $(TESTDIR)) - -#------------------------------------------------------------------------------- - #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available: eventually use native instead?) @@ -340,6 +281,69 @@ endif #------------------------------------------------------------------------------- +#=== Configure common compiler flags for C++ and CUDA/HIP + +INCFLAGS = -I. 
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here + +# Dependency on src directory +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif +LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +INCFLAGS += -I../../src + +# Compiler-specific googletest build directory (#125 and #738) +ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) + override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) + override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else + override CXXNAME = unknown +endif +###$(info CXXNAME=$(CXXNAME)) +override CXXNAMESUFFIX = _$(CXXNAME) + +# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) +export CXXNAMESUFFIX + +# Dependency on test directory +# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) +# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) +###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation +###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation +TESTDIRCOMMON = ../../../../../test +TESTDIRLOCAL = ../../test +ifneq ($(wildcard $(GTEST_ROOT)),) + TESTDIR = +else ifneq ($(LOCALGTEST),) + TESTDIR=$(TESTDIRLOCAL) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) + TESTDIR = $(TESTDIRCOMMON) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else + TESTDIR = +endif +ifneq 
($(GTEST_ROOT),) + GTESTLIBDIR = $(GTEST_ROOT)/lib64/ + GTESTLIBS = $(GTESTLIBDIR)/libgtest.a + GTESTINC = -I$(GTEST_ROOT)/include +else + GTESTLIBDIR = + GTESTLIBS = + GTESTINC = +endif +###$(info GTEST_ROOT = $(GTEST_ROOT)) +###$(info LOCALGTEST = $(LOCALGTEST)) +###$(info TESTDIR = $(TESTDIR)) + +#------------------------------------------------------------------------------- + #=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) @@ -606,27 +610,25 @@ override RUNTIME = #=== Makefile TARGETS and build rules below #=============================================================================== -cxx_main=$(BUILDDIR)/check.exe -fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(GPUCC),) - gpu_main=$(BUILDDIR)/gcheck.exe - fgpu_main=$(BUILDDIR)/fgcheck.exe +ifeq ($(GPUCC),) + cxx_checkmain=$(BUILDDIR)/check_cpp.exe + cxx_fcheckmain=$(BUILDDIR)/fcheck_cpp.exe + cxx_testmain=$(BUILDDIR)/runTest_cpp.exe else - gpu_main= - fgpu_main= + gpu_checkmain=$(BUILDDIR)/check_$(GPUSUFFIX).exe + gpu_fcheckmain=$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe + gpu_testmain=$(BUILDDIR)/runTest_$(GPUSUFFIX).exe endif -testmain=$(BUILDDIR)/runTest.exe - # Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) # First target (default goal) -ifneq ($(GPUCC),) -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_main) $(fgpu_main) $(if $(GTESTLIBS),$(testmain)) +ifeq ($(GPUCC),) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_checkmain) $(cxx_fcheckmain) $(if $(GTESTLIBS),$(cxx_testmain)) else -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_main) $(fcxx_main) $(if $(GTESTLIBS),$(testmain)) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_checkmain) $(gpu_fcheckmain) $(if $(GTESTLIBS),$(gpu_testmain)) endif # Target (and 
build options): debug @@ -643,30 +645,30 @@ $(BUILDDIR)/.build.$(TAG): @if [ "$(oldtagsb)" != "" ]; then echo "Cannot build for tag=$(TAG) as old builds exist for other tags:"; echo " $(oldtagsb)"; echo "Please run 'make clean' first\nIf 'make clean' is not enough: run 'make clean USEBUILDDIR=1 AVX=$(AVX) FPTYPE=$(FPTYPE)' or 'make cleanall'"; exit 1; fi @touch $(BUILDDIR)/.build.$(TAG) -# Apply special build flags only to CrossSectionKernel[_$(GPUSUFFIX)].o (no fast math, see #117 and #516) +# Apply special build flags only to CrossSectionKernel_.o (no fast math, see #117 and #516) # Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math endif -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o (NVTX in timermap.h, #679) -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) +# Apply special build flags only to check_sa_.o (NVTX in timermap.h, #679) +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o and (Cu|Hip)randRandomNumberKernel[_$(GPUSUFFIX)].o -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(RNDCXXFLAGS) +# Apply special build flags only to check_sa_.o and (Cu|Hip)randRandomNumberKernel_.o +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) 
+$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) ifeq ($(HASCURAND),hasCurand) # curand headers, #679 -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUDA_INC) +$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(CUDA_INC) endif ifeq ($(HASHIPRAND),hasHiprand) # hiprand headers -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(HIP_INC) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(HIP_INC) endif # Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) @@ -679,21 +681,21 @@ endif # Avoid clang warning "overriding '-ffp-contract=fast' option with '-ffp-contract=on'" (#516) # This patch does remove the warning, but I prefer to keep it disabled for the moment... ###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) -###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -Wno-overriding-t-option ###ifneq ($(GPUCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option ###endif ###endif #### Apply special build flags only to CPPProcess.o (-flto) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += -flto +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += -flto #### Apply special build flags only to CPPProcess.o (AVXFLAGS) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += $(AVXFLAGS) +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += $(AVXFLAGS) # Generic target and build rules: objects from C++ compilation # (NB do not include CUDA_INC here! 
add it only for NVTX or curand #679) -$(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -718,8 +720,8 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o -cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) @@ -728,8 +730,8 @@ gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/ endif # Target (and build rules): C++ and CUDA/HIP shared libraries -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge.o -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge_cpp.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) @@ -756,63 +758,63 @@ endif #------------------------------------------------------------------------------- # Target (and build rules): C++ and CUDA/HIP standalone executables -$(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cxx_main): $(BUILDDIR)/check_sa.o 
$(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o - $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o $(RNDLIBFLAGS) +$(cxx_checkmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_checkmain): $(BUILDDIR)/check_sa_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o + $(CXX) -o $@ $(BUILDDIR)/check_sa_cpp.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o $(RNDLIBFLAGS) ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(gpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(gpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_checkmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_checkmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(gpu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(gpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(gpu_main): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o 
+$(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif #------------------------------------------------------------------------------- # Generic target and build rules: objects from Fortran compilation -$(BUILDDIR)/%.o : %.f *.inc +$(BUILDDIR)/%_fortran.o : %.f *.inc @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(FC) -I. -c $< -o $@ # Generic target and build rules: objects from Fortran compilation -###$(BUILDDIR)/%.o : %.f *.inc +###$(BUILDDIR)/%_fortran.o : %.f *.inc ### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi ### @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi ### $(FC) -I. 
-I$(INCDIR) -c $< -o $@ # Target (and build rules): Fortran standalone executables -###$(BUILDDIR)/fcheck_sa.o : $(INCDIR)/fbridge.inc +###$(BUILDDIR)/fcheck_sa_fortran.o : $(INCDIR)/fbridge.inc ifeq ($(UNAME_S),Darwin) -$(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(cxx_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +$(cxx_fcheckmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ else - $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) + $(CXX) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) endif ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fgpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(fgpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_fcheckmain): LIBFLAGS += -lintlc # 
compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_fcheckmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) -$(fgpu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fgpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fgpu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) +$(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else - $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif endif @@ -820,83 +822,94 @@ endif # Target (and build rules): test objects and test executable ifeq ($(GPUCC),) -$(BUILDDIR)/testxxx.o: 
$(GTESTLIBS) -$(BUILDDIR)/testxxx.o: INCFLAGS += $(GTESTINC) -$(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions +$(BUILDDIR)/testxxx_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testxxx_cpp.o: INCFLAGS += $(GTESTINC) +$(BUILDDIR)/testxxx_cpp.o: testxxx_cc_ref.txt +$(cxx_testmain): $(BUILDDIR)/testxxx_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testxxx_cpp.o # Comment out this line to skip the C++ test of xxx functions else $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions +$(gpu_testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions endif ifneq ($(UNAME_S),Darwin) # Disable testmisc on Darwin (workaround for issue #838) ifeq ($(GPUCC),) -$(BUILDDIR)/testmisc.o: $(GTESTLIBS) -$(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests +$(BUILDDIR)/testmisc_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testmisc_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/testmisc_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testmisc_cpp.o # Comment out this line to skip the C++ miscellaneous tests else $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the 
CUDA/HIP miscellaneous tests +$(gpu_testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP miscellaneous tests endif endif ifeq ($(GPUCC),) -$(BUILDDIR)/runTest.o: $(GTESTLIBS) -$(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/runTest.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o +$(BUILDDIR)/runTest_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/runTest_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/runTest_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/runTest_cpp.o else $(BUILDDIR)/runTest_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/runTest_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o endif -$(testmain): $(GTESTLIBS) -$(testmain): INCFLAGS += $(GTESTINC) -$(testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest -###$(testmain): LIBFLAGS += -lgtest_main # no longer necessary 
since we added main() to testxxx.cc +ifeq ($(GPUCC),) +$(cxx_testmain): $(GTESTLIBS) +$(cxx_testmain): INCFLAGS += $(GTESTINC) +$(cxx_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +else +$(gpu_testmain): $(GTESTLIBS) +$(gpu_testmain): INCFLAGS += $(GTESTINC) +$(gpu_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +endif +ifeq ($(GPUCC),) # if at all, OMP is used only in CXX builds (not in GPU builds) ifneq ($(OMPFLAGS),) ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -$(testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) +$(cxx_testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -$(testmain): LIBFLAGS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 +$(cxx_testmain): LIBFLAGS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 ###else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -###$(testmain): LIBFLAGS += ???? # OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) +###$(cxx_testmain): LIBFLAGS += ???? 
# OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) else -$(testmain): LIBFLAGS += -lgomp +$(cxx_testmain): LIBFLAGS += -lgomp +endif endif endif # Test quadmath in testmisc.cc tests for constexpr_math #627 -###$(testmain): LIBFLAGS += -lquadmath +###ifeq ($(GPUCC),) +###$(cxx_testmain): LIBFLAGS += -lquadmath +###else +###$(gpu_testmain): LIBFLAGS += -lquadmath +###endif # Bypass std::filesystem completely to ease portability on LUMI #803 -#ifneq ($(findstring hipcc,$(GPUCC)),) -#$(testmain): LIBFLAGS += -lstdc++fs -#endif +###ifneq ($(findstring hipcc,$(GPUCC)),) +###$(gpu_testmain): LIBFLAGS += -lstdc++fs +###endif -ifeq ($(GPUCC),) # link only runTest.o -$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) +ifeq ($(GPUCC),) # link only runTest_cpp.o +$(cxx_testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest.o and runTest_$(GPUSUFFIX).o) -$(testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) +else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) +$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell 
dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else @@ -1051,51 +1064,50 @@ endif #------------------------------------------------------------------------------- -# Target: check (run the C++ test executable) -# [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -# [FIXME: SHOULD CHANGE THE TARGET NAME "check" THAT HAS NOTHING TO DO WITH "check.exe"] -ifneq ($(GPUCC),) -check: runTest cmpFGcheck -else -check: runTest cmpFcheck -endif +# Target: 'make test' (execute runTest.exe, and compare check.exe with fcheck.exe) +# [NB: THIS IS WHAT IS TESTED IN THE GITHUB CI!] +# [NB: This used to be called 'make check' but the name has been changed as this has nothing to do with 'check.exe'] +test: runTest cmpFcheck -# Target: runTest (run the C++ test executable runTest.exe) +# Target: runTest (run the C++ or CUDA/HIP test executable runTest.exe) runTest: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/runTest.exe +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/runTest_cpp.exe +else + $(RUNTIME) $(BUILDDIR)/runTest_$(GPUSUFFIX).exe +endif -# Target: runCheck (run the C++ standalone executable check.exe, with a small number of events) +# Target: runCheck (run the C++ or CUDA/HIP standalone executable check.exe, with a small number of events) runCheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 - -# Target: runGcheck (run the CUDA/HIP standalone executable gcheck.exe, with a small number of events) -runGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/gcheck.exe -p 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/check_cpp.exe -p 2 32 2 +else + $(RUNTIME) $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +endif -# Target: runFcheck (run the Fortran standalone executable - with C++ MEs - fcheck.exe, with a small number of events) +# Target: runFcheck (run the Fortran standalone executable - with C++ or CUDA/HIP MEs - fcheck.exe, with a small number of events) runFcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 - -# Target: runFGcheck (run the Fortran standalone 
executable - with CUDA/HIP MEs - fgcheck.exe, with a small number of events) -runFGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 +else + $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 +endif -# Target: cmpFcheck (compare ME results from the C++ and Fortran with C++ MEs standalone executables, with a small number of events) +# Target: cmpFcheck (compare ME results from the C++/CUDA/HIP and Fortran with C++/CUDA/HIP MEs standalone executables, with a small number of events) cmpFcheck: all.$(TAG) @echo - @echo "$(BUILDDIR)/check.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi - -# Target: cmpFGcheck (compare ME results from the CUDA/HIP and Fortran with CUDA/HIP MEs standalone executables, with a small number of events) -cmpFGcheck: all.$(TAG) - @echo - @echo "$(BUILDDIR)/gcheck.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fgcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! 
Fortran calculation (F77/GPU) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +ifeq ($(GPUCC),) + @echo "$(BUILDDIR)/check_cpp.exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_cpp.exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_cpp.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +else + @echo "$(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +endif # Target: cuda-memcheck (run the CUDA standalone executable gcheck.exe with a small number of events through cuda-memcheck) cuda-memcheck: all.$(TAG) - $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/gcheck.exe -p 2 32 2 + $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/makefile b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/makefile index 22517d0cc6..6a21300f7a 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/makefile +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/makefile @@ -58,13 +58,15 @@ endif LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias CUDACPP_MAKEFILE=cudacpp.mk -CUDACPP_COMMONLIB=mg5amc_common processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ifeq ($(BACKEND),cuda) +CUDACPP_COMMONLIB=mg5amc_common_cuda CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cuda else ifeq ($(BACKEND),hip) +CUDACPP_COMMONLIB=mg5amc_common_hip CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_hip else 
+CUDACPP_COMMONLIB=mg5amc_common_cpp CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cpp endif diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/src/cudacpp_src.mk b/epochX/cudacpp/smeft_gg_tttt.mad/src/cudacpp_src.mk index 2a3a9f3698..995525c5ca 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/smeft_gg_tttt.mad/src/cudacpp_src.mk @@ -98,7 +98,11 @@ endif # NB1: there are no CUDA targets in src as we avoid RDC! # NB2: CUDA includes for curand.h are no longer needed in the C++ code anywhere in src! -MG5AMC_COMMONLIB = mg5amc_common +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif # Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) @@ -126,7 +130,7 @@ $(LIBDIR)/.build.$(TAG): #------------------------------------------------------------------------------- # Generic target and build rules: objects from C++ compilation -$(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -139,22 +143,22 @@ endif #------------------------------------------------------------------------------- -cxx_objects=$(addprefix $(BUILDDIR)/, read_slha.o) -ifneq ($(GPUCC),) - gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_SMEFTsim_topU3l_MwScheme_UFO_$(GPUSUFFIX).o) +cxx_objects=$(addprefix $(BUILDDIR)/, read_slha_cpp.o) +ifeq ($(GPUCC),) + cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_SMEFTsim_topU3l_MwScheme_UFO_cpp.o) else - cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_SMEFTsim_topU3l_MwScheme_UFO.o) + gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_SMEFTsim_topU3l_MwScheme_UFO_$(GPUSUFFIX).o) endif # Target (and build rules): common (src) library -ifneq ($(GPUCC),) -$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) - @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) -else +ifeq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi $(CXX) -shared -o $@ $(cxx_objects) $(LDFLAGS) +else +$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) + @if [ ! 
-d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi + $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt b/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt index 8a4609634c..ad5f437053 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt +++ b/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt @@ -77,7 +77,7 @@ INFO: load vertices DEBUG: MG5 converter defines FFFF26 to Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjP(-5,1)*ProjP(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjP(-5,3)*ProjP(-3,1) + Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjM(-5,1)*ProjM(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjM(-5,3)*ProjM(-3,1)  DEBUG: MG5 converter defines FFFF27 to ProjP(2,1)*ProjP(4,3) + ProjM(2,1)*ProjM(4,3)  DEBUG: MG5 converter defines FFFF112 to ProjM(2,3)*ProjM(4,1) + ProjP(2,3)*ProjP(4,1)  -DEBUG: model prefixing takes 0.14406657218933105  +DEBUG: model prefixing takes 0.14435124397277832  INFO: Change particles name to pass to MG5 convention Defined multiparticle p = g u c d s u~ c~ d~ s~ Defined multiparticle j = g u c d s u~ c~ d~ s~ @@ -92,7 +92,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED+99*SMHLOOP+99*NP+99*NPshifts+99*NPprop+99*NPcpv+NPcbb+NPcbB+NPcbBB+NPcbd1+NPcbd8+NPcbe+NPcbG+NPcbH+NPcbj1+NPcbj8+NPcbl+NPcbu1+NPcbu8+NPcbW+NPcdB+NPcdd1+NPcdd8+NPcdG+NPcdH+NPcdW+NPceB+NPced+NPcee+NPceH+NPceu+NPceW+NPcG+NPcGtil+NPcH+NPcHB+NPcHbox+NPcHbq+NPcHBtil+NPcHd+NPcHDD+NPcHe+NPcHG+NPcHGtil+NPcHj1+NPcHj3+NPcHl1+NPcHl3+NPcHQ1+NPcHQ3+NPcHt+NPcHtb+NPcHu+NPcHud+NPcHW+NPcHWB+NPcHWBtil+NPcHWtil+NPcjd1+NPcjd8+NPcje+NPcjj11+NPcjj18+NPcjj31+NPcjj38+NPcjQbd1+NPcjQbd8+NPcjQtu1+NPcjQtu8+NPcjtQd1+NPcjtQd8+NPcju1+NPcju8+NPcjujd1+NPcjujd11+NPcjujd8+NPcjujd81+NPcjuQb1+NPcjuQb8+NPcld+NPcle+NPclebQ+NPcledj+NPcleju1+NPcleju3+NPcleQt1+NPcleQt3+NPclj1+NPclj3+NPcll+NPcll1+NPclu+NPcQb1+NPcQb8+NPcQd1+NPcQd8+NPcQe+NPcQj11+NPcQj18+NPcQj31+NPcQj38+NPcQl1+NPcQl3+NPcQQ1+NPcQQ8+NPcQt1+NPcQt8+NPcQtjd1+NPcQtjd8+NPcQtQb1+NPcQtQb8+NPcQu1+NPcQu8+NPcQujb1+NPcQujb8+NPctB+NPctb1+NPctb8+NPctd1+NPctd8+NPcte+NPctG+NPctH+NPctj1+NPctj8+NPctl+NPctt+NPctu1+NPctu8+NPctW+NPcuB+NPcud1+NPcud8+NPcuG+NPcuH+NPcutbd1+NPcutbd8+NPcuu1+NPcuu8+NPcuW+NPcW+NPcWtil+NPQjujb8 INFO: Trying process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Process has 72 diagrams -1 processes with 72 diagrams generated in 3.846 s +1 processes with 72 diagrams generated in 3.831 s Total: 1 processes with 72 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt Load PLUGIN.CUDACPP_OUTPUT @@ -115,7 +115,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory 
/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/. -Generated helas calls for 1 subprocesses (72 diagrams) in 0.195 s +Generated helas calls for 1 subprocesses (72 diagrams) in 0.193 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines @@ -123,7 +123,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 5 routines in 0.335 s +ALOHA: aloha creates 5 routines in 0.327 s VVV5 VVV5 FFV1 @@ -143,7 +143,7 @@ INFO: Created files Parameters_SMEFTsim_topU3l_MwScheme_UFO.h and Parameters_SME INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. quit -real 0m5.291s -user 0m5.196s -sys 0m0.076s -Code generation completed in 5 seconds +real 0m5.274s +user 0m5.174s +sys 0m0.072s +Code generation completed in 6 seconds diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk index 50ce6bb9c8..89da34009a 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk @@ -54,65 +54,6 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA/HIP - -INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here - -# Dependency on src directory -MG5AMC_COMMONLIB = mg5amc_common -LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -INCFLAGS += -I../../src - -# Compiler-specific googletest build directory (#125 and #738) -ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) - override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) - override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) - override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else - override CXXNAME = unknown -endif -###$(info CXXNAME=$(CXXNAME)) -override CXXNAMESUFFIX = _$(CXXNAME) - -# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) -export CXXNAMESUFFIX - -# Dependency on test directory -# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) -# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) -###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation -###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation -TESTDIRCOMMON = ../../../../../test -TESTDIRLOCAL = ../../test -ifneq ($(wildcard $(GTEST_ROOT)),) - TESTDIR = -else ifneq ($(LOCALGTEST),) - TESTDIR=$(TESTDIRLOCAL) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) - TESTDIR = $(TESTDIRCOMMON) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else - TESTDIR = -endif -ifneq ($(GTEST_ROOT),) - GTESTLIBDIR = $(GTEST_ROOT)/lib64/ - GTESTLIBS = 
$(GTESTLIBDIR)/libgtest.a - GTESTINC = -I$(GTEST_ROOT)/include -else - GTESTLIBDIR = - GTESTLIBS = - GTESTINC = -endif -###$(info GTEST_ROOT = $(GTEST_ROOT)) -###$(info LOCALGTEST = $(LOCALGTEST)) -###$(info TESTDIR = $(TESTDIR)) - -#------------------------------------------------------------------------------- - #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available: eventually use native instead?) @@ -340,6 +281,69 @@ endif #------------------------------------------------------------------------------- +#=== Configure common compiler flags for C++ and CUDA/HIP + +INCFLAGS = -I. +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here + +# Dependency on src directory +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif +LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +INCFLAGS += -I../../src + +# Compiler-specific googletest build directory (#125 and #738) +ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) + override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) + override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else + override CXXNAME = unknown +endif +###$(info CXXNAME=$(CXXNAME)) +override CXXNAMESUFFIX = _$(CXXNAME) + +# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) +export CXXNAMESUFFIX + +# Dependency on test directory +# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) +# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an 
external or local gtest) +###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation +###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation +TESTDIRCOMMON = ../../../../../test +TESTDIRLOCAL = ../../test +ifneq ($(wildcard $(GTEST_ROOT)),) + TESTDIR = +else ifneq ($(LOCALGTEST),) + TESTDIR=$(TESTDIRLOCAL) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) + TESTDIR = $(TESTDIRCOMMON) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else + TESTDIR = +endif +ifneq ($(GTEST_ROOT),) + GTESTLIBDIR = $(GTEST_ROOT)/lib64/ + GTESTLIBS = $(GTESTLIBDIR)/libgtest.a + GTESTINC = -I$(GTEST_ROOT)/include +else + GTESTLIBDIR = + GTESTLIBS = + GTESTINC = +endif +###$(info GTEST_ROOT = $(GTEST_ROOT)) +###$(info LOCALGTEST = $(LOCALGTEST)) +###$(info TESTDIR = $(TESTDIR)) + +#------------------------------------------------------------------------------- + #=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) @@ -606,27 +610,25 @@ override RUNTIME = #=== Makefile TARGETS and build rules below #=============================================================================== -cxx_main=$(BUILDDIR)/check.exe -fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(GPUCC),) - gpu_main=$(BUILDDIR)/gcheck.exe - fgpu_main=$(BUILDDIR)/fgcheck.exe +ifeq ($(GPUCC),) + cxx_checkmain=$(BUILDDIR)/check_cpp.exe + cxx_fcheckmain=$(BUILDDIR)/fcheck_cpp.exe + cxx_testmain=$(BUILDDIR)/runTest_cpp.exe else - gpu_main= - fgpu_main= + gpu_checkmain=$(BUILDDIR)/check_$(GPUSUFFIX).exe + gpu_fcheckmain=$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe + gpu_testmain=$(BUILDDIR)/runTest_$(GPUSUFFIX).exe endif -testmain=$(BUILDDIR)/runTest.exe - # Explicitly define the default goal (this is not necessary as it is the first target, which is 
implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) # First target (default goal) -ifneq ($(GPUCC),) -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_main) $(fgpu_main) $(if $(GTESTLIBS),$(testmain)) +ifeq ($(GPUCC),) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_checkmain) $(cxx_fcheckmain) $(if $(GTESTLIBS),$(cxx_testmain)) else -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_main) $(fcxx_main) $(if $(GTESTLIBS),$(testmain)) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_checkmain) $(gpu_fcheckmain) $(if $(GTESTLIBS),$(gpu_testmain)) endif # Target (and build options): debug @@ -643,30 +645,30 @@ $(BUILDDIR)/.build.$(TAG): @if [ "$(oldtagsb)" != "" ]; then echo "Cannot build for tag=$(TAG) as old builds exist for other tags:"; echo " $(oldtagsb)"; echo "Please run 'make clean' first\nIf 'make clean' is not enough: run 'make clean USEBUILDDIR=1 AVX=$(AVX) FPTYPE=$(FPTYPE)' or 'make cleanall'"; exit 1; fi @touch $(BUILDDIR)/.build.$(TAG) -# Apply special build flags only to CrossSectionKernel[_$(GPUSUFFIX)].o (no fast math, see #117 and #516) +# Apply special build flags only to CrossSectionKernel_.o (no fast math, see #117 and #516) # Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math endif -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o (NVTX in timermap.h, #679) -$(BUILDDIR)/check_sa.o: CXXFLAGS += 
$(USE_NVTX) $(CUDA_INC) +# Apply special build flags only to check_sa_.o (NVTX in timermap.h, #679) +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o and (Cu|Hip)randRandomNumberKernel[_$(GPUSUFFIX)].o -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(RNDCXXFLAGS) +# Apply special build flags only to check_sa_.o and (Cu|Hip)randRandomNumberKernel_.o +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) ifeq ($(HASCURAND),hasCurand) # curand headers, #679 -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUDA_INC) +$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(CUDA_INC) endif ifeq ($(HASHIPRAND),hasHiprand) # hiprand headers -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(HIP_INC) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(HIP_INC) endif # Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) @@ -679,21 +681,21 @@ endif # Avoid clang warning "overriding '-ffp-contract=fast' option with '-ffp-contract=on'" (#516) # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) -###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -Wno-overriding-t-option ###ifneq ($(GPUCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option ###endif ###endif #### Apply special build flags only to CPPProcess.o (-flto) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += -flto +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += -flto #### Apply special build flags only to CPPProcess.o (AVXFLAGS) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += $(AVXFLAGS) +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += $(AVXFLAGS) # Generic target and build rules: objects from C++ compilation # (NB do not include CUDA_INC here! add it only for NVTX or curand #679) -$(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -718,8 +720,8 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o -cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) @@ -728,8 +730,8 @@ gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/ endif # Target (and build rules): C++ and CUDA/HIP shared libraries -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge.o -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge_cpp.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) @@ -756,63 +758,63 @@ endif #------------------------------------------------------------------------------- # Target (and build rules): C++ and CUDA/HIP standalone executables -$(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o - $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread 
$(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o $(RNDLIBFLAGS) +$(cxx_checkmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_checkmain): $(BUILDDIR)/check_sa_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o + $(CXX) -o $@ $(BUILDDIR)/check_sa_cpp.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o $(RNDLIBFLAGS) ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(gpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(gpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_checkmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_checkmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(gpu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(gpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(gpu_main): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o +$(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) 
$(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif #------------------------------------------------------------------------------- # Generic target and build rules: objects from Fortran compilation -$(BUILDDIR)/%.o : %.f *.inc +$(BUILDDIR)/%_fortran.o : %.f *.inc @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(FC) -I. -c $< -o $@ # Generic target and build rules: objects from Fortran compilation -###$(BUILDDIR)/%.o : %.f *.inc +###$(BUILDDIR)/%_fortran.o : %.f *.inc ### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi ### @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi ### $(FC) -I. -I$(INCDIR) -c $< -o $@ # Target (and build rules): Fortran standalone executables -###$(BUILDDIR)/fcheck_sa.o : $(INCDIR)/fbridge.inc +###$(BUILDDIR)/fcheck_sa_fortran.o : $(INCDIR)/fbridge.inc ifeq ($(UNAME_S),Darwin) -$(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(cxx_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +$(cxx_fcheckmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used 
#802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ else - $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) + $(CXX) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) endif ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fgpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(fgpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_fcheckmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_fcheckmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) -$(fgpu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fgpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fgpu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) +$(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link 
fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else - $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif endif @@ -820,83 +822,94 @@ endif # Target (and build rules): test objects and test executable ifeq ($(GPUCC),) -$(BUILDDIR)/testxxx.o: $(GTESTLIBS) -$(BUILDDIR)/testxxx.o: INCFLAGS += $(GTESTINC) -$(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions +$(BUILDDIR)/testxxx_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testxxx_cpp.o: INCFLAGS += $(GTESTINC) +$(BUILDDIR)/testxxx_cpp.o: testxxx_cc_ref.txt +$(cxx_testmain): $(BUILDDIR)/testxxx_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testxxx_cpp.o # Comment out this line to skip the C++ test of xxx functions else $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions +$(gpu_testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += 
$(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions endif ifneq ($(UNAME_S),Darwin) # Disable testmisc on Darwin (workaround for issue #838) ifeq ($(GPUCC),) -$(BUILDDIR)/testmisc.o: $(GTESTLIBS) -$(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests +$(BUILDDIR)/testmisc_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testmisc_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/testmisc_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testmisc_cpp.o # Comment out this line to skip the C++ miscellaneous tests else $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP miscellaneous tests +$(gpu_testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP miscellaneous tests endif endif ifeq ($(GPUCC),) -$(BUILDDIR)/runTest.o: $(GTESTLIBS) -$(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/runTest.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o +$(BUILDDIR)/runTest_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/runTest_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/runTest_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/runTest_cpp.o else $(BUILDDIR)/runTest_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/runTest_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to 
`__svml_cos4_l9') +$(gpu_testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o endif -$(testmain): $(GTESTLIBS) -$(testmain): INCFLAGS += $(GTESTINC) -$(testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest -###$(testmain): LIBFLAGS += -lgtest_main # no longer necessary since we added main() to testxxx.cc +ifeq ($(GPUCC),) +$(cxx_testmain): $(GTESTLIBS) +$(cxx_testmain): INCFLAGS += $(GTESTINC) +$(cxx_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +else +$(gpu_testmain): $(GTESTLIBS) +$(gpu_testmain): INCFLAGS += $(GTESTINC) +$(gpu_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +endif +ifeq ($(GPUCC),) # if at all, OMP is used only in CXX builds (not in GPU builds) ifneq ($(OMPFLAGS),) ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -$(testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) +$(cxx_testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -$(testmain): LIBFLAGS += -L $(shell 
dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 +$(cxx_testmain): LIBFLAGS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 ###else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -###$(testmain): LIBFLAGS += ???? # OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) +###$(cxx_testmain): LIBFLAGS += ???? # OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) else -$(testmain): LIBFLAGS += -lgomp +$(cxx_testmain): LIBFLAGS += -lgomp +endif endif endif # Test quadmath in testmisc.cc tests for constexpr_math #627 -###$(testmain): LIBFLAGS += -lquadmath +###ifeq ($(GPUCC),) +###$(cxx_testmain): LIBFLAGS += -lquadmath +###else +###$(gpu_testmain): LIBFLAGS += -lquadmath +###endif # Bypass std::filesystem completely to ease portability on LUMI #803 -#ifneq ($(findstring hipcc,$(GPUCC)),) -#$(testmain): LIBFLAGS += -lstdc++fs -#endif +###ifneq ($(findstring hipcc,$(GPUCC)),) +###$(gpu_testmain): LIBFLAGS += -lstdc++fs +###endif -ifeq ($(GPUCC),) # link only runTest.o -$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) +ifeq ($(GPUCC),) # link only runTest_cpp.o +$(cxx_testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest.o and runTest_$(GPUSUFFIX).o) -$(testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) +else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and 
runTest_$(GPUSUFFIX).o) +$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else @@ -1051,51 +1064,50 @@ endif #------------------------------------------------------------------------------- -# Target: check (run the C++ test executable) -# [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -# [FIXME: SHOULD CHANGE THE TARGET NAME "check" THAT HAS NOTHING TO DO WITH "check.exe"] -ifneq ($(GPUCC),) -check: runTest cmpFGcheck -else -check: runTest cmpFcheck -endif +# Target: 'make test' (execute runTest.exe, and compare check.exe with fcheck.exe) +# [NB: THIS IS WHAT IS TESTED IN THE GITHUB CI!] +# [NB: This used to be called 'make check' but the name has been changed as this has nothing to do with 'check.exe'] +test: runTest cmpFcheck -# Target: runTest (run the C++ test executable runTest.exe) +# Target: runTest (run the C++ or CUDA/HIP test executable runTest.exe) runTest: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/runTest.exe +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/runTest_cpp.exe +else + $(RUNTIME) $(BUILDDIR)/runTest_$(GPUSUFFIX).exe +endif -# Target: runCheck (run the C++ standalone executable check.exe, with a small number of events) +# Target: runCheck (run the C++ or CUDA/HIP standalone executable check.exe, with a small number of events) runCheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 - -# Target: runGcheck (run the CUDA/HIP standalone executable gcheck.exe, with a small number of events) -runGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/gcheck.exe -p 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/check_cpp.exe -p 2 32 2 +else + $(RUNTIME) 
$(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +endif -# Target: runFcheck (run the Fortran standalone executable - with C++ MEs - fcheck.exe, with a small number of events) +# Target: runFcheck (run the Fortran standalone executable - with C++ or CUDA/HIP MEs - fcheck.exe, with a small number of events) runFcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 - -# Target: runFGcheck (run the Fortran standalone executable - with CUDA/HIP MEs - fgcheck.exe, with a small number of events) -runFGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 +else + $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 +endif -# Target: cmpFcheck (compare ME results from the C++ and Fortran with C++ MEs standalone executables, with a small number of events) +# Target: cmpFcheck (compare ME results from the C++/CUDA/HIP and Fortran with C++/CUDA/HIP MEs standalone executables, with a small number of events) cmpFcheck: all.$(TAG) @echo - @echo "$(BUILDDIR)/check.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi - -# Target: cmpFGcheck (compare ME results from the CUDA/HIP and Fortran with CUDA/HIP MEs standalone executables, with a small number of events) -cmpFGcheck: all.$(TAG) - @echo - @echo "$(BUILDDIR)/gcheck.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fgcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +ifeq ($(GPUCC),) + @echo "$(BUILDDIR)/check_cpp.exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_cpp.exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_cpp.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +else + @echo "$(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/GPU) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +endif # Target: cuda-memcheck (run the CUDA standalone executable gcheck.exe with a small number of events through cuda-memcheck) cuda-memcheck: all.$(TAG) - $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/gcheck.exe -p 2 32 2 + $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2
#------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/cudacpp_src.mk b/epochX/cudacpp/smeft_gg_tttt.sa/src/cudacpp_src.mk index 2a3a9f3698..995525c5ca 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/cudacpp_src.mk @@ -98,7 +98,11 @@ endif # NB1: there are no CUDA targets in src as we avoid RDC! # NB2: CUDA includes for curand.h are no longer needed in the C++ code anywhere in src! -MG5AMC_COMMONLIB = mg5amc_common +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif # Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) @@ -126,7 +130,7 @@ $(LIBDIR)/.build.$(TAG): #------------------------------------------------------------------------------- # Generic target and build rules: objects from C++ compilation -$(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -139,22 +143,22 @@ endif #------------------------------------------------------------------------------- -cxx_objects=$(addprefix $(BUILDDIR)/, read_slha.o) -ifneq ($(GPUCC),) - gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_SMEFTsim_topU3l_MwScheme_UFO_$(GPUSUFFIX).o) +cxx_objects=$(addprefix $(BUILDDIR)/, read_slha_cpp.o) +ifeq ($(GPUCC),) + cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_SMEFTsim_topU3l_MwScheme_UFO_cpp.o) else - cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_SMEFTsim_topU3l_MwScheme_UFO.o) + gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_SMEFTsim_topU3l_MwScheme_UFO_$(GPUSUFFIX).o) endif # Target (and build rules): common (src) library -ifneq ($(GPUCC),) -$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) - @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) -else +ifeq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi $(CXX) -shared -o $@ $(cxx_objects) $(LDFLAGS) +else +$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) + @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi + $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt index 67e6cba085..2b28aa829b 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt @@ -554,7 +554,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Process has 6 diagrams -1 processes with 6 diagrams generated in 0.126 s +1 processes with 6 diagrams generated in 0.131 s Total: 1 processes with 6 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_t1t1 --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -577,7 +577,7 @@ INFO: Generating Helas calls for process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t1 t1~ @1 INFO: Creating files in directory P1_gg_t1t1x DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -592,19 +592,19 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_t1t1x -Generated helas calls for 1 subprocesses (6 diagrams) in 0.008 s -Wrote files for 16 helas calls in 0.114 s +Generated helas calls for 1 subprocesses (6 diagrams) in 0.009 s +Wrote files for 16 helas calls in 0.117 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 3 routines in 0.193 s +ALOHA: aloha creates 3 routines in 0.195 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 6 
routines in 0.188 s +ALOHA: aloha creates 6 routines in 0.192 s VVV1 VSS1 VSS1 @@ -645,9 +645,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.868s -user 0m2.619s -sys 0m0.249s +real 0m2.918s +user 0m2.669s +sys 0m0.245s Code generation completed in 3 seconds ************************************************************ * * diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk index 50ce6bb9c8..89da34009a 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk @@ -54,65 +54,6 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA/HIP - -INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here - -# Dependency on src directory -MG5AMC_COMMONLIB = mg5amc_common -LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -INCFLAGS += -I../../src - -# Compiler-specific googletest build directory (#125 and #738) -ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) - override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) - override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) - override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else - override CXXNAME = unknown -endif -###$(info CXXNAME=$(CXXNAME)) -override CXXNAMESUFFIX = _$(CXXNAME) - -# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) -export CXXNAMESUFFIX - -# Dependency on test directory -# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an 
external or local gtest) -# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) -###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation -###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation -TESTDIRCOMMON = ../../../../../test -TESTDIRLOCAL = ../../test -ifneq ($(wildcard $(GTEST_ROOT)),) - TESTDIR = -else ifneq ($(LOCALGTEST),) - TESTDIR=$(TESTDIRLOCAL) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) - TESTDIR = $(TESTDIRCOMMON) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else - TESTDIR = -endif -ifneq ($(GTEST_ROOT),) - GTESTLIBDIR = $(GTEST_ROOT)/lib64/ - GTESTLIBS = $(GTESTLIBDIR)/libgtest.a - GTESTINC = -I$(GTEST_ROOT)/include -else - GTESTLIBDIR = - GTESTLIBS = - GTESTINC = -endif -###$(info GTEST_ROOT = $(GTEST_ROOT)) -###$(info LOCALGTEST = $(LOCALGTEST)) -###$(info TESTDIR = $(TESTDIR)) - -#------------------------------------------------------------------------------- - #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available: eventually use native instead?) @@ -340,6 +281,69 @@ endif #------------------------------------------------------------------------------- +#=== Configure common compiler flags for C++ and CUDA/HIP + +INCFLAGS = -I. 
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here + +# Dependency on src directory +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif +LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +INCFLAGS += -I../../src + +# Compiler-specific googletest build directory (#125 and #738) +ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) + override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) + override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else + override CXXNAME = unknown +endif +###$(info CXXNAME=$(CXXNAME)) +override CXXNAMESUFFIX = _$(CXXNAME) + +# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) +export CXXNAMESUFFIX + +# Dependency on test directory +# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) +# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) +###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation +###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation +TESTDIRCOMMON = ../../../../../test +TESTDIRLOCAL = ../../test +ifneq ($(wildcard $(GTEST_ROOT)),) + TESTDIR = +else ifneq ($(LOCALGTEST),) + TESTDIR=$(TESTDIRLOCAL) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) + TESTDIR = $(TESTDIRCOMMON) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else + TESTDIR = +endif +ifneq 
($(GTEST_ROOT),) + GTESTLIBDIR = $(GTEST_ROOT)/lib64/ + GTESTLIBS = $(GTESTLIBDIR)/libgtest.a + GTESTINC = -I$(GTEST_ROOT)/include +else + GTESTLIBDIR = + GTESTLIBS = + GTESTINC = +endif +###$(info GTEST_ROOT = $(GTEST_ROOT)) +###$(info LOCALGTEST = $(LOCALGTEST)) +###$(info TESTDIR = $(TESTDIR)) + +#------------------------------------------------------------------------------- + #=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) @@ -606,27 +610,25 @@ override RUNTIME = #=== Makefile TARGETS and build rules below #=============================================================================== -cxx_main=$(BUILDDIR)/check.exe -fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(GPUCC),) - gpu_main=$(BUILDDIR)/gcheck.exe - fgpu_main=$(BUILDDIR)/fgcheck.exe +ifeq ($(GPUCC),) + cxx_checkmain=$(BUILDDIR)/check_cpp.exe + cxx_fcheckmain=$(BUILDDIR)/fcheck_cpp.exe + cxx_testmain=$(BUILDDIR)/runTest_cpp.exe else - gpu_main= - fgpu_main= + gpu_checkmain=$(BUILDDIR)/check_$(GPUSUFFIX).exe + gpu_fcheckmain=$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe + gpu_testmain=$(BUILDDIR)/runTest_$(GPUSUFFIX).exe endif -testmain=$(BUILDDIR)/runTest.exe - # Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) # First target (default goal) -ifneq ($(GPUCC),) -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_main) $(fgpu_main) $(if $(GTESTLIBS),$(testmain)) +ifeq ($(GPUCC),) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_checkmain) $(cxx_fcheckmain) $(if $(GTESTLIBS),$(cxx_testmain)) else -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_main) $(fcxx_main) $(if $(GTESTLIBS),$(testmain)) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_checkmain) $(gpu_fcheckmain) $(if $(GTESTLIBS),$(gpu_testmain)) endif # Target (and 
build options): debug @@ -643,30 +645,30 @@ $(BUILDDIR)/.build.$(TAG): @if [ "$(oldtagsb)" != "" ]; then echo "Cannot build for tag=$(TAG) as old builds exist for other tags:"; echo " $(oldtagsb)"; echo "Please run 'make clean' first\nIf 'make clean' is not enough: run 'make clean USEBUILDDIR=1 AVX=$(AVX) FPTYPE=$(FPTYPE)' or 'make cleanall'"; exit 1; fi @touch $(BUILDDIR)/.build.$(TAG) -# Apply special build flags only to CrossSectionKernel[_$(GPUSUFFIX)].o (no fast math, see #117 and #516) +# Apply special build flags only to CrossSectionKernel_.o (no fast math, see #117 and #516) # Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math endif -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o (NVTX in timermap.h, #679) -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) +# Apply special build flags only to check_sa_.o (NVTX in timermap.h, #679) +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o and (Cu|Hip)randRandomNumberKernel[_$(GPUSUFFIX)].o -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(RNDCXXFLAGS) +# Apply special build flags only to check_sa_.o and (Cu|Hip)randRandomNumberKernel_.o +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) 
+$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) ifeq ($(HASCURAND),hasCurand) # curand headers, #679 -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUDA_INC) +$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(CUDA_INC) endif ifeq ($(HASHIPRAND),hasHiprand) # hiprand headers -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(HIP_INC) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(HIP_INC) endif # Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) @@ -679,21 +681,21 @@ endif # Avoid clang warning "overriding '-ffp-contract=fast' option with '-ffp-contract=on'" (#516) # This patch does remove the warning, but I prefer to keep it disabled for the moment... ###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) -###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -Wno-overriding-t-option ###ifneq ($(GPUCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option ###endif ###endif #### Apply special build flags only to CPPProcess.o (-flto) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += -flto +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += -flto #### Apply special build flags only to CPPProcess.o (AVXFLAGS) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += $(AVXFLAGS) +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += $(AVXFLAGS) # Generic target and build rules: objects from C++ compilation # (NB do not include CUDA_INC here! 
add it only for NVTX or curand #679) -$(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -718,8 +720,8 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o -cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) @@ -728,8 +730,8 @@ gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/ endif # Target (and build rules): C++ and CUDA/HIP shared libraries -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge.o -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge_cpp.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) @@ -756,63 +758,63 @@ endif #------------------------------------------------------------------------------- # Target (and build rules): C++ and CUDA/HIP standalone executables -$(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cxx_main): $(BUILDDIR)/check_sa.o 
$(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o - $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o $(RNDLIBFLAGS) +$(cxx_checkmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_checkmain): $(BUILDDIR)/check_sa_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o + $(CXX) -o $@ $(BUILDDIR)/check_sa_cpp.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o $(RNDLIBFLAGS) ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(gpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(gpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_checkmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_checkmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(gpu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(gpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(gpu_main): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o 
+$(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif #------------------------------------------------------------------------------- # Generic target and build rules: objects from Fortran compilation -$(BUILDDIR)/%.o : %.f *.inc +$(BUILDDIR)/%_fortran.o : %.f *.inc @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(FC) -I. -c $< -o $@ # Generic target and build rules: objects from Fortran compilation -###$(BUILDDIR)/%.o : %.f *.inc +###$(BUILDDIR)/%_fortran.o : %.f *.inc ### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi ### @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi ### $(FC) -I. 
-I$(INCDIR) -c $< -o $@ # Target (and build rules): Fortran standalone executables -###$(BUILDDIR)/fcheck_sa.o : $(INCDIR)/fbridge.inc +###$(BUILDDIR)/fcheck_sa_fortran.o : $(INCDIR)/fbridge.inc ifeq ($(UNAME_S),Darwin) -$(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(cxx_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +$(cxx_fcheckmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ else - $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) + $(CXX) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) endif ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fgpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(fgpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_fcheckmain): LIBFLAGS += -lintlc # 
compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_fcheckmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) -$(fgpu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fgpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fgpu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) +$(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else - $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif endif @@ -820,83 +822,94 @@ endif # Target (and build rules): test objects and test executable ifeq ($(GPUCC),) -$(BUILDDIR)/testxxx.o: 
$(GTESTLIBS) -$(BUILDDIR)/testxxx.o: INCFLAGS += $(GTESTINC) -$(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions +$(BUILDDIR)/testxxx_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testxxx_cpp.o: INCFLAGS += $(GTESTINC) +$(BUILDDIR)/testxxx_cpp.o: testxxx_cc_ref.txt +$(cxx_testmain): $(BUILDDIR)/testxxx_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testxxx_cpp.o # Comment out this line to skip the C++ test of xxx functions else $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions +$(gpu_testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions endif ifneq ($(UNAME_S),Darwin) # Disable testmisc on Darwin (workaround for issue #838) ifeq ($(GPUCC),) -$(BUILDDIR)/testmisc.o: $(GTESTLIBS) -$(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests +$(BUILDDIR)/testmisc_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testmisc_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/testmisc_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testmisc_cpp.o # Comment out this line to skip the C++ miscellaneous tests else $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the 
CUDA/HIP miscellaneous tests +$(gpu_testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP miscellaneous tests endif endif ifeq ($(GPUCC),) -$(BUILDDIR)/runTest.o: $(GTESTLIBS) -$(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/runTest.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o +$(BUILDDIR)/runTest_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/runTest_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/runTest_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/runTest_cpp.o else $(BUILDDIR)/runTest_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/runTest_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o endif -$(testmain): $(GTESTLIBS) -$(testmain): INCFLAGS += $(GTESTINC) -$(testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest -###$(testmain): LIBFLAGS += -lgtest_main # no longer necessary 
since we added main() to testxxx.cc +ifeq ($(GPUCC),) +$(cxx_testmain): $(GTESTLIBS) +$(cxx_testmain): INCFLAGS += $(GTESTINC) +$(cxx_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +else +$(gpu_testmain): $(GTESTLIBS) +$(gpu_testmain): INCFLAGS += $(GTESTINC) +$(gpu_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +endif +ifeq ($(GPUCC),) # if at all, OMP is used only in CXX builds (not in GPU builds) ifneq ($(OMPFLAGS),) ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -$(testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) +$(cxx_testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -$(testmain): LIBFLAGS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 +$(cxx_testmain): LIBFLAGS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 ###else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -###$(testmain): LIBFLAGS += ???? # OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) +###$(cxx_testmain): LIBFLAGS += ???? 
# OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) else -$(testmain): LIBFLAGS += -lgomp +$(cxx_testmain): LIBFLAGS += -lgomp +endif endif endif # Test quadmath in testmisc.cc tests for constexpr_math #627 -###$(testmain): LIBFLAGS += -lquadmath +###ifeq ($(GPUCC),) +###$(cxx_testmain): LIBFLAGS += -lquadmath +###else +###$(gpu_testmain): LIBFLAGS += -lquadmath +###endif # Bypass std::filesystem completely to ease portability on LUMI #803 -#ifneq ($(findstring hipcc,$(GPUCC)),) -#$(testmain): LIBFLAGS += -lstdc++fs -#endif +###ifneq ($(findstring hipcc,$(GPUCC)),) +###$(gpu_testmain): LIBFLAGS += -lstdc++fs +###endif -ifeq ($(GPUCC),) # link only runTest.o -$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) +ifeq ($(GPUCC),) # link only runTest_cpp.o +$(cxx_testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest.o and runTest_$(GPUSUFFIX).o) -$(testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) +else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) +$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell 
dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else @@ -1051,51 +1064,50 @@ endif #------------------------------------------------------------------------------- -# Target: check (run the C++ test executable) -# [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -# [FIXME: SHOULD CHANGE THE TARGET NAME "check" THAT HAS NOTHING TO DO WITH "check.exe"] -ifneq ($(GPUCC),) -check: runTest cmpFGcheck -else -check: runTest cmpFcheck -endif +# Target: 'make test' (execute runTest.exe, and compare check.exe with fcheck.exe) +# [NB: THIS IS WHAT IS TESTED IN THE GITHUB CI!] +# [NB: This used to be called 'make check' but the name has been changed as this has nothing to do with 'check.exe'] +test: runTest cmpFcheck -# Target: runTest (run the C++ test executable runTest.exe) +# Target: runTest (run the C++ or CUDA/HIP test executable runTest.exe) runTest: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/runTest.exe +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/runTest_cpp.exe +else + $(RUNTIME) $(BUILDDIR)/runTest_$(GPUSUFFIX).exe +endif -# Target: runCheck (run the C++ standalone executable check.exe, with a small number of events) +# Target: runCheck (run the C++ or CUDA/HIP standalone executable check.exe, with a small number of events) runCheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 - -# Target: runGcheck (run the CUDA/HIP standalone executable gcheck.exe, with a small number of events) -runGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/gcheck.exe -p 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/check_cpp.exe -p 2 32 2 +else + $(RUNTIME) $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +endif -# Target: runFcheck (run the Fortran standalone executable - with C++ MEs - fcheck.exe, with a small number of events) +# Target: runFcheck (run the Fortran standalone executable - with C++ or CUDA/HIP MEs - fcheck.exe, with a small number of events) runFcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 - -# Target: runFGcheck (run the Fortran standalone 
executable - with CUDA/HIP MEs - fgcheck.exe, with a small number of events) -runFGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 +else + $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 +endif -# Target: cmpFcheck (compare ME results from the C++ and Fortran with C++ MEs standalone executables, with a small number of events) +# Target: cmpFcheck (compare ME results from the C++/CUDA/HIP and Fortran with C++/CUDA/HIP MEs standalone executables, with a small number of events) cmpFcheck: all.$(TAG) @echo - @echo "$(BUILDDIR)/check.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi - -# Target: cmpFGcheck (compare ME results from the CUDA/HIP and Fortran with CUDA/HIP MEs standalone executables, with a small number of events) -cmpFGcheck: all.$(TAG) - @echo - @echo "$(BUILDDIR)/gcheck.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fgcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! 
Fortran calculation (F77/GPU) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +ifeq ($(GPUCC),) + @echo "$(BUILDDIR)/check_cpp.exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_cpp.exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_cpp.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +else + @echo "$(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +endif # Target: cuda-memcheck (run the CUDA standalone executable gcheck.exe with a small number of events through cuda-memcheck) cuda-memcheck: all.$(TAG) - $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/gcheck.exe -p 2 32 2 + $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/makefile b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/makefile index 22517d0cc6..6a21300f7a 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/makefile +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/makefile @@ -58,13 +58,15 @@ endif LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias CUDACPP_MAKEFILE=cudacpp.mk -CUDACPP_COMMONLIB=mg5amc_common processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ifeq ($(BACKEND),cuda) +CUDACPP_COMMONLIB=mg5amc_common_cuda CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cuda else ifeq ($(BACKEND),hip) +CUDACPP_COMMONLIB=mg5amc_common_hip CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_hip else 
+CUDACPP_COMMONLIB=mg5amc_common_cpp CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cpp endif diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/src/cudacpp_src.mk b/epochX/cudacpp/susy_gg_t1t1.mad/src/cudacpp_src.mk index 74635706c1..b33b4ff2ad 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/susy_gg_t1t1.mad/src/cudacpp_src.mk @@ -98,7 +98,11 @@ endif # NB1: there are no CUDA targets in src as we avoid RDC! # NB2: CUDA includes for curand.h are no longer needed in the C++ code anywhere in src! -MG5AMC_COMMONLIB = mg5amc_common +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif # Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) @@ -126,7 +130,7 @@ $(LIBDIR)/.build.$(TAG): #------------------------------------------------------------------------------- # Generic target and build rules: objects from C++ compilation -$(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -139,22 +143,22 @@ endif #------------------------------------------------------------------------------- -cxx_objects=$(addprefix $(BUILDDIR)/, read_slha.o) -ifneq ($(GPUCC),) - gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_MSSM_SLHA2_$(GPUSUFFIX).o) +cxx_objects=$(addprefix $(BUILDDIR)/, read_slha_cpp.o) +ifeq ($(GPUCC),) + cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_MSSM_SLHA2_cpp.o) else - cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_MSSM_SLHA2.o) + gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_MSSM_SLHA2_$(GPUSUFFIX).o) endif # Target (and build rules): common (src) library -ifneq ($(GPUCC),) -$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) - @if [ ! 
-d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) -else +ifeq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi $(CXX) -shared -o $@ $(cxx_objects) $(LDFLAGS) +else +$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) + @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi + $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt index 9cf89e16b5..3531f3a8c6 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt +++ b/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt @@ -554,7 +554,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Process has 6 diagrams -1 processes with 6 diagrams generated in 0.127 s +1 processes with 6 diagrams generated in 0.130 s Total: 1 processes with 6 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 Load PLUGIN.CUDACPP_OUTPUT @@ -583,7 +583,7 @@ ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 3 routines in 0.191 s +ALOHA: aloha creates 3 routines in 0.194 s VVV1 VSS1 VSS1 @@ -599,7 +599,7 @@ INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in dire INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. 
and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. quit -real 0m1.380s -user 0m1.311s -sys 0m0.062s -Code generation completed in 2 seconds +real 0m1.427s +user 0m1.344s +sys 0m0.074s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk index 50ce6bb9c8..89da34009a 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk @@ -54,65 +54,6 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA/HIP - -INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here - -# Dependency on src directory -MG5AMC_COMMONLIB = mg5amc_common -LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -INCFLAGS += -I../../src - -# Compiler-specific googletest build directory (#125 and #738) -ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) - override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) - override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) - override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else - override CXXNAME = unknown -endif -###$(info CXXNAME=$(CXXNAME)) -override CXXNAMESUFFIX = _$(CXXNAME) - -# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) -export CXXNAMESUFFIX - -# Dependency on test directory -# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) -# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) -###GTEST_ROOT = 
/cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation -###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation -TESTDIRCOMMON = ../../../../../test -TESTDIRLOCAL = ../../test -ifneq ($(wildcard $(GTEST_ROOT)),) - TESTDIR = -else ifneq ($(LOCALGTEST),) - TESTDIR=$(TESTDIRLOCAL) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) - TESTDIR = $(TESTDIRCOMMON) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else - TESTDIR = -endif -ifneq ($(GTEST_ROOT),) - GTESTLIBDIR = $(GTEST_ROOT)/lib64/ - GTESTLIBS = $(GTESTLIBDIR)/libgtest.a - GTESTINC = -I$(GTEST_ROOT)/include -else - GTESTLIBDIR = - GTESTLIBS = - GTESTINC = -endif -###$(info GTEST_ROOT = $(GTEST_ROOT)) -###$(info LOCALGTEST = $(LOCALGTEST)) -###$(info TESTDIR = $(TESTDIR)) - -#------------------------------------------------------------------------------- - #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available: eventually use native instead?) @@ -340,6 +281,69 @@ endif #------------------------------------------------------------------------------- +#=== Configure common compiler flags for C++ and CUDA/HIP + +INCFLAGS = -I. 
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here + +# Dependency on src directory +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif +LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +INCFLAGS += -I../../src + +# Compiler-specific googletest build directory (#125 and #738) +ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) + override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) + override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else + override CXXNAME = unknown +endif +###$(info CXXNAME=$(CXXNAME)) +override CXXNAMESUFFIX = _$(CXXNAME) + +# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) +export CXXNAMESUFFIX + +# Dependency on test directory +# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) +# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) +###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation +###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation +TESTDIRCOMMON = ../../../../../test +TESTDIRLOCAL = ../../test +ifneq ($(wildcard $(GTEST_ROOT)),) + TESTDIR = +else ifneq ($(LOCALGTEST),) + TESTDIR=$(TESTDIRLOCAL) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) + TESTDIR = $(TESTDIRCOMMON) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else + TESTDIR = +endif +ifneq 
($(GTEST_ROOT),) + GTESTLIBDIR = $(GTEST_ROOT)/lib64/ + GTESTLIBS = $(GTESTLIBDIR)/libgtest.a + GTESTINC = -I$(GTEST_ROOT)/include +else + GTESTLIBDIR = + GTESTLIBS = + GTESTINC = +endif +###$(info GTEST_ROOT = $(GTEST_ROOT)) +###$(info LOCALGTEST = $(LOCALGTEST)) +###$(info TESTDIR = $(TESTDIR)) + +#------------------------------------------------------------------------------- + #=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) @@ -606,27 +610,25 @@ override RUNTIME = #=== Makefile TARGETS and build rules below #=============================================================================== -cxx_main=$(BUILDDIR)/check.exe -fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(GPUCC),) - gpu_main=$(BUILDDIR)/gcheck.exe - fgpu_main=$(BUILDDIR)/fgcheck.exe +ifeq ($(GPUCC),) + cxx_checkmain=$(BUILDDIR)/check_cpp.exe + cxx_fcheckmain=$(BUILDDIR)/fcheck_cpp.exe + cxx_testmain=$(BUILDDIR)/runTest_cpp.exe else - gpu_main= - fgpu_main= + gpu_checkmain=$(BUILDDIR)/check_$(GPUSUFFIX).exe + gpu_fcheckmain=$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe + gpu_testmain=$(BUILDDIR)/runTest_$(GPUSUFFIX).exe endif -testmain=$(BUILDDIR)/runTest.exe - # Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) # First target (default goal) -ifneq ($(GPUCC),) -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_main) $(fgpu_main) $(if $(GTESTLIBS),$(testmain)) +ifeq ($(GPUCC),) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_checkmain) $(cxx_fcheckmain) $(if $(GTESTLIBS),$(cxx_testmain)) else -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_main) $(fcxx_main) $(if $(GTESTLIBS),$(testmain)) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_checkmain) $(gpu_fcheckmain) $(if $(GTESTLIBS),$(gpu_testmain)) endif # Target (and 
build options): debug @@ -643,30 +645,30 @@ $(BUILDDIR)/.build.$(TAG): @if [ "$(oldtagsb)" != "" ]; then echo "Cannot build for tag=$(TAG) as old builds exist for other tags:"; echo " $(oldtagsb)"; echo "Please run 'make clean' first\nIf 'make clean' is not enough: run 'make clean USEBUILDDIR=1 AVX=$(AVX) FPTYPE=$(FPTYPE)' or 'make cleanall'"; exit 1; fi @touch $(BUILDDIR)/.build.$(TAG) -# Apply special build flags only to CrossSectionKernel[_$(GPUSUFFIX)].o (no fast math, see #117 and #516) +# Apply special build flags only to CrossSectionKernel_.o (no fast math, see #117 and #516) # Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math endif -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o (NVTX in timermap.h, #679) -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) +# Apply special build flags only to check_sa_.o (NVTX in timermap.h, #679) +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o and (Cu|Hip)randRandomNumberKernel[_$(GPUSUFFIX)].o -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(RNDCXXFLAGS) +# Apply special build flags only to check_sa_.o and (Cu|Hip)randRandomNumberKernel_.o +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) 
+$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) ifeq ($(HASCURAND),hasCurand) # curand headers, #679 -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUDA_INC) +$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(CUDA_INC) endif ifeq ($(HASHIPRAND),hasHiprand) # hiprand headers -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(HIP_INC) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(HIP_INC) endif # Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) @@ -679,21 +681,21 @@ endif # Avoid clang warning "overriding '-ffp-contract=fast' option with '-ffp-contract=on'" (#516) # This patch does remove the warning, but I prefer to keep it disabled for the moment... ###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) -###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -Wno-overriding-t-option ###ifneq ($(GPUCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option ###endif ###endif #### Apply special build flags only to CPPProcess.o (-flto) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += -flto +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += -flto #### Apply special build flags only to CPPProcess.o (AVXFLAGS) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += $(AVXFLAGS) +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += $(AVXFLAGS) # Generic target and build rules: objects from C++ compilation # (NB do not include CUDA_INC here! 
add it only for NVTX or curand #679) -$(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -718,8 +720,8 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o -cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) @@ -728,8 +730,8 @@ gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/ endif # Target (and build rules): C++ and CUDA/HIP shared libraries -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge.o -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge_cpp.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) @@ -756,63 +758,63 @@ endif #------------------------------------------------------------------------------- # Target (and build rules): C++ and CUDA/HIP standalone executables -$(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cxx_main): $(BUILDDIR)/check_sa.o 
$(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o - $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o $(RNDLIBFLAGS) +$(cxx_checkmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_checkmain): $(BUILDDIR)/check_sa_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o + $(CXX) -o $@ $(BUILDDIR)/check_sa_cpp.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o $(RNDLIBFLAGS) ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(gpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(gpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_checkmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_checkmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(gpu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(gpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(gpu_main): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o 
+$(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif #------------------------------------------------------------------------------- # Generic target and build rules: objects from Fortran compilation -$(BUILDDIR)/%.o : %.f *.inc +$(BUILDDIR)/%_fortran.o : %.f *.inc @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(FC) -I. -c $< -o $@ # Generic target and build rules: objects from Fortran compilation -###$(BUILDDIR)/%.o : %.f *.inc +###$(BUILDDIR)/%_fortran.o : %.f *.inc ### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi ### @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi ### $(FC) -I. 
-I$(INCDIR) -c $< -o $@ # Target (and build rules): Fortran standalone executables -###$(BUILDDIR)/fcheck_sa.o : $(INCDIR)/fbridge.inc +###$(BUILDDIR)/fcheck_sa_fortran.o : $(INCDIR)/fbridge.inc ifeq ($(UNAME_S),Darwin) -$(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(cxx_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +$(cxx_fcheckmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ else - $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) + $(CXX) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) endif ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fgpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(fgpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_fcheckmain): LIBFLAGS += -lintlc # 
compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_fcheckmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) -$(fgpu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fgpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fgpu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) +$(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else - $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif endif @@ -820,83 +822,94 @@ endif # Target (and build rules): test objects and test executable ifeq ($(GPUCC),) -$(BUILDDIR)/testxxx.o: 
$(GTESTLIBS) -$(BUILDDIR)/testxxx.o: INCFLAGS += $(GTESTINC) -$(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions +$(BUILDDIR)/testxxx_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testxxx_cpp.o: INCFLAGS += $(GTESTINC) +$(BUILDDIR)/testxxx_cpp.o: testxxx_cc_ref.txt +$(cxx_testmain): $(BUILDDIR)/testxxx_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testxxx_cpp.o # Comment out this line to skip the C++ test of xxx functions else $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions +$(gpu_testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions endif ifneq ($(UNAME_S),Darwin) # Disable testmisc on Darwin (workaround for issue #838) ifeq ($(GPUCC),) -$(BUILDDIR)/testmisc.o: $(GTESTLIBS) -$(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests +$(BUILDDIR)/testmisc_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testmisc_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/testmisc_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testmisc_cpp.o # Comment out this line to skip the C++ miscellaneous tests else $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the 
CUDA/HIP miscellaneous tests +$(gpu_testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP miscellaneous tests endif endif ifeq ($(GPUCC),) -$(BUILDDIR)/runTest.o: $(GTESTLIBS) -$(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/runTest.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o +$(BUILDDIR)/runTest_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/runTest_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/runTest_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/runTest_cpp.o else $(BUILDDIR)/runTest_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/runTest_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o endif -$(testmain): $(GTESTLIBS) -$(testmain): INCFLAGS += $(GTESTINC) -$(testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest -###$(testmain): LIBFLAGS += -lgtest_main # no longer necessary 
since we added main() to testxxx.cc +ifeq ($(GPUCC),) +$(cxx_testmain): $(GTESTLIBS) +$(cxx_testmain): INCFLAGS += $(GTESTINC) +$(cxx_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +else +$(gpu_testmain): $(GTESTLIBS) +$(gpu_testmain): INCFLAGS += $(GTESTINC) +$(gpu_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +endif +ifeq ($(GPUCC),) # if at all, OMP is used only in CXX builds (not in GPU builds) ifneq ($(OMPFLAGS),) ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -$(testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) +$(cxx_testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -$(testmain): LIBFLAGS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 +$(cxx_testmain): LIBFLAGS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 ###else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -###$(testmain): LIBFLAGS += ???? # OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) +###$(cxx_testmain): LIBFLAGS += ???? 
# OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) else -$(testmain): LIBFLAGS += -lgomp +$(cxx_testmain): LIBFLAGS += -lgomp +endif endif endif # Test quadmath in testmisc.cc tests for constexpr_math #627 -###$(testmain): LIBFLAGS += -lquadmath +###ifeq ($(GPUCC),) +###$(cxx_testmain): LIBFLAGS += -lquadmath +###else +###$(gpu_testmain): LIBFLAGS += -lquadmath +###endif # Bypass std::filesystem completely to ease portability on LUMI #803 -#ifneq ($(findstring hipcc,$(GPUCC)),) -#$(testmain): LIBFLAGS += -lstdc++fs -#endif +###ifneq ($(findstring hipcc,$(GPUCC)),) +###$(gpu_testmain): LIBFLAGS += -lstdc++fs +###endif -ifeq ($(GPUCC),) # link only runTest.o -$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) +ifeq ($(GPUCC),) # link only runTest_cpp.o +$(cxx_testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest.o and runTest_$(GPUSUFFIX).o) -$(testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) +else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) +$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell 
dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else @@ -1051,51 +1064,50 @@ endif #------------------------------------------------------------------------------- -# Target: check (run the C++ test executable) -# [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -# [FIXME: SHOULD CHANGE THE TARGET NAME "check" THAT HAS NOTHING TO DO WITH "check.exe"] -ifneq ($(GPUCC),) -check: runTest cmpFGcheck -else -check: runTest cmpFcheck -endif +# Target: 'make test' (execute runTest.exe, and compare check.exe with fcheck.exe) +# [NB: THIS IS WHAT IS TESTED IN THE GITHUB CI!] +# [NB: This used to be called 'make check' but the name has been changed as this has nothing to do with 'check.exe'] +test: runTest cmpFcheck -# Target: runTest (run the C++ test executable runTest.exe) +# Target: runTest (run the C++ or CUDA/HIP test executable runTest.exe) runTest: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/runTest.exe +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/runTest_cpp.exe +else + $(RUNTIME) $(BUILDDIR)/runTest_$(GPUSUFFIX).exe +endif -# Target: runCheck (run the C++ standalone executable check.exe, with a small number of events) +# Target: runCheck (run the C++ or CUDA/HIP standalone executable check.exe, with a small number of events) runCheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 - -# Target: runGcheck (run the CUDA/HIP standalone executable gcheck.exe, with a small number of events) -runGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/gcheck.exe -p 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/check_cpp.exe -p 2 32 2 +else + $(RUNTIME) $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +endif -# Target: runFcheck (run the Fortran standalone executable - with C++ MEs - fcheck.exe, with a small number of events) +# Target: runFcheck (run the Fortran standalone executable - with C++ or CUDA/HIP MEs - fcheck.exe, with a small number of events) runFcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 - -# Target: runFGcheck (run the Fortran standalone 
executable - with CUDA/HIP MEs - fgcheck.exe, with a small number of events) -runFGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 +else + $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 +endif -# Target: cmpFcheck (compare ME results from the C++ and Fortran with C++ MEs standalone executables, with a small number of events) +# Target: cmpFcheck (compare ME results from the C++/CUDA/HIP and Fortran with C++/CUDA/HIP MEs standalone executables, with a small number of events) cmpFcheck: all.$(TAG) @echo - @echo "$(BUILDDIR)/check.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi - -# Target: cmpFGcheck (compare ME results from the CUDA/HIP and Fortran with CUDA/HIP MEs standalone executables, with a small number of events) -cmpFGcheck: all.$(TAG) - @echo - @echo "$(BUILDDIR)/gcheck.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fgcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! 
Fortran calculation (F77/GPU) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +ifeq ($(GPUCC),) + @echo "$(BUILDDIR)/check_cpp.exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_cpp.exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_cpp.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +else + @echo "$(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/GPU) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR!
Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +endif # Target: cuda-memcheck (run the CUDA standalone executable gcheck.exe with a small number of events through cuda-memcheck) cuda-memcheck: all.$(TAG) - $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/gcheck.exe -p 2 32 2 + $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/src/cudacpp_src.mk b/epochX/cudacpp/susy_gg_t1t1.sa/src/cudacpp_src.mk index 74635706c1..b33b4ff2ad 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/susy_gg_t1t1.sa/src/cudacpp_src.mk @@ -98,7 +98,11 @@ endif # NB1: there are no CUDA targets in src as we avoid RDC! # NB2: CUDA includes for curand.h are no longer needed in the C++ code anywhere in src! 
-MG5AMC_COMMONLIB = mg5amc_common +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif # Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) @@ -126,7 +130,7 @@ $(LIBDIR)/.build.$(TAG): #------------------------------------------------------------------------------- # Generic target and build rules: objects from C++ compilation -$(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -139,22 +143,22 @@ endif #------------------------------------------------------------------------------- -cxx_objects=$(addprefix $(BUILDDIR)/, read_slha.o) -ifneq ($(GPUCC),) - gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_MSSM_SLHA2_$(GPUSUFFIX).o) +cxx_objects=$(addprefix $(BUILDDIR)/, read_slha_cpp.o) +ifeq ($(GPUCC),) + cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_MSSM_SLHA2_cpp.o) else - cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_MSSM_SLHA2.o) + gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_MSSM_SLHA2_$(GPUSUFFIX).o) endif # Target (and build rules): common (src) library -ifneq ($(GPUCC),) -$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) - @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) -else +ifeq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi $(CXX) -shared -o $@ $(cxx_objects) $(LDFLAGS) +else +$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) + @if [ ! 
-d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi + $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt index 0fd4004266..bfb96cff2a 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt @@ -554,7 +554,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.122 s +1 processes with 3 diagrams generated in 0.126 s Total: 1 processes with 3 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_tt --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -577,7 +577,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1148]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -593,16 +593,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.107 s +Wrote files for 10 helas calls in 0.110 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.141 s +ALOHA: aloha creates 2 routines in 0.145 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.137 s +ALOHA: aloha creates 4 routines in 0.141 s VVV1 FFV1 FFV1 @@ -638,9 +638,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.736s -user 0m2.505s -sys 0m0.229s +real 0m2.857s +user 0m2.538s +sys 0m0.270s Code generation completed in 3 seconds ************************************************************ * * diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk index 50ce6bb9c8..89da34009a 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk @@ -54,65 +54,6 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA/HIP - -INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here - -# Dependency on src directory -MG5AMC_COMMONLIB = mg5amc_common -LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -INCFLAGS += -I../../src - -# Compiler-specific googletest build directory (#125 and #738) -ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) - override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) - override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) - override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else - override CXXNAME = unknown -endif -###$(info CXXNAME=$(CXXNAME)) -override CXXNAMESUFFIX = _$(CXXNAME) - -# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) -export CXXNAMESUFFIX - -# Dependency on test directory -# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) -# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) -###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation -###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation -TESTDIRCOMMON = ../../../../../test -TESTDIRLOCAL = ../../test -ifneq ($(wildcard $(GTEST_ROOT)),) - TESTDIR = -else ifneq ($(LOCALGTEST),) - TESTDIR=$(TESTDIRLOCAL) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) - TESTDIR = $(TESTDIRCOMMON) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else - TESTDIR = -endif -ifneq ($(GTEST_ROOT),) - GTESTLIBDIR = $(GTEST_ROOT)/lib64/ - GTESTLIBS = 
$(GTESTLIBDIR)/libgtest.a - GTESTINC = -I$(GTEST_ROOT)/include -else - GTESTLIBDIR = - GTESTLIBS = - GTESTINC = -endif -###$(info GTEST_ROOT = $(GTEST_ROOT)) -###$(info LOCALGTEST = $(LOCALGTEST)) -###$(info TESTDIR = $(TESTDIR)) - -#------------------------------------------------------------------------------- - #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available: eventually use native instead?) @@ -340,6 +281,69 @@ endif #------------------------------------------------------------------------------- +#=== Configure common compiler flags for C++ and CUDA/HIP + +INCFLAGS = -I. +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here + +# Dependency on src directory +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif +LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +INCFLAGS += -I../../src + +# Compiler-specific googletest build directory (#125 and #738) +ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) + override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) + override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else + override CXXNAME = unknown +endif +###$(info CXXNAME=$(CXXNAME)) +override CXXNAMESUFFIX = _$(CXXNAME) + +# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) +export CXXNAMESUFFIX + +# Dependency on test directory +# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) +# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an 
external or local gtest) +###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation +###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation +TESTDIRCOMMON = ../../../../../test +TESTDIRLOCAL = ../../test +ifneq ($(wildcard $(GTEST_ROOT)),) + TESTDIR = +else ifneq ($(LOCALGTEST),) + TESTDIR=$(TESTDIRLOCAL) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) + TESTDIR = $(TESTDIRCOMMON) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else + TESTDIR = +endif +ifneq ($(GTEST_ROOT),) + GTESTLIBDIR = $(GTEST_ROOT)/lib64/ + GTESTLIBS = $(GTESTLIBDIR)/libgtest.a + GTESTINC = -I$(GTEST_ROOT)/include +else + GTESTLIBDIR = + GTESTLIBS = + GTESTINC = +endif +###$(info GTEST_ROOT = $(GTEST_ROOT)) +###$(info LOCALGTEST = $(LOCALGTEST)) +###$(info TESTDIR = $(TESTDIR)) + +#------------------------------------------------------------------------------- + #=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) @@ -606,27 +610,25 @@ override RUNTIME = #=== Makefile TARGETS and build rules below #=============================================================================== -cxx_main=$(BUILDDIR)/check.exe -fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(GPUCC),) - gpu_main=$(BUILDDIR)/gcheck.exe - fgpu_main=$(BUILDDIR)/fgcheck.exe +ifeq ($(GPUCC),) + cxx_checkmain=$(BUILDDIR)/check_cpp.exe + cxx_fcheckmain=$(BUILDDIR)/fcheck_cpp.exe + cxx_testmain=$(BUILDDIR)/runTest_cpp.exe else - gpu_main= - fgpu_main= + gpu_checkmain=$(BUILDDIR)/check_$(GPUSUFFIX).exe + gpu_fcheckmain=$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe + gpu_testmain=$(BUILDDIR)/runTest_$(GPUSUFFIX).exe endif -testmain=$(BUILDDIR)/runTest.exe - # Explicitly define the default goal (this is not necessary as it is the first target, which is 
implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) # First target (default goal) -ifneq ($(GPUCC),) -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_main) $(fgpu_main) $(if $(GTESTLIBS),$(testmain)) +ifeq ($(GPUCC),) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_checkmain) $(cxx_fcheckmain) $(if $(GTESTLIBS),$(cxx_testmain)) else -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_main) $(fcxx_main) $(if $(GTESTLIBS),$(testmain)) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_checkmain) $(gpu_fcheckmain) $(if $(GTESTLIBS),$(gpu_testmain)) endif # Target (and build options): debug @@ -643,30 +645,30 @@ $(BUILDDIR)/.build.$(TAG): @if [ "$(oldtagsb)" != "" ]; then echo "Cannot build for tag=$(TAG) as old builds exist for other tags:"; echo " $(oldtagsb)"; echo "Please run 'make clean' first\nIf 'make clean' is not enough: run 'make clean USEBUILDDIR=1 AVX=$(AVX) FPTYPE=$(FPTYPE)' or 'make cleanall'"; exit 1; fi @touch $(BUILDDIR)/.build.$(TAG) -# Apply special build flags only to CrossSectionKernel[_$(GPUSUFFIX)].o (no fast math, see #117 and #516) +# Apply special build flags only to CrossSectionKernel_.o (no fast math, see #117 and #516) # Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math endif -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o (NVTX in timermap.h, #679) -$(BUILDDIR)/check_sa.o: CXXFLAGS += 
$(USE_NVTX) $(CUDA_INC) +# Apply special build flags only to check_sa_.o (NVTX in timermap.h, #679) +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o and (Cu|Hip)randRandomNumberKernel[_$(GPUSUFFIX)].o -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(RNDCXXFLAGS) +# Apply special build flags only to check_sa_.o and (Cu|Hip)randRandomNumberKernel_.o +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) ifeq ($(HASCURAND),hasCurand) # curand headers, #679 -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUDA_INC) +$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(CUDA_INC) endif ifeq ($(HASHIPRAND),hasHiprand) # hiprand headers -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(HIP_INC) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(HIP_INC) endif # Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) @@ -679,21 +681,21 @@ endif # Avoid clang warning "overriding '-ffp-contract=fast' option with '-ffp-contract=on'" (#516) # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) -###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -Wno-overriding-t-option ###ifneq ($(GPUCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option ###endif ###endif #### Apply special build flags only to CPPProcess.o (-flto) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += -flto +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += -flto #### Apply special build flags only to CPPProcess.o (AVXFLAGS) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += $(AVXFLAGS) +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += $(AVXFLAGS) # Generic target and build rules: objects from C++ compilation # (NB do not include CUDA_INC here! add it only for NVTX or curand #679) -$(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -718,8 +720,8 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o -cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) @@ -728,8 +730,8 @@ gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/ endif # Target (and build rules): C++ and CUDA/HIP shared libraries -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge.o -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge_cpp.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) @@ -756,63 +758,63 @@ endif #------------------------------------------------------------------------------- # Target (and build rules): C++ and CUDA/HIP standalone executables -$(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o - $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread 
$(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o $(RNDLIBFLAGS) +$(cxx_checkmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_checkmain): $(BUILDDIR)/check_sa_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o + $(CXX) -o $@ $(BUILDDIR)/check_sa_cpp.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o $(RNDLIBFLAGS) ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(gpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(gpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_checkmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_checkmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(gpu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(gpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(gpu_main): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o +$(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) 
$(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif #------------------------------------------------------------------------------- # Generic target and build rules: objects from Fortran compilation -$(BUILDDIR)/%.o : %.f *.inc +$(BUILDDIR)/%_fortran.o : %.f *.inc @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(FC) -I. -c $< -o $@ # Generic target and build rules: objects from Fortran compilation -###$(BUILDDIR)/%.o : %.f *.inc +###$(BUILDDIR)/%_fortran.o : %.f *.inc ### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi ### @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi ### $(FC) -I. -I$(INCDIR) -c $< -o $@ # Target (and build rules): Fortran standalone executables -###$(BUILDDIR)/fcheck_sa.o : $(INCDIR)/fbridge.inc +###$(BUILDDIR)/fcheck_sa_fortran.o : $(INCDIR)/fbridge.inc ifeq ($(UNAME_S),Darwin) -$(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(cxx_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +$(cxx_fcheckmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used 
#802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ else - $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) + $(CXX) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) endif ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fgpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(fgpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_fcheckmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_fcheckmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) -$(fgpu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fgpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fgpu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) +$(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link 
fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else - $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif endif @@ -820,83 +822,94 @@ endif # Target (and build rules): test objects and test executable ifeq ($(GPUCC),) -$(BUILDDIR)/testxxx.o: $(GTESTLIBS) -$(BUILDDIR)/testxxx.o: INCFLAGS += $(GTESTINC) -$(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions +$(BUILDDIR)/testxxx_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testxxx_cpp.o: INCFLAGS += $(GTESTINC) +$(BUILDDIR)/testxxx_cpp.o: testxxx_cc_ref.txt +$(cxx_testmain): $(BUILDDIR)/testxxx_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testxxx_cpp.o # Comment out this line to skip the C++ test of xxx functions else $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions +$(gpu_testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += 
$(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions endif ifneq ($(UNAME_S),Darwin) # Disable testmisc on Darwin (workaround for issue #838) ifeq ($(GPUCC),) -$(BUILDDIR)/testmisc.o: $(GTESTLIBS) -$(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests +$(BUILDDIR)/testmisc_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testmisc_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/testmisc_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testmisc_cpp.o # Comment out this line to skip the C++ miscellaneous tests else $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP miscellaneous tests +$(gpu_testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP miscellaneous tests endif endif ifeq ($(GPUCC),) -$(BUILDDIR)/runTest.o: $(GTESTLIBS) -$(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/runTest.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o +$(BUILDDIR)/runTest_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/runTest_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/runTest_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/runTest_cpp.o else $(BUILDDIR)/runTest_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/runTest_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to 
`__svml_cos4_l9') +$(gpu_testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o endif -$(testmain): $(GTESTLIBS) -$(testmain): INCFLAGS += $(GTESTINC) -$(testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest -###$(testmain): LIBFLAGS += -lgtest_main # no longer necessary since we added main() to testxxx.cc +ifeq ($(GPUCC),) +$(cxx_testmain): $(GTESTLIBS) +$(cxx_testmain): INCFLAGS += $(GTESTINC) +$(cxx_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +else +$(gpu_testmain): $(GTESTLIBS) +$(gpu_testmain): INCFLAGS += $(GTESTINC) +$(gpu_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +endif +ifeq ($(GPUCC),) # if at all, OMP is used only in CXX builds (not in GPU builds) ifneq ($(OMPFLAGS),) ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -$(testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) +$(cxx_testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -$(testmain): LIBFLAGS += -L $(shell 
dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 +$(cxx_testmain): LIBFLAGS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 ###else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -###$(testmain): LIBFLAGS += ???? # OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) +###$(cxx_testmain): LIBFLAGS += ???? # OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) else -$(testmain): LIBFLAGS += -lgomp +$(cxx_testmain): LIBFLAGS += -lgomp +endif endif endif # Test quadmath in testmisc.cc tests for constexpr_math #627 -###$(testmain): LIBFLAGS += -lquadmath +###ifeq ($(GPUCC),) +###$(cxx_testmain): LIBFLAGS += -lquadmath +###else +###$(gpu_testmain): LIBFLAGS += -lquadmath +###endif # Bypass std::filesystem completely to ease portability on LUMI #803 -#ifneq ($(findstring hipcc,$(GPUCC)),) -#$(testmain): LIBFLAGS += -lstdc++fs -#endif +###ifneq ($(findstring hipcc,$(GPUCC)),) +###$(gpu_testmain): LIBFLAGS += -lstdc++fs +###endif -ifeq ($(GPUCC),) # link only runTest.o -$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) +ifeq ($(GPUCC),) # link only runTest_cpp.o +$(cxx_testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest.o and runTest_$(GPUSUFFIX).o) -$(testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) +else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and 
runTest_$(GPUSUFFIX).o) +$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else @@ -1051,51 +1064,50 @@ endif #------------------------------------------------------------------------------- -# Target: check (run the C++ test executable) -# [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -# [FIXME: SHOULD CHANGE THE TARGET NAME "check" THAT HAS NOTHING TO DO WITH "check.exe"] -ifneq ($(GPUCC),) -check: runTest cmpFGcheck -else -check: runTest cmpFcheck -endif +# Target: 'make test' (execute runTest.exe, and compare check.exe with fcheck.exe) +# [NB: THIS IS WHAT IS TESTED IN THE GITHUB CI!] +# [NB: This used to be called 'make check' but the name has been changed as this has nothing to do with 'check.exe'] +test: runTest cmpFcheck -# Target: runTest (run the C++ test executable runTest.exe) +# Target: runTest (run the C++ or CUDA/HIP test executable runTest.exe) runTest: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/runTest.exe +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/runTest_cpp.exe +else + $(RUNTIME) $(BUILDDIR)/runTest_$(GPUSUFFIX).exe +endif -# Target: runCheck (run the C++ standalone executable check.exe, with a small number of events) +# Target: runCheck (run the C++ or CUDA/HIP standalone executable check.exe, with a small number of events) runCheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 - -# Target: runGcheck (run the CUDA/HIP standalone executable gcheck.exe, with a small number of events) -runGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/gcheck.exe -p 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/check_cpp.exe -p 2 32 2 +else + $(RUNTIME) 
$(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +endif -# Target: runFcheck (run the Fortran standalone executable - with C++ MEs - fcheck.exe, with a small number of events) +# Target: runFcheck (run the Fortran standalone executable - with C++ or CUDA/HIP MEs - fcheck.exe, with a small number of events) runFcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 - -# Target: runFGcheck (run the Fortran standalone executable - with CUDA/HIP MEs - fgcheck.exe, with a small number of events) -runFGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 +else + $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 +endif -# Target: cmpFcheck (compare ME results from the C++ and Fortran with C++ MEs standalone executables, with a small number of events) +# Target: cmpFcheck (compare ME results from the C++/CUDA/HIP and Fortran with C++/CUDA/HIP MEs standalone executables, with a small number of events) cmpFcheck: all.$(TAG) @echo - @echo "$(BUILDDIR)/check.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi - -# Target: cmpFGcheck (compare ME results from the CUDA/HIP and Fortran with CUDA/HIP MEs standalone executables, with a small number of events) -cmpFGcheck: all.$(TAG) - @echo - @echo "$(BUILDDIR)/gcheck.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fgcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +ifeq ($(GPUCC),) + @echo "$(BUILDDIR)/check_cpp.exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_cpp.exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_cpp.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +else + @echo "$(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/GPU) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +endif # Target: cuda-memcheck (run the CUDA standalone executable gcheck.exe with a small number of events through cuda-memcheck) cuda-memcheck: all.$(TAG) - $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/gcheck.exe -p 2 32 2 + $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2
#------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/makefile b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/makefile index 22517d0cc6..6a21300f7a 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/makefile +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/makefile @@ -58,13 +58,15 @@ endif LINKLIBS = $(LINK_MADLOOP_LIB) $(LINK_LOOP_LIBS) -L$(LIBDIR) -ldhelas -ldsample -lmodel -lgeneric -lpdf -lcernlib $(llhapdf) -lbias CUDACPP_MAKEFILE=cudacpp.mk -CUDACPP_COMMONLIB=mg5amc_common processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ifeq ($(BACKEND),cuda) +CUDACPP_COMMONLIB=mg5amc_common_cuda CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cuda else ifeq ($(BACKEND),hip) +CUDACPP_COMMONLIB=mg5amc_common_hip CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_hip else +CUDACPP_COMMONLIB=mg5amc_common_cpp CUDACPP_BACKENDLIB=mg5amc_$(processid_short)_cpp endif diff --git a/epochX/cudacpp/susy_gg_tt.mad/src/cudacpp_src.mk b/epochX/cudacpp/susy_gg_tt.mad/src/cudacpp_src.mk index 74635706c1..b33b4ff2ad 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/susy_gg_tt.mad/src/cudacpp_src.mk @@ -98,7 +98,11 @@ endif # NB1: there are no CUDA targets in src as we avoid RDC! # NB2: CUDA includes for curand.h are no longer needed in the C++ code anywhere in src! 
-MG5AMC_COMMONLIB = mg5amc_common +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif # Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) @@ -126,7 +130,7 @@ $(LIBDIR)/.build.$(TAG): #------------------------------------------------------------------------------- # Generic target and build rules: objects from C++ compilation -$(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -139,22 +143,22 @@ endif #------------------------------------------------------------------------------- -cxx_objects=$(addprefix $(BUILDDIR)/, read_slha.o) -ifneq ($(GPUCC),) - gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_MSSM_SLHA2_$(GPUSUFFIX).o) +cxx_objects=$(addprefix $(BUILDDIR)/, read_slha_cpp.o) +ifeq ($(GPUCC),) + cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_MSSM_SLHA2_cpp.o) else - cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_MSSM_SLHA2.o) + gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_MSSM_SLHA2_$(GPUSUFFIX).o) endif # Target (and build rules): common (src) library -ifneq ($(GPUCC),) -$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) - @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) -else +ifeq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi $(CXX) -shared -o $@ $(cxx_objects) $(LDFLAGS) +else +$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) + @if [ ! 
-d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi + $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt index eb1cace7ab..09a04f791a 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F import model MSSM_SLHA2 INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.9385032653808594  +DEBUG: model prefixing takes 0.9576148986816406  INFO: Restrict model MSSM_SLHA2 with file models/MSSM_SLHA2/restrict_default.dat . INFO: Detect SLHA2 format. keeping restricted parameter in the param_card DEBUG: Simplifying conditional expressions  @@ -557,7 +557,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.112 s +1 processes with 3 diagrams generated in 0.117 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_tt Load PLUGIN.CUDACPP_OUTPUT @@ -585,7 +585,7 @@ Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.143 s +ALOHA: aloha creates 2 routines in 0.145 s VVV1 FFV1 FFV1 @@ -600,7 +600,7 @@ INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in dire INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. 
and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. quit -real 0m2.443s -user 0m2.351s -sys 0m0.073s -Code generation completed in 2 seconds +real 0m2.489s +user 0m2.409s +sys 0m0.070s +Code generation completed in 3 seconds diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk index 50ce6bb9c8..89da34009a 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk @@ -54,65 +54,6 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA/HIP - -INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here - -# Dependency on src directory -MG5AMC_COMMONLIB = mg5amc_common -LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -INCFLAGS += -I../../src - -# Compiler-specific googletest build directory (#125 and #738) -ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) - override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) - override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) - override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else - override CXXNAME = unknown -endif -###$(info CXXNAME=$(CXXNAME)) -override CXXNAMESUFFIX = _$(CXXNAME) - -# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) -export CXXNAMESUFFIX - -# Dependency on test directory -# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) -# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) -###GTEST_ROOT = 
/cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation -###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation -TESTDIRCOMMON = ../../../../../test -TESTDIRLOCAL = ../../test -ifneq ($(wildcard $(GTEST_ROOT)),) - TESTDIR = -else ifneq ($(LOCALGTEST),) - TESTDIR=$(TESTDIRLOCAL) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) - TESTDIR = $(TESTDIRCOMMON) - GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else - TESTDIR = -endif -ifneq ($(GTEST_ROOT),) - GTESTLIBDIR = $(GTEST_ROOT)/lib64/ - GTESTLIBS = $(GTESTLIBDIR)/libgtest.a - GTESTINC = -I$(GTEST_ROOT)/include -else - GTESTLIBDIR = - GTESTLIBS = - GTESTINC = -endif -###$(info GTEST_ROOT = $(GTEST_ROOT)) -###$(info LOCALGTEST = $(LOCALGTEST)) -###$(info TESTDIR = $(TESTDIR)) - -#------------------------------------------------------------------------------- - #=== Redefine BACKEND if the current value is 'cppauto' # Set the default BACKEND choice corresponding to 'cppauto' (the 'best' C++ vectorization available: eventually use native instead?) @@ -340,6 +281,69 @@ endif #------------------------------------------------------------------------------- +#=== Configure common compiler flags for C++ and CUDA/HIP + +INCFLAGS = -I. 
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here + +# Dependency on src directory +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif +LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +INCFLAGS += -I../../src + +# Compiler-specific googletest build directory (#125 and #738) +ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) + override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) + override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) + override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else + override CXXNAME = unknown +endif +###$(info CXXNAME=$(CXXNAME)) +override CXXNAMESUFFIX = _$(CXXNAME) + +# Export CXXNAMESUFFIX (so that there is no need to check/define it again in cudacpp_test.mk) +export CXXNAMESUFFIX + +# Dependency on test directory +# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) +# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) +###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation +###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation +TESTDIRCOMMON = ../../../../../test +TESTDIRLOCAL = ../../test +ifneq ($(wildcard $(GTEST_ROOT)),) + TESTDIR = +else ifneq ($(LOCALGTEST),) + TESTDIR=$(TESTDIRLOCAL) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) + TESTDIR = $(TESTDIRCOMMON) + GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else + TESTDIR = +endif +ifneq 
($(GTEST_ROOT),) + GTESTLIBDIR = $(GTEST_ROOT)/lib64/ + GTESTLIBS = $(GTESTLIBDIR)/libgtest.a + GTESTINC = -I$(GTEST_ROOT)/include +else + GTESTLIBDIR = + GTESTLIBS = + GTESTINC = +endif +###$(info GTEST_ROOT = $(GTEST_ROOT)) +###$(info LOCALGTEST = $(LOCALGTEST)) +###$(info TESTDIR = $(TESTDIR)) + +#------------------------------------------------------------------------------- + #=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) @@ -606,27 +610,25 @@ override RUNTIME = #=== Makefile TARGETS and build rules below #=============================================================================== -cxx_main=$(BUILDDIR)/check.exe -fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(GPUCC),) - gpu_main=$(BUILDDIR)/gcheck.exe - fgpu_main=$(BUILDDIR)/fgcheck.exe +ifeq ($(GPUCC),) + cxx_checkmain=$(BUILDDIR)/check_cpp.exe + cxx_fcheckmain=$(BUILDDIR)/fcheck_cpp.exe + cxx_testmain=$(BUILDDIR)/runTest_cpp.exe else - gpu_main= - fgpu_main= + gpu_checkmain=$(BUILDDIR)/check_$(GPUSUFFIX).exe + gpu_fcheckmain=$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe + gpu_testmain=$(BUILDDIR)/runTest_$(GPUSUFFIX).exe endif -testmain=$(BUILDDIR)/runTest.exe - # Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) # First target (default goal) -ifneq ($(GPUCC),) -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_main) $(fgpu_main) $(if $(GTESTLIBS),$(testmain)) +ifeq ($(GPUCC),) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_checkmain) $(cxx_fcheckmain) $(if $(GTESTLIBS),$(cxx_testmain)) else -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_main) $(fcxx_main) $(if $(GTESTLIBS),$(testmain)) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_checkmain) $(gpu_fcheckmain) $(if $(GTESTLIBS),$(gpu_testmain)) endif # Target (and 
build options): debug @@ -643,30 +645,30 @@ $(BUILDDIR)/.build.$(TAG): @if [ "$(oldtagsb)" != "" ]; then echo "Cannot build for tag=$(TAG) as old builds exist for other tags:"; echo " $(oldtagsb)"; echo "Please run 'make clean' first\nIf 'make clean' is not enough: run 'make clean USEBUILDDIR=1 AVX=$(AVX) FPTYPE=$(FPTYPE)' or 'make cleanall'"; exit 1; fi @touch $(BUILDDIR)/.build.$(TAG) -# Apply special build flags only to CrossSectionKernel[_$(GPUSUFFIX)].o (no fast math, see #117 and #516) +# Apply special build flags only to CrossSectionKernel_.o (no fast math, see #117 and #516) # Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) +$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -fno-fast-math +$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -fno-fast-math endif -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o (NVTX in timermap.h, #679) -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) +# Apply special build flags only to check_sa_.o (NVTX in timermap.h, #679) +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: CXXFLAGS += $(USE_NVTX) $(CUDA_INC) -# Apply special build flags only to check_sa[_$(GPUSUFFIX)].o and (Cu|Hip)randRandomNumberKernel[_$(GPUSUFFIX)].o -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(RNDCXXFLAGS) +# Apply special build flags only to check_sa_.o and (Cu|Hip)randRandomNumberKernel_.o +$(BUILDDIR)/check_sa_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/check_sa_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) 
+$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(RNDCXXFLAGS) $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o: GPUFLAGS += $(RNDCXXFLAGS) ifeq ($(HASCURAND),hasCurand) # curand headers, #679 -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUDA_INC) +$(BUILDDIR)/CurandRandomNumberKernel_cpp.o: CXXFLAGS += $(CUDA_INC) endif ifeq ($(HASHIPRAND),hasHiprand) # hiprand headers -$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(HIP_INC) +$(BUILDDIR)/HiprandRandomNumberKernel_cpp.o: CXXFLAGS += $(HIP_INC) endif # Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) @@ -679,21 +681,21 @@ endif # Avoid clang warning "overriding '-ffp-contract=fast' option with '-ffp-contract=on'" (#516) # This patch does remove the warning, but I prefer to keep it disabled for the moment... ###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) -###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_cpp.o: CXXFLAGS += -Wno-overriding-t-option ###ifneq ($(GPUCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option +###$(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o: GPUFLAGS += $(XCOMPILERFLAG) -Wno-overriding-t-option ###endif ###endif #### Apply special build flags only to CPPProcess.o (-flto) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += -flto +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += -flto #### Apply special build flags only to CPPProcess.o (AVXFLAGS) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += $(AVXFLAGS) +###$(BUILDDIR)/CPPProcess_cpp.o: CXXFLAGS += $(AVXFLAGS) # Generic target and build rules: objects from C++ compilation # (NB do not include CUDA_INC here! 
add it only for NVTX or curand #679) -$(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -718,8 +720,8 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o -cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) @@ -728,8 +730,8 @@ gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/ endif # Target (and build rules): C++ and CUDA/HIP shared libraries -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge.o -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge_cpp.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) @@ -756,63 +758,63 @@ endif #------------------------------------------------------------------------------- # Target (and build rules): C++ and CUDA/HIP standalone executables -$(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cxx_main): $(BUILDDIR)/check_sa.o 
$(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o - $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o $(RNDLIBFLAGS) +$(cxx_checkmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_checkmain): $(BUILDDIR)/check_sa_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o + $(CXX) -o $@ $(BUILDDIR)/check_sa_cpp.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cpp.o $(BUILDDIR)/HiprandRandomNumberKernel_cpp.o $(RNDLIBFLAGS) ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(gpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(gpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_checkmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_checkmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(gpu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(gpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(gpu_main): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o 
+$(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif #------------------------------------------------------------------------------- # Generic target and build rules: objects from Fortran compilation -$(BUILDDIR)/%.o : %.f *.inc +$(BUILDDIR)/%_fortran.o : %.f *.inc @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(FC) -I. -c $< -o $@ # Generic target and build rules: objects from Fortran compilation -###$(BUILDDIR)/%.o : %.f *.inc +###$(BUILDDIR)/%_fortran.o : %.f *.inc ### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi ### @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi ### $(FC) -I. 
-I$(INCDIR) -c $< -o $@ # Target (and build rules): Fortran standalone executables -###$(BUILDDIR)/fcheck_sa.o : $(INCDIR)/fbridge.inc +###$(BUILDDIR)/fcheck_sa_fortran.o : $(INCDIR)/fbridge.inc ifeq ($(UNAME_S),Darwin) -$(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(cxx_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +$(cxx_fcheckmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_cpp.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ else - $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) + $(CXX) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(OMPFLAGS) $(BUILDDIR)/fsampler_cpp.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) endif ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fgpu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(fgpu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_fcheckmain): LIBFLAGS += -lintlc # 
compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_fcheckmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) -$(fgpu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +$(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif -$(fgpu_main): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fgpu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) +$(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else - $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif endif @@ -820,83 +822,94 @@ endif # Target (and build rules): test objects and test executable ifeq ($(GPUCC),) -$(BUILDDIR)/testxxx.o: 
$(GTESTLIBS) -$(BUILDDIR)/testxxx.o: INCFLAGS += $(GTESTINC) -$(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions +$(BUILDDIR)/testxxx_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testxxx_cpp.o: INCFLAGS += $(GTESTINC) +$(BUILDDIR)/testxxx_cpp.o: testxxx_cc_ref.txt +$(cxx_testmain): $(BUILDDIR)/testxxx_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testxxx_cpp.o # Comment out this line to skip the C++ test of xxx functions else $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_$(GPUSUFFIX).o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions +$(gpu_testmain): $(BUILDDIR)/testxxx_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/testxxx_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP test of xxx functions endif ifneq ($(UNAME_S),Darwin) # Disable testmisc on Darwin (workaround for issue #838) ifeq ($(GPUCC),) -$(BUILDDIR)/testmisc.o: $(GTESTLIBS) -$(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests +$(BUILDDIR)/testmisc_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/testmisc_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/testmisc_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/testmisc_cpp.o # Comment out this line to skip the C++ miscellaneous tests else $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/testmisc_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the 
CUDA/HIP miscellaneous tests +$(gpu_testmain): $(BUILDDIR)/testmisc_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/testmisc_$(GPUSUFFIX).o # Comment out this line to skip the CUDA/HIP miscellaneous tests endif endif ifeq ($(GPUCC),) -$(BUILDDIR)/runTest.o: $(GTESTLIBS) -$(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/runTest.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o +$(BUILDDIR)/runTest_cpp.o: $(GTESTLIBS) +$(BUILDDIR)/runTest_cpp.o: INCFLAGS += $(GTESTINC) +$(cxx_testmain): $(BUILDDIR)/runTest_cpp.o +$(cxx_testmain): cxx_objects_exe += $(BUILDDIR)/runTest_cpp.o else $(BUILDDIR)/runTest_$(GPUSUFFIX).o: $(GTESTLIBS) $(BUILDDIR)/runTest_$(GPUSUFFIX).o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(gpu_testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(gpu_testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(gpu_testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif -$(testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o -$(testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): $(BUILDDIR)/runTest_$(GPUSUFFIX).o +$(gpu_testmain): gpu_objects_exe += $(BUILDDIR)/runTest_$(GPUSUFFIX).o endif -$(testmain): $(GTESTLIBS) -$(testmain): INCFLAGS += $(GTESTINC) -$(testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest -###$(testmain): LIBFLAGS += -lgtest_main # no longer necessary 
since we added main() to testxxx.cc +ifeq ($(GPUCC),) +$(cxx_testmain): $(GTESTLIBS) +$(cxx_testmain): INCFLAGS += $(GTESTINC) +$(cxx_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +else +$(gpu_testmain): $(GTESTLIBS) +$(gpu_testmain): INCFLAGS += $(GTESTINC) +$(gpu_testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest # adding also -lgtest_main is no longer necessary since we added main() to testxxx.cc +endif +ifeq ($(GPUCC),) # if at all, OMP is used only in CXX builds (not in GPU builds) ifneq ($(OMPFLAGS),) ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -$(testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) +$(cxx_testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -$(testmain): LIBFLAGS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 +$(cxx_testmain): LIBFLAGS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 ###else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -###$(testmain): LIBFLAGS += ???? # OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) +###$(cxx_testmain): LIBFLAGS += ???? 
# OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) else -$(testmain): LIBFLAGS += -lgomp +$(cxx_testmain): LIBFLAGS += -lgomp +endif endif endif # Test quadmath in testmisc.cc tests for constexpr_math #627 -###$(testmain): LIBFLAGS += -lquadmath +###ifeq ($(GPUCC),) +###$(cxx_testmain): LIBFLAGS += -lquadmath +###else +###$(gpu_testmain): LIBFLAGS += -lquadmath +###endif # Bypass std::filesystem completely to ease portability on LUMI #803 -#ifneq ($(findstring hipcc,$(GPUCC)),) -#$(testmain): LIBFLAGS += -lstdc++fs -#endif +###ifneq ($(findstring hipcc,$(GPUCC)),) +###$(gpu_testmain): LIBFLAGS += -lstdc++fs +###endif -ifeq ($(GPUCC),) # link only runTest.o -$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) +ifeq ($(GPUCC),) # link only runTest_cpp.o +$(cxx_testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest.o and runTest_$(GPUSUFFIX).o) -$(testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) +else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) +$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell 
dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else @@ -1051,51 +1064,50 @@ endif #------------------------------------------------------------------------------- -# Target: check (run the C++ test executable) -# [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -# [FIXME: SHOULD CHANGE THE TARGET NAME "check" THAT HAS NOTHING TO DO WITH "check.exe"] -ifneq ($(GPUCC),) -check: runTest cmpFGcheck -else -check: runTest cmpFcheck -endif +# Target: 'make test' (execute runTest.exe, and compare check.exe with fcheck.exe) +# [NB: THIS IS WHAT IS TESTED IN THE GITHUB CI!] +# [NB: This used to be called 'make check' but the name has been changed as this has nothing to do with 'check.exe'] +test: runTest cmpFcheck -# Target: runTest (run the C++ test executable runTest.exe) +# Target: runTest (run the C++ or CUDA/HIP test executable runTest.exe) runTest: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/runTest.exe +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/runTest_cpp.exe +else + $(RUNTIME) $(BUILDDIR)/runTest_$(GPUSUFFIX).exe +endif -# Target: runCheck (run the C++ standalone executable check.exe, with a small number of events) +# Target: runCheck (run the C++ or CUDA/HIP standalone executable check.exe, with a small number of events) runCheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 - -# Target: runGcheck (run the CUDA/HIP standalone executable gcheck.exe, with a small number of events) -runGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/gcheck.exe -p 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/check_cpp.exe -p 2 32 2 +else + $(RUNTIME) $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 +endif -# Target: runFcheck (run the Fortran standalone executable - with C++ MEs - fcheck.exe, with a small number of events) +# Target: runFcheck (run the Fortran standalone executable - with C++ or CUDA/HIP MEs - fcheck.exe, with a small number of events) runFcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 - -# Target: runFGcheck (run the Fortran standalone 
executable - with CUDA/HIP MEs - fgcheck.exe, with a small number of events) -runFGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 +ifeq ($(GPUCC),) + $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 +else + $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 +endif -# Target: cmpFcheck (compare ME results from the C++ and Fortran with C++ MEs standalone executables, with a small number of events) +# Target: cmpFcheck (compare ME results from the C++/CUDA/HIP and Fortran with C++/CUDA/HIP MEs standalone executables, with a small number of events) cmpFcheck: all.$(TAG) @echo - @echo "$(BUILDDIR)/check.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi - -# Target: cmpFGcheck (compare ME results from the CUDA/HIP and Fortran with CUDA/HIP MEs standalone executables, with a small number of events) -cmpFGcheck: all.$(TAG) - @echo - @echo "$(BUILDDIR)/gcheck.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fgcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! 
Fortran calculation (F77/GPU) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +ifeq ($(GPUCC),) + @echo "$(BUILDDIR)/check_cpp.exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_cpp.exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_cpp.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_cpp.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +else + @echo "$(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check_$(GPUSUFFIX).exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck_$(GPUSUFFIX).exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/GPU) = $${me1}"; echo "Avg ME (F77/GPU) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/GPU) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/GPU) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi +endif # Target: cuda-memcheck (run the CUDA standalone executable gcheck.exe with a small number of events through cuda-memcheck) cuda-memcheck: all.$(TAG) - $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/gcheck.exe -p 2 32 2 + $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/check_$(GPUSUFFIX).exe -p 2 32 2 #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/cudacpp_src.mk b/epochX/cudacpp/susy_gg_tt.sa/src/cudacpp_src.mk index 74635706c1..b33b4ff2ad 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/susy_gg_tt.sa/src/cudacpp_src.mk @@ -98,7 +98,11 @@ endif # NB1: there are no CUDA targets in src as we avoid RDC! # NB2: CUDA includes for curand.h are no longer needed in the C++ code anywhere in src! 
-MG5AMC_COMMONLIB = mg5amc_common +ifeq ($(GPUCC),) +MG5AMC_COMMONLIB = mg5amc_common_cpp +else +MG5AMC_COMMONLIB = mg5amc_common_$(GPUSUFFIX) +endif # Explicitly define the default goal (this is not necessary as it is the first target, which is implicitly the default goal) .DEFAULT_GOAL := all.$(TAG) @@ -126,7 +130,7 @@ $(LIBDIR)/.build.$(TAG): #------------------------------------------------------------------------------- # Generic target and build rules: objects from C++ compilation -$(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cpp.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -139,22 +143,22 @@ endif #------------------------------------------------------------------------------- -cxx_objects=$(addprefix $(BUILDDIR)/, read_slha.o) -ifneq ($(GPUCC),) - gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_MSSM_SLHA2_$(GPUSUFFIX).o) +cxx_objects=$(addprefix $(BUILDDIR)/, read_slha_cpp.o) +ifeq ($(GPUCC),) + cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_MSSM_SLHA2_cpp.o) else - cxx_objects+=$(addprefix $(BUILDDIR)/, Parameters_MSSM_SLHA2.o) + gpu_objects=$(addprefix $(BUILDDIR)/, Parameters_MSSM_SLHA2_$(GPUSUFFIX).o) endif # Target (and build rules): common (src) library -ifneq ($(GPUCC),) -$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) - @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) -else +ifeq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi $(CXX) -shared -o $@ $(cxx_objects) $(LDFLAGS) +else +$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(gpu_objects) + @if [ ! 
-d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi + $(GPUCC) -shared -o $@ $(cxx_objects) $(gpu_objects) $(LDFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/tmad/allTees.sh b/epochX/cudacpp/tmad/allTees.sh index 6e0d1931a6..a9479018f9 100755 --- a/epochX/cudacpp/tmad/allTees.sh +++ b/epochX/cudacpp/tmad/allTees.sh @@ -17,10 +17,10 @@ add10x="+10x" while [ "$1" != "" ]; do if [ "$1" == "-short" ]; then - short=1 # all but ggttggg + short=1 # all (possibly including bsm) but ggttggg shift elif [ "$1" == "-ggttggg" ]; then - short=-1 # only ggttggg + short=-1 # only ggttggg (implies no bsm!) shift elif [ "$1" == "-makeclean" ]; then makeclean=$1 @@ -55,7 +55,9 @@ status=$? ended1="(SM tests)\nENDED(1) AT $(date) [Status=$status]" if [ "${bsm}" != "-nobsm" ]; then - ${scrdir}/teeMadX.sh -heftggbb -susyggtt -susyggt1t1 -smeftggtttt $flts $makeclean $rmrdat $add10x + if [ "$short" != "-1" ]; then + ${scrdir}/teeMadX.sh -heftggbb -susyggtt -susyggt1t1 -smeftggtttt $flts $makeclean $rmrdat $add10x + fi fi status=$? 
ended2="(BSM tests)\nENDED(1) AT $(date) [Status=$status]" diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index bf4ac49169..41d66d8253 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -1,41 +1,41 @@ -Working directory (build): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -make USEBUILDDIR=1 BACKEND=hip +make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' OMP_NUM_THREADS= -DATE: 2024-05-15_17:55:49 +DATE: 2024-05-16_01:25:56 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -49,18 +49,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715404661518E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3798 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.5230s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5169s - [COUNTERS] Fortran MEs ( 1 ) : 0.0062s for 8192 events => throughput is 1.33E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7231s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7147s + [COUNTERS] Fortran MEs ( 1 ) : 0.0084s for 8192 events => throughput is 9.76E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -74,18 +74,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715404661518E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1340s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1281s - [COUNTERS] Fortran MEs ( 1 ) : 0.0059s for 8192 events => throughput is 1.39E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1857s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1771s + [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.54E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -99,8 +99,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x10_fortran > /tmp/valassia/output_eemumu_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/avalassi/output_eemumu_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x10_fortran > /tmp/v [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000766E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2770s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2142s - [COUNTERS] Fortran MEs ( 1 ) : 0.0627s for 90112 events => throughput is 1.44E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3949s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3024s + [COUNTERS] Fortran MEs ( 1 ) : 0.0925s for 90112 events => throughput is 9.74E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -124,22 +124,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09338 [9.3382715404661545E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1533s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1473s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0059s for 8192 events => throughput is 1.38E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1913s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1844s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0069s for 8192 events => throughput is 1.18E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382715404661532E-002) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715404661545E-002) differ by less than 3E-14 (2.220446049250313e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -157,36 +157,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09152 [9.1515602020000780E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09152 [9.1515602020000753E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2860s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2205s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0655s for 90112 events => throughput is 1.38E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3835s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3081s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0754s for 90112 events => throughput is 1.20E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000780E-002) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000753E-002) differ by less than 3E-14 (1.1102230246251565e-16) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.405897e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.192081e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.425037e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.204613e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -200,22 +200,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715404661518E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1351s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1315s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.31E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1864s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1822s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0042s for 8192 events => throughput is 1.95E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382715404661518E-002) differ by less than 3E-14 (0.0) +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715404661532E-002) differ by less than 3E-14 (0.0) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -233,8 +233,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -242,9 +242,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000753E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2553s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2164s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0390s for 90112 events => throughput is 2.31E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3602s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3125s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0476s for 90112 events => throughput is 1.89E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -255,14 +255,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000753E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.392866e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.953763e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.439421e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.009226e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -276,8 +276,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -285,13 +285,13 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1328s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1303s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0025s for 8192 events => throughput is 3.29E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1833s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1802s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0032s for 8192 events => throughput is 2.59E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382715404661532E-002) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715404661532E-002) differ by less than 3E-14 (0.0) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -309,44 +309,114 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09152 [9.1515602020000753E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09152 [9.1515602020000739E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2409s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2136s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0272s for 90112 events => throughput is 3.31E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3431s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3065s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0366s for 90112 events => throughput is 2.46E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000753E-002) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000739E-002) differ by less than 3E-14 (3.3306690738754696e-16) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.466511e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.541984e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.546981e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.616899e+06 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1591 events (found 1595 events) + [COUNTERS] PROGRAM TOTAL : 0.1841s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1810s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.69E+06 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715404661532E-002) differ by less than 3E-14 (0.0) -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09152 [9.1515602020000739E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1782 events (found 1787 events) + [COUNTERS] PROGRAM TOTAL : 0.3388s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3042s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0346s for 90112 events => throughput is 2.61E+06 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000739E-002) differ by less than 3E-14 (3.3306690738754696e-16) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.662866e+06 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.883371e+06 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -358,28 +428,28 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.hip_d_inl0_hrd0/madevent_hip < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715404661545E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.4136s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4131s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.75E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.1859s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1818s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0041s for 8192 events => throughput is 1.99E+06 events/s -*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661518E-002) and hip (9.3382715404661545E-002) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715404661532E-002) differ by less than 3E-14 (0.0) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (3-hip) EXECUTE MADEVENT_HIP x10 (create events.lhe) *** +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -391,65 +461,143 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_d_inl0_hrd0/madevent_hip < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09152 [9.1515602020000739E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1782 events (found 1787 events) + [COUNTERS] PROGRAM TOTAL : 0.3474s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3044s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0430s for 90112 events => throughput is 2.10E+06 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000739E-002) differ by less than 3E-14 (3.3306690738754696e-16) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.029340e+06 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.231218e+06 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1591 events (found 1595 events) + [COUNTERS] PROGRAM TOTAL : 0.6140s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6135s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.60E+07 events/s + +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! 
xsec from fortran (9.3382715404661532E-002) and cuda (9.3382715404661532E-002) differ by less than 3E-14 (0.0) + +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3-cuda) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000753E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.4962s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4918s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0045s for 90112 events => throughput is 2.02E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7363s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7314s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0049s for 90112 events => throughput is 1.86E+07 events/s -*** (3-hip) Compare MADEVENT_HIP x10 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (9.1515602020000766E-002) and hip (9.1515602020000753E-002) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (9.1515602020000766E-002) and cuda (9.1515602020000753E-002) differ by less than 3E-14 (1.1102230246251565e-16) -*** (3-hip) Compare MADEVENT_HIP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.10 and events.lhe.ref.10 are identical +OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.173800e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.277665e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.535624e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.916168e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.258734e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.959957e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.870473e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.493136e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.239388e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.970202e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.963945e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.040191e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.186996e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.002261e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = 
SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.568736e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.140061e+08 ) sec^-1 + +*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index 1284773939..c4c8099bbf 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -1,41 +1,41 @@ -Working directory (build): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -make USEBUILDDIR=1 BACKEND=hip +make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y 
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' OMP_NUM_THREADS= -DATE: 2024-05-15_17:56:01 +DATE: 2024-05-16_01:26:13 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -49,18 +49,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715404661518E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3798 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.5097s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5038s - [COUNTERS] Fortran MEs ( 1 ) : 0.0059s for 8192 events => throughput is 1.40E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7287s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7200s + [COUNTERS] Fortran MEs ( 1 ) : 0.0087s for 8192 events => throughput is 9.42E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -74,18 +74,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715404661518E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1360s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1301s - [COUNTERS] Fortran MEs ( 1 ) : 0.0059s for 8192 events => throughput is 1.39E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1869s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1783s + [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.57E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -99,8 +99,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x10_fortran > /tmp/valassia/output_eemumu_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/avalassi/output_eemumu_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x10_fortran > /tmp/v [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000766E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2776s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2152s - [COUNTERS] Fortran MEs ( 1 ) : 0.0624s for 90112 events => throughput is 1.44E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3902s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2987s + [COUNTERS] Fortran MEs ( 1 ) : 0.0916s for 90112 events => throughput is 9.84E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -124,22 +124,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382701684199335E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09338 [9.3382703205998396E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1382s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1330s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0052s for 8192 events => throughput is 1.58E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1903s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1836s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0067s for 8192 events => throughput is 1.22E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382701684199335E-002) differ by less than 4E-4 (1.4692721372888684e-07) +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382703205998396E-002) differ by less than 4E-4 (1.306308462512007e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -157,36 +157,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09152 [9.1515588842633111E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09152 [9.1515590123565249E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.4155s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3585s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0570s for 90112 events => throughput is 1.58E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3825s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3093s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0732s for 90112 events => throughput is 1.23E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515588842633111E-002) differ by less than 4E-4 (1.439903947186849e-07) +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515590123565249E-002) differ by less than 4E-4 (1.2999352305698153e-07) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.649401e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.260929e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.656662e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.250210e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -200,22 +200,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382719831741665E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09338 [9.3382700723828302E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1322s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1301s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0021s for 8192 events => throughput is 3.87E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1813s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1787s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.15E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382719831741665E-002) differ by less than 4E-4 (4.740791825774693e-08) +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382700723828302E-002) differ by less than 4E-4 (1.5721146218172777e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -233,36 +233,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09152 [9.1515606481761602E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09152 [9.1515587612890761E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2364s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2132s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0232s for 90112 events => throughput is 3.88E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3288s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2997s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0290s for 90112 events => throughput is 3.10E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515606481761602E-002) differ by less than 4E-4 (4.875410031246474e-08) +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515587612890761E-002) differ by less than 4E-4 (1.5742791048545257e-07) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.040179e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.206836e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.136369e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.334282e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -276,22 +276,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382719700521907E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09338 [9.3382700679354239E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1328s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1310s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0018s for 8192 events => throughput is 4.61E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1835s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1811s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0024s for 8192 events => throughput is 3.38E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382719700521907E-002) differ by less than 4E-4 (4.6002735842876064e-08) +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382700679354239E-002) differ by less than 4E-4 (1.576877179942926e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -309,44 +309,190 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09152 [9.1515606480805645E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09152 [9.1515587619408464E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2339s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2143s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0196s for 90112 events => throughput is 4.59E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3304s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3034s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0270s for 90112 events => throughput is 3.34E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515606480805645E-002) differ by less than 4E-4 (4.874365444607065e-08) +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515587619408464E-002) differ by less than 4E-4 (1.573566908996682e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.888730e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.522447e+06 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.386931e+06 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09338 [9.3382700679354239E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1591 events (found 1595 events) + [COUNTERS] PROGRAM TOTAL : 0.1829s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1805s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0024s for 8192 events => throughput is 3.41E+06 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382700679354239E-002) differ by less than 4E-4 (1.576877179942926e-07) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09152 [9.1515587619408464E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1782 events (found 1787 events) + [COUNTERS] PROGRAM TOTAL : 0.3301s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3033s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0268s for 90112 events => throughput is 3.37E+06 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515587619408464E-002) differ by less than 4E-4 (1.573566908996682e-07) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.528072e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.024990e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.720927e+06 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! 
Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09338 [9.3382704335459282E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1591 events (found 1595 events) + [COUNTERS] PROGRAM TOTAL : 0.1844s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1819s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.21E+06 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382704335459282E-002) differ by less than 4E-4 (1.1853587900123586e-07) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! 
Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09152 [9.1515591296252558E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1782 events (found 1787 events) + [COUNTERS] PROGRAM TOTAL : 0.3371s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3080s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0291s for 90112 events => throughput is 3.10E+06 events/s -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515591296252558E-002) differ by less than 4E-4 (1.1717945325173673e-07) -*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.341186e+06 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.598530e+06 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -358,28 +504,28 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_f_inl0_hrd0/madevent_hip < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382704338101225E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09338 [9.3382706077425631E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.4099s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4096s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0003s for 8192 events => throughput is 2.60E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6090s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6085s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events 
=> throughput is 1.68E+07 events/s -*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661518E-002) and hip (9.3382704338101225E-002) differ by less than 4E-4 (1.1850758729892164e-07) +OK! xsec from fortran (9.3382715404661532E-002) and cuda (9.3382706077425631E-002) differ by less than 4E-4 (9.988182347875352e-08) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical -*** (3-hip) EXECUTE MADEVENT_HIP x10 (create events.lhe) *** +*** (3-cuda) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -391,65 +537,67 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.hip_f_inl0_hrd0/madevent_hip < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09152 [9.1515591361999701E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09152 [9.1515592892887687E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.5245s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5216s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 90112 events => throughput is 3.12E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7344s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7297s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0046s for 90112 events => throughput is 1.95E+07 events/s -*** (3-hip) Compare MADEVENT_HIP x10 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1515602020000766E-002) and hip (9.1515591361999701E-002) differ by less than 4E-4 (1.1646102771045719e-07) +OK! xsec from fortran (9.1515602020000766E-002) and cuda (9.1515592892887687E-002) differ by less than 4E-4 (9.973286385633884e-08) -*** (3-hip) Compare MADEVENT_HIP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.10 and events.lhe.ref.10 are identical +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.738506e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.546893e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.062982e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.804903e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.296230e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.477327e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.605708e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.060127e+09 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = 
SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.343067e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.389797e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.717215e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.251129e+09 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.103812e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.752691e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.678644e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.481445e+08 ) sec^-1 + +*** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index 8615124c0e..fc86f120db 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,41 +1,41 @@ -Working directory (build): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -make USEBUILDDIR=1 BACKEND=hip -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' OMP_NUM_THREADS= -DATE: 2024-05-15_17:56:12 +DATE: 2024-05-16_01:26:29 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -49,18 +49,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715404661518E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3798 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.5022s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4963s - [COUNTERS] Fortran MEs ( 1 ) : 0.0059s for 8192 events => throughput is 1.39E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7237s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7153s + [COUNTERS] Fortran MEs ( 1 ) : 0.0084s for 8192 events => throughput is 9.74E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -74,18 +74,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715404661518E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1348s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1290s - [COUNTERS] Fortran MEs ( 1 ) : 0.0059s for 8192 events => throughput is 1.40E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1861s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1773s + [COUNTERS] Fortran MEs ( 1 ) : 0.0087s for 8192 events => throughput is 9.40E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -99,8 +99,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x10_fortran > /tmp/valassia/output_eemumu_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/avalassi/output_eemumu_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x10_fortran > /tmp/v [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000766E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2788s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2161s - [COUNTERS] Fortran MEs ( 1 ) : 0.0627s for 90112 events => throughput is 1.44E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4045s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3124s + [COUNTERS] Fortran MEs ( 1 ) : 0.0920s for 90112 events => throughput is 9.79E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -124,22 +124,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715420701354E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09338 [9.3382715420701395E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1396s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1336s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0060s for 8192 events => throughput is 1.37E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1967s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1895s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0072s for 8192 events => throughput is 1.14E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382715420701354E-002) differ by less than 2E-4 (1.717646025412023e-10) +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715420701395E-002) differ by less than 2E-4 (1.7176482458580722e-10) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -157,8 +157,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -166,9 +166,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602033080859E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3953s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3295s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0658s for 90112 events => throughput is 1.37E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3901s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3110s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0791s for 90112 events => throughput is 1.14E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -179,14 +179,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602033080859E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.402038e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.191141e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.425726e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.200935e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -200,8 +200,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -209,13 +209,13 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715420701354E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1337s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1302s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.37E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1857s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1815s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0041s for 8192 events => throughput is 1.98E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382715420701354E-002) differ by less than 2E-4 (1.717646025412023e-10) +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715420701354E-002) differ by less than 2E-4 (1.7176438049659737e-10) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -233,8 +233,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -242,9 +242,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602033080859E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2536s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2157s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0380s for 90112 events => throughput is 2.37E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3486s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3030s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0456s for 90112 events => throughput is 1.98E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -255,14 +255,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602033080859E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.462735e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.010123e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.509349e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.071657e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -276,8 +276,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -285,13 +285,13 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715383664494E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1318s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1292s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.20E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1848s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1815s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.49E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382715383664494E-002) differ by less than 2E-4 (2.2484913930753692e-10) +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715383664494E-002) differ by less than 2E-4 (2.2484925032983938e-10) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -309,8 +309,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -318,9 +318,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602022697845E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2462s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2179s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0282s for 90112 events => throughput is 3.19E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3402s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3039s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0363s for 90112 events => throughput is 2.49E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -331,22 +331,92 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602022697845E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.347503e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.462700e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.432398e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.639506e+06 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09338 [9.3382715383664494E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1591 events (found 1595 events) + [COUNTERS] PROGRAM TOTAL : 0.1850s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1819s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.61E+06 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715383664494E-002) differ by less than 2E-4 (2.2484925032983938e-10) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09152 [9.1515602022697845E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1782 events (found 1787 events) + [COUNTERS] PROGRAM TOTAL : 0.3431s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3079s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0351s for 90112 events => throughput is 2.56E+06 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602022697845E-002) differ by less than 2E-4 (2.947131427788463e-11) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical -*** (3-cuda) WARNING! 
SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.637002e+06 ) sec^-1 -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.800572e+06 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -358,28 +428,28 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_m_inl0_hrd0/madevent_hip < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715392009222E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09338 [9.3382715383664494E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.4143s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4138s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.78E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.1865s + [COUNTERS] Fortran Overhead ( 0 ) : 
0.1828s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0037s for 8192 events => throughput is 2.21E+06 events/s -*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661518E-002) and hip (9.3382715392009222E-002) differ by less than 2E-4 (1.3548862032308762e-10) +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715383664494E-002) differ by less than 2E-4 (2.2484925032983938e-10) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (3-hip) EXECUTE MADEVENT_HIP x10 (create events.lhe) *** +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -391,65 +461,143 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.hip_m_inl0_hrd0/madevent_hip < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09152 [9.1515602022697845E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1782 events (found 1787 events) + [COUNTERS] PROGRAM TOTAL : 0.3487s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3069s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0418s for 90112 events => throughput is 2.16E+06 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602022697845E-002) differ by less than 2E-4 (2.947131427788463e-11) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.197743e+06 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.278261e+06 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! 
Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09338 [9.3382715392009194E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1591 events (found 1595 events) + [COUNTERS] PROGRAM TOTAL : 0.6095s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6090s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.64E+07 events/s + +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.3382715404661532E-002) and cuda (9.3382715392009194E-002) differ by less than 2E-4 (1.3548906441229747e-10) + +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3-cuda) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! 
Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602021089631E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.5149s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5106s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0044s for 90112 events => throughput is 2.07E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7369s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7319s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0050s for 90112 events => throughput is 1.81E+07 events/s -*** (3-hip) Compare MADEVENT_HIP x10 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1515602020000766E-002) and hip (9.1515602021089631E-002) differ by less than 2E-4 (1.1898038110302878e-11) +OK! xsec from fortran (9.1515602020000766E-002) and cuda (9.1515602021089631E-002) differ by less than 2E-4 (1.1898038110302878e-11) -*** (3-hip) Compare MADEVENT_HIP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.10 and events.lhe.ref.10 are identical +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.170122e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.356139e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.560446e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.953546e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.335116e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.960740e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.884712e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.522141e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = 
SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.310770e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.009432e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.943226e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.090602e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.179111e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.972046e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.566543e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.157381e+08 ) sec^-1 + +*** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index b04658a765..e1be7813b6 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,41 +1,41 @@ -Working directory (build): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -make USEBUILDDIR=1 BACKEND=hip +make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 
BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2024-05-15_17:56:24 +DATE: 2024-05-16_01:26:45 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -49,18 +49,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=0 + [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 2601 events (found 5405 events) - [COUNTERS] PROGRAM TOTAL : 2.5934s - [COUNTERS] Fortran Overhead ( 0 ) : 2.5646s - [COUNTERS] Fortran MEs ( 1 ) : 0.0288s for 8192 events => throughput is 2.85E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8221s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7787s + [COUNTERS] Fortran MEs ( 1 ) : 0.0433s for 8192 events => throughput is 1.89E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -74,18 +74,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=0 + [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.5948s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5662s - [COUNTERS] Fortran MEs ( 1 ) : 0.0286s for 8192 events => throughput is 2.86E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4146s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3704s + [COUNTERS] Fortran MEs ( 1 ) : 0.0442s for 8192 events => throughput is 1.85E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -99,8 +99,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x10_fortran > /tmp/valassia/output_ggtt_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/avalassi/output_ggtt_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x10_fortran > /tmp/val [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989114] fbridge_mode=0 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.3563s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0428s - [COUNTERS] Fortran MEs ( 1 ) : 0.3135s for 90112 events => throughput is 2.87E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7473s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2714s + [COUNTERS] Fortran MEs ( 1 ) : 0.4758s for 90112 events => throughput is 1.89E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -124,22 +124,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094184803756647] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3663s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3342s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0321s for 8192 events => throughput is 2.55E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4516s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4120s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0396s for 8192 events => throughput is 2.07E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756626) and cpp (47.094184803756640) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (47.094184803756640) and cpp (47.094184803756647) differ by less than 3E-14 (2.220446049250313e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -157,36 +157,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105695279989099] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.4118s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0583s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3535s for 90112 events => throughput is 2.55E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8025s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3670s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4355s for 90112 events => throughput is 2.07E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989099) differ by less than 3E-14 (3.3306690738754696e-16) +OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989121) differ by less than 3E-14 (2.220446049250313e-16) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.609025e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.132783e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.618885e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.139840e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -200,22 +200,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756619] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3247s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3068s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0179s for 8192 events => throughput is 4.58E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4276s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4032s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0244s for 8192 events => throughput is 3.36E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756626) and cpp (47.094184803756619) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (47.094184803756640) and cpp (47.094184803756640) differ by less than 3E-14 (0.0) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -233,36 +233,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105695279989085] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105695279989106] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.2449s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0475s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1974s for 90112 events => throughput is 4.56E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6069s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3541s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2528s for 90112 events => throughput is 3.56E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989085) differ by less than 3E-14 (5.551115123125783e-16) +OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989106) differ by less than 3E-14 (1.1102230246251565e-16) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.631092e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.613591e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.635903e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.632391e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -276,22 +276,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3099s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2996s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0103s for 8192 events => throughput is 7.96E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3994s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3855s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0139s for 8192 events => throughput is 5.89E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756626) and cpp (47.094184803756640) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (47.094184803756640) and cpp (47.094184803756626) differ by less than 3E-14 (3.3306690738754696e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -309,44 +309,190 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105695279989114] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.1527s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0393s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1135s for 90112 events => throughput is 7.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5056s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3497s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1559s for 90112 events => throughput is 5.78E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989114) differ by less than 3E-14 (0.0) +OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989121) differ by less than 3E-14 (2.220446049250313e-16) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.251097e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.889737e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.307881e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.921310e+05 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=1 + [UNWEIGHT] Wrote 1603 events (found 1608 events) + [COUNTERS] PROGRAM TOTAL : 0.3995s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3870s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0125s for 8192 events => throughput is 6.55E+05 events/s -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (47.094184803756640) and cpp (47.094184803756626) differ by less than 3E-14 (3.3306690738754696e-16) -*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 + [UNWEIGHT] Wrote 1744 events (found 1749 events) + [COUNTERS] PROGRAM TOTAL : 1.4813s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3413s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1399s for 90112 events => throughput is 6.44E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989121) differ by less than 3E-14 (2.220446049250313e-16) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.578026e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.673606e+05 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. 
use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=1 + [UNWEIGHT] Wrote 1603 events (found 1608 events) + [COUNTERS] PROGRAM TOTAL : 0.4212s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3991s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0221s for 8192 events => throughput is 3.70E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.094184803756640) and cpp (47.094184803756626) differ by less than 3E-14 (3.3306690738754696e-16) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 + [UNWEIGHT] Wrote 1744 events (found 1749 events) + [COUNTERS] PROGRAM TOTAL : 1.5933s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3549s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2384s for 90112 events => throughput is 3.78E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989121) differ by less than 3E-14 (2.220446049250313e-16) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.816986e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.814285e+05 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -358,8 +504,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.hip_d_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -367,19 +513,19 @@ Executing ' ./build.hip_d_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggtt_x1_c [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.5722s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5714s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.10E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.8124s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8118s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.42E+07 events/s -*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756626) and hip (47.094184803756640) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (47.094184803756640) and cuda (47.094184803756640) differ by less than 3E-14 (0.0) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical -*** (3-hip) EXECUTE MADEVENT_HIP x10 (create events.lhe) *** +*** (3-cuda) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -391,8 +537,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_d_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -400,56 +546,58 @@ Executing ' ./build.hip_d_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggtt_x10_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.3209s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3132s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0077s for 90112 events => throughput is 1.17E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.7642s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7574s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0067s for 90112 events => throughput is 1.34E+07 events/s -*** (3-hip) Compare MADEVENT_HIP x10 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and hip (47.105695279989121) differ by less than 3E-14 (2.220446049250313e-16) +OK! 
xsec from fortran (47.105695279989114) and cuda (47.105695279989121) differ by less than 3E-14 (2.220446049250313e-16) -*** (3-hip) Compare MADEVENT_HIP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.10 and events.lhe.ref.10 are identical +OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.313283e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.120396e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.008251e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.622859e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.760333e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.177398e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] 
-Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.753176e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.080565e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.804807e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.172657e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.946696e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.155839e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.747216e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.173872e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.149230e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA 
[nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.068966e+07 ) sec^-1 + +*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 8394c1f832..0b367d2d96 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,41 +1,41 @@ -Working directory (build): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -make USEBUILDDIR=1 BACKEND=hip +make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2024-05-15_17:56:47 +DATE: 2024-05-16_01:27:12 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -49,18 +49,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=0 + [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 2601 events (found 5405 events) - [COUNTERS] PROGRAM TOTAL : 0.5837s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5552s - [COUNTERS] Fortran MEs ( 1 ) : 0.0285s for 8192 events => throughput is 2.87E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8191s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7751s + [COUNTERS] Fortran MEs ( 1 ) : 0.0439s for 8192 events => throughput is 1.87E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -74,18 +74,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=0 + [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3175s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2888s - [COUNTERS] Fortran MEs ( 1 ) : 0.0287s for 8192 events => throughput is 2.85E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4113s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3679s + [COUNTERS] Fortran MEs ( 1 ) : 0.0434s for 8192 events => throughput is 1.89E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -99,8 +99,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x10_fortran > /tmp/valassia/output_ggtt_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/avalassi/output_ggtt_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x10_fortran > /tmp/val [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989114] fbridge_mode=0 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.3378s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0255s - [COUNTERS] Fortran MEs ( 1 ) : 0.3123s for 90112 events => throughput is 2.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7478s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2691s + [COUNTERS] Fortran MEs ( 1 ) : 0.4787s for 90112 events => throughput is 1.88E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -124,22 +124,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094178241446492] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094179780921394] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3446s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3171s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0275s for 8192 events => throughput is 2.98E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4476s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4108s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0368s for 8192 events => throughput is 2.23E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756626) and cpp (47.094178241446492) differ by less than 4E-4 (1.3934438314322506e-07) +OK! xsec from fortran (47.094184803756640) and cpp (47.094179780921394) differ by less than 4E-4 (1.0665510541407741e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -157,36 +157,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105686930681671] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105688579298537] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.3548s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0514s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3035s for 90112 events => throughput is 2.97E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7717s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3659s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4058s for 90112 events => throughput is 2.22E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105686930681671) differ by less than 4E-4 (1.7724624157278157e-07) +OK! xsec from fortran (47.105695279989114) and cpp (47.105688579298537) differ by less than 4E-4 (1.4224799227413598e-07) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.087443e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.257844e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.095881e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.292052e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -200,22 +200,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094176373190514] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094175850060040] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3143s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3014s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0129s for 8192 events => throughput is 6.35E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4035s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3878s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0157s for 8192 events => throughput is 5.23E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756626) and cpp (47.094176373190514) differ by less than 4E-4 (1.7901501314643298e-07) +OK! xsec from fortran (47.094184803756640) and cpp (47.094175850060040) differ by less than 4E-4 (1.9012318908107062e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -233,36 +233,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105685173093654] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105684763984058] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.2445s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1030s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1416s for 90112 events => throughput is 6.37E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5166s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3428s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1737s for 90112 events => throughput is 5.19E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105685173093654) differ by less than 4E-4 (2.1455782361901043e-07) +OK! xsec from fortran (47.105695279989114) and cpp (47.105684763984058) differ by less than 4E-4 (2.2324275217311396e-07) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.456336e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.218996e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.800046e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.263655e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -276,22 +276,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094174474272364] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094173652938650] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3239s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3176s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0063s for 8192 events => throughput is 1.31E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3915s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3833s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0082s for 8192 events => throughput is 1.00E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756626) and cpp (47.094174474272364) differ by less than 4E-4 (2.1933672500473733e-07) +OK! xsec from fortran (47.094184803756640) and cpp (47.094173652938650) differ by less than 4E-4 (2.3677696170398832e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -309,44 +309,190 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105684585116684] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105684048677361] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.1025s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0337s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0687s for 90112 events => throughput is 1.31E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.4300s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3376s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0925s for 90112 events => throughput is 9.74E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105684585116684) differ by less than 4E-4 (2.2703990176786704e-07) +OK! xsec from fortran (47.105695279989114) and cpp (47.105684048677361) differ by less than 4E-4 (2.384278946498952e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.370612e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.896073e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.940492e+05 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.09 [47.094173652938650] fbridge_mode=1 + [UNWEIGHT] Wrote 1603 events (found 1608 events) + [COUNTERS] PROGRAM TOTAL : 0.3898s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3819s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0079s for 8192 events => throughput is 1.04E+06 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.094184803756640) and cpp (47.094173652938650) differ by less than 4E-4 (2.3677696170398832e-07) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.11 [47.105684048677361] fbridge_mode=1 + [UNWEIGHT] Wrote 1744 events (found 1749 events) + [COUNTERS] PROGRAM TOTAL : 1.4168s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3312s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0856s for 90112 events => throughput is 1.05E+06 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.105695279989114) and cpp (47.105684048677361) differ by less than 4E-4 (2.384278946498952e-07) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.032975e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.381931e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.050779e+06 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! 
Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.09 [47.094178213275804] fbridge_mode=1 + [UNWEIGHT] Wrote 1603 events (found 1608 events) + [COUNTERS] PROGRAM TOTAL : 0.3948s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3837s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0111s for 8192 events => throughput is 7.41E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.094184803756640) and cpp (47.094178213275804) differ by less than 4E-4 (1.3994256109484127e-07) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! 
Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.11 [47.105688407939567] fbridge_mode=1 + [UNWEIGHT] Wrote 1744 events (found 1749 events) + [COUNTERS] PROGRAM TOTAL : 1.4646s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3397s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1249s for 90112 events => throughput is 7.22E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.105695279989114) and cpp (47.105688407939567) differ by less than 4E-4 (1.4588574703822133e-07) -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.304914e+05 ) sec^-1 -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.408593e+05 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -358,28 +504,28 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_f_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094176770070867] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094184344050284] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.5844s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5840s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0004s for 8192 events => throughput is 2.10E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.8097s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8091s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => 
throughput is 1.50E+07 events/s -*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756626) and hip (47.094176770070867) differ by less than 4E-4 (1.705876382374072e-07) +OK! xsec from fortran (47.094184803756640) and cuda (47.094184344050284) differ by less than 4E-4 (9.761425112664313e-09) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical -*** (3-hip) EXECUTE MADEVENT_HIP x10 (create events.lhe) *** +*** (3-cuda) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -391,65 +537,67 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.hip_f_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105687115703695] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105694586476879] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.3211s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3174s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0037s for 90112 events => throughput is 2.45E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.7654s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7594s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0060s for 90112 events => throughput is 1.51E+07 events/s -*** (3-hip) Compare MADEVENT_HIP x10 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and hip (47.105687115703695) differ by less than 4E-4 (1.733184357144424e-07) +OK! xsec from fortran (47.105695279989114) and cuda (47.105694586476879) differ by less than 4E-4 (1.4722471020078842e-08) -*** (3-hip) Compare MADEVENT_HIP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.10 and events.lhe.ref.10 are identical +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.990316e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.397168e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.129345e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.912682e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.057346e+08 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.099083e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.981637e+08 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.785250e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] 
[hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.063827e+08 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.065057e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.077523e+08 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.885291e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.183964e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.649544e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.370435e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.436840e+07 ) sec^-1 + +*** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index 5052fa8251..197f6200da 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,41 +1,41 @@ -Working directory (build): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -make USEBUILDDIR=1 BACKEND=hip -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 
BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2024-05-15_17:57:05 +DATE: 2024-05-16_01:27:37 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -49,18 +49,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=0 + [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 2601 events (found 5405 events) - [COUNTERS] PROGRAM TOTAL : 0.5865s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5579s - [COUNTERS] Fortran MEs ( 1 ) : 0.0286s for 8192 events => throughput is 2.87E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8243s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7810s + [COUNTERS] Fortran MEs ( 1 ) : 0.0433s for 8192 events => throughput is 1.89E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -74,18 +74,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=0 + [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3179s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2893s - [COUNTERS] Fortran MEs ( 1 ) : 0.0285s for 8192 events => throughput is 2.87E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4109s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3676s + [COUNTERS] Fortran MEs ( 1 ) : 0.0433s for 8192 events => throughput is 1.89E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -99,8 +99,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x10_fortran > /tmp/valassia/output_ggtt_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/avalassi/output_ggtt_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x10_fortran > /tmp/val [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989114] fbridge_mode=0 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.3405s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0280s - [COUNTERS] Fortran MEs ( 1 ) : 0.3124s for 90112 events => throughput is 2.88E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7536s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2749s + [COUNTERS] Fortran MEs ( 1 ) : 0.4787s for 90112 events => throughput is 1.88E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -124,8 +124,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -133,13 +133,13 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094186141863901] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3542s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3211s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0331s for 8192 events => throughput is 2.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4520s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4121s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0400s for 8192 events => throughput is 2.05E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756626) and cpp (47.094186141863901) differ by less than 2E-4 (2.8413428942997143e-08) +OK! xsec from fortran (47.094184803756640) and cpp (47.094186141863901) differ by less than 2E-4 (2.8413428720952538e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -157,8 +157,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -166,9 +166,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105696630006634] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.5155s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1517s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3638s for 90112 events => throughput is 2.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8139s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3740s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4399s for 90112 events => throughput is 2.05E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -179,14 +179,14 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105696630006634) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.531708e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.079375e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.538603e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.079476e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -200,22 +200,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094186141863908] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094186141863901] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3250s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3072s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0178s for 8192 events => throughput is 4.60E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4154s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3930s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0224s for 8192 events => throughput is 3.67E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756626) and cpp (47.094186141863908) differ by less than 2E-4 (2.8413429165041748e-08) +OK! xsec from fortran (47.094184803756640) and cpp (47.094186141863901) differ by less than 2E-4 (2.8413428720952538e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -233,8 +233,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -242,9 +242,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105696630006626] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.2418s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0452s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1966s for 90112 events => throughput is 4.58E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5989s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3477s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2513s for 90112 events => throughput is 3.59E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -255,14 +255,14 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105696630006626) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.681520e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.535235e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.519243e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.688874e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -276,22 +276,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094186193208834] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094186169585456] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3088s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2987s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0101s for 8192 events => throughput is 8.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4007s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3868s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0139s for 8192 events => throughput is 5.90E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756626) and cpp (47.094186193208834) differ by less than 2E-4 (2.9503689491505725e-08) +OK! xsec from fortran (47.094184803756640) and cpp (47.094186169585456) differ by less than 2E-4 (2.9002069412698006e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -309,44 +309,114 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105696667630852] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105696663215774] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.1461s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0355s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1106s for 90112 events => throughput is 8.15E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5000s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3471s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1530s for 90112 events => throughput is 5.89E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105696667630852) differ by less than 2E-4 (2.9458046002517335e-08) +OK! xsec from fortran (47.105695279989114) and cpp (47.105696663215774) differ by less than 2E-4 (2.9364318976377035e-08) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.429153e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.781843e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.506856e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.986506e+05 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.09 [47.094186169585456] fbridge_mode=1 + [UNWEIGHT] Wrote 1603 events (found 1608 events) + [COUNTERS] PROGRAM TOTAL : 0.4023s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3898s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0125s for 8192 events => throughput is 6.55E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.094184803756640) and cpp (47.094186169585456) differ by less than 2E-4 (2.9002069412698006e-08) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.11 [47.105696663215774] fbridge_mode=1 + [UNWEIGHT] Wrote 1744 events (found 1749 events) + [COUNTERS] PROGRAM TOTAL : 1.4677s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3316s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1361s for 90112 events => throughput is 6.62E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (47.105695279989114) and cpp (47.105696663215774) differ by less than 2E-4 (2.9364318976377035e-08) -*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.671424e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.782154e+05 ) sec^-1 -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -358,28 +428,28 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! 
Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_m_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184798437837] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094186169585456] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.5745s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5738s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.17E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.4150s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3944s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0206s for 8192 events => throughput is 3.98E+05 events/s -*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756626) and hip (47.094184798437837) differ by less than 2E-4 (1.1293943558143837e-10) +OK! xsec from fortran (47.094184803756640) and cpp (47.094186169585456) differ by less than 2E-4 (2.9002069412698006e-08) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (3-hip) EXECUTE MADEVENT_HIP x10 (create events.lhe) *** +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -391,8 +461,84 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_m_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.11 [47.105696663215774] fbridge_mode=1 + [UNWEIGHT] Wrote 1744 events (found 1749 events) + [COUNTERS] PROGRAM TOTAL : 1.5820s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3527s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2292s for 90112 events => throughput is 3.93E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.105695279989114) and cpp (47.105696663215774) differ by less than 2E-4 (2.9364318976377035e-08) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.998616e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.990048e+05 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.09 [47.094184798437830] fbridge_mode=1 + [UNWEIGHT] Wrote 1603 events (found 1608 events) + [COUNTERS] PROGRAM TOTAL : 0.8154s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8148s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.45E+07 events/s + +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! 
xsec from fortran (47.094184803756640) and cuda (47.094184798437830) differ by less than 2E-4 (1.1293987967064822e-10) + +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3-cuda) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -400,56 +546,58 @@ Executing ' ./build.hip_m_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggtt_x10_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279068492] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.3197s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3120s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0077s for 90112 events => throughput is 1.18E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.7703s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7635s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0068s for 90112 events => throughput is 1.32E+07 events/s -*** (3-hip) Compare MADEVENT_HIP x10 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (47.105695279989114) and hip (47.105695279068492) differ by less than 2E-4 (1.954369999168648e-11) +OK! xsec from fortran (47.105695279989114) and cuda (47.105695279068492) differ by less than 2E-4 (1.954369999168648e-11) -*** (3-hip) Compare MADEVENT_HIP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.10 and events.lhe.ref.10 are identical +OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.359184e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.143723e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.029386e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.636090e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.791869e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.182886e+07 ) sec^-1 *** 
EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.800661e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.066867e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.806095e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.180722e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.026202e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.146460e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.778718e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.158610e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = 
HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.169422e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.030823e+07 ) sec^-1 + +*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index 46571ae40b..7f0ff41464 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -1,41 +1,41 @@ -Working directory (build): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -make USEBUILDDIR=1 BACKEND=hip +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' OMP_NUM_THREADS= -DATE: 2024-05-15_17:57:22 +DATE: 2024-05-16_01:28:04 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -49,18 +49,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112317668354764] fbridge_mode=0 + [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 365 events (found 1496 events) - [COUNTERS] PROGRAM TOTAL : 0.5675s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3642s - [COUNTERS] Fortran MEs ( 1 ) : 0.2033s for 8192 events => throughput is 4.03E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7020s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3669s + [COUNTERS] Fortran MEs ( 1 ) : 0.3350s for 8192 events => throughput is 2.45E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -74,18 +74,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112317668354764] fbridge_mode=0 + [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4587s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2556s - [COUNTERS] Fortran MEs ( 1 ) : 0.2031s for 8192 events => throughput is 4.03E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6627s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3273s + [COUNTERS] Fortran MEs ( 1 ) : 0.3354s for 8192 events => throughput is 2.44E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -99,18 +99,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x10_fortran > /tmp/valassia/output_ggttg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/avalassi/output_ggttg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239236471252555E-002] fbridge_mode=0 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 3.4242s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1958s - [COUNTERS] Fortran MEs ( 1 ) : 2.2284s for 90112 events => throughput is 4.04E+04 events/s + [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=0 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 5.2831s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5823s + [COUNTERS] Fortran MEs ( 1 ) : 3.7008s for 90112 events => throughput is 2.43E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -124,22 +124,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112317668354763] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.8260s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5438s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2823s for 8192 events => throughput is 2.90E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.0127s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6679s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3448s for 8192 events => throughput is 2.38E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112317668354764) and cpp (0.10112317668354763) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748607749110) differ by less than 3E-14 (1.1102230246251565e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -157,36 +157,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239236471252514E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 4.5846s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4896s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.0950s for 90112 events => throughput is 2.91E+04 events/s + [XSECTION] Cross section = 0.07924 [7.9238481932717666E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 5.7090s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9162s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.7928s for 90112 events => throughput is 2.38E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239236471252514E-002) differ by less than 3E-14 (5.551115123125783e-16) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717666E-002) differ by less than 3E-14 (6.661338147750939e-16) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.986081e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.444412e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.975460e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.433186e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -200,22 +200,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112317668354515] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112748607748863] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.5359s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3966s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1392s for 8192 events => throughput is 5.88E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6869s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5073s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1796s for 8192 events => throughput is 4.56E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112317668354764) and cpp (0.10112317668354515) differ by less than 3E-14 (2.475797344914099e-14) +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748607748863) differ by less than 3E-14 (2.453592884421596e-14) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -233,36 +233,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239236471252514E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 2.8730s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3402s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.5329s for 90112 events => throughput is 5.88E+04 events/s + [XSECTION] Cross section = 0.07924 [7.9238481932717666E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 3.7766s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7859s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.9906s for 90112 events => throughput is 4.53E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239236471252514E-002) differ by less than 3E-14 (5.551115123125783e-16) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717666E-002) differ by less than 3E-14 (6.661338147750939e-16) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.083131e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.630294e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.089350e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.629345e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -276,22 +276,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112317668354763] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.3912s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3256s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0657s for 8192 events => throughput is 1.25E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.5178s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4262s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0916s for 8192 events => throughput is 8.94E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112317668354764) and cpp (0.10112317668354763) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748607749110) differ by less than 3E-14 (1.1102230246251565e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -309,44 +309,190 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239236471252555E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 1.9916s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2710s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7206s for 90112 events => throughput is 1.25E+05 events/s + [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 2.7021s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6979s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0042s for 90112 events => throughput is 8.97E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239236471252555E-002) differ by less than 3E-14 (0.0) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717722E-002) differ by less than 3E-14 (0.0) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.257360e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.244917e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.256779e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.251295e+04 ) sec^-1 -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1 + [UNWEIGHT] Wrote 386 events (found 1179 events) + [COUNTERS] PROGRAM TOTAL : 0.4953s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4129s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0824s for 8192 events => throughput is 9.95E+04 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748607749110) differ by less than 3E-14 (1.1102230246251565e-16) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 2.5931s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6918s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9014s for 90112 events => throughput is 1.00E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717722E-002) differ by less than 3E-14 (0.0) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.033892e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.035720e+05 ) sec^-1 -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! 
Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1 + [UNWEIGHT] Wrote 386 events (found 1179 events) + [COUNTERS] PROGRAM TOTAL : 0.5700s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4516s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1184s for 8192 events => throughput is 6.92E+04 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748607749110) differ by less than 3E-14 (1.1102230246251565e-16) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 3.0137s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7172s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.2965s for 90112 events => throughput is 6.95E+04 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717722E-002) differ by less than 3E-14 (0.0) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.144179e+04 ) sec^-1 -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.156532e+04 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -358,28 +504,28 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_d_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112317668354760] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.5743s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5667s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0076s for 8192 events => throughput is 1.07E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7726s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7672s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 8192 events => throughput is 1.52E+06 events/s -*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112317668354764) and hip (0.10112317668354760) differ by less than 3E-14 (4.440892098500626e-16) +OK! xsec from fortran (0.10112748607749111) and cuda (0.10112748607749111) differ by less than 3E-14 (0.0) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical -*** (3-hip) EXECUTE MADEVENT_HIP x10 (create events.lhe) *** +*** (3-cuda) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -391,65 +537,67 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_d_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239236471252555E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 1.5845s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5009s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0836s for 90112 events => throughput is 1.08E+06 events/s + [XSECTION] Cross section = 0.07924 [7.9238481932717736E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 2.0397s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0164s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0233s for 90112 events => throughput is 3.87E+06 events/s -*** (3-hip) Compare MADEVENT_HIP x10 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9239236471252555E-002) and hip (7.9239236471252555E-002) differ by less than 3E-14 (0.0) +OK! 
xsec from fortran (7.9238481932717722E-002) and cuda (7.9238481932717736E-002) differ by less than 3E-14 (2.220446049250313e-16) -*** (3-hip) Compare MADEVENT_HIP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.10 and events.lhe.ref.10 are identical +OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.115270e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.642318e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.136565e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.930638e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.679526e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.882259e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] 
[hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.304026e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.244433e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.674473e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.893041e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.843441e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.255841e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.667439e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.907568e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.438819e+05 ) sec^-1 
+Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.774192e+06 ) sec^-1 + +*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index 5ac8280e80..1a8c36aa43 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,41 +1,41 @@ -Working directory (build): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -make USEBUILDDIR=1 BACKEND=hip +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' OMP_NUM_THREADS= -DATE: 2024-05-15_17:57:53 +DATE: 2024-05-16_01:28:48 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -49,18 +49,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112317668354764] fbridge_mode=0 + [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 365 events (found 1496 events) - [COUNTERS] PROGRAM TOTAL : 0.4838s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2803s - [COUNTERS] Fortran MEs ( 1 ) : 0.2035s for 8192 events => throughput is 4.03E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7057s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3678s + [COUNTERS] Fortran MEs ( 1 ) : 0.3379s for 8192 events => throughput is 2.42E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -74,18 +74,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112317668354764] fbridge_mode=0 + [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4590s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2559s - [COUNTERS] Fortran MEs ( 1 ) : 0.2030s for 8192 events => throughput is 4.03E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6650s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3295s + [COUNTERS] Fortran MEs ( 1 ) : 0.3355s for 8192 events => throughput is 2.44E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -99,18 +99,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x10_fortran > /tmp/valassia/output_ggttg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/avalassi/output_ggttg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239236471252555E-002] fbridge_mode=0 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 3.4298s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2024s - [COUNTERS] Fortran MEs ( 1 ) : 2.2274s for 90112 events => throughput is 4.05E+04 events/s + [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=0 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 5.3113s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5981s + [COUNTERS] Fortran MEs ( 1 ) : 3.7132s for 90112 events => throughput is 2.43E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -124,22 +124,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112291597608296] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112722621426752] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.7725s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5112s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2613s for 8192 events => throughput is 3.14E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.9881s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6537s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3344s for 8192 events => throughput is 2.45E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112317668354764) and cpp (0.10112291597608296) differ by less than 4E-4 (2.5781178285555484e-06) +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112722621426752) differ by less than 4E-4 (2.569659680817793e-06) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -157,36 +157,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239221732791437E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 4.3191s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4552s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.8638s for 90112 events => throughput is 3.15E+04 events/s + [XSECTION] Cross section = 0.07924 [7.9238468310179624E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 5.5951s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9082s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.6869s for 90112 events => throughput is 2.44E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239221732791437E-002) differ by less than 4E-4 (1.8599953477416165e-07) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238468310179624E-002) differ by less than 4E-4 (1.719182115555995e-07) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.247317e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.531268e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.257409e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.536337e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -200,22 +200,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112290421591680] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112720710186394] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4076s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3320s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0756s for 8192 events => throughput is 1.08E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.5295s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4297s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0998s for 8192 events => throughput is 8.21E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112317668354764) and cpp (0.10112290421591680) differ by less than 4E-4 (2.6944132867079418e-06) +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112720710186394) differ by less than 4E-4 (2.758652844936371e-06) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -233,36 +233,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239212368085274E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 2.1126s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2753s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8372s for 90112 events => throughput is 1.08E+05 events/s + [XSECTION] Cross section = 0.07924 [7.9238454786658835E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 2.7829s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6780s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1050s for 90112 events => throughput is 8.16E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239212368085274E-002) differ by less than 4E-4 (3.0418222529693395e-07) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238454786658835E-002) differ by less than 4E-4 (3.4258681169685445e-07) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.096350e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.418674e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.092149e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.378970e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -276,22 +276,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112291415112837] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112721766950902] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.3272s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2931s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0341s for 8192 events => throughput is 2.40E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4226s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3763s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0463s for 8192 events => throughput is 1.77E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112317668354764) and cpp (0.10112291415112837) differ by less than 4E-4 (2.5961646764605106e-06) +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112721766950902) differ by less than 4E-4 (2.654154597325764e-06) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -309,44 +309,190 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239211617250407E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 1.6120s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2368s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3752s for 90112 events => throughput is 2.40E+05 events/s + [XSECTION] Cross section = 0.07924 [7.9238453735016964E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 2.1354s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6236s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5118s for 90112 events => throughput is 1.76E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239211617250407E-002) differ by less than 4E-4 (3.136577692020026e-07) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238453735016964E-002) differ by less than 4E-4 (3.5585866953180556e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.453372e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.814902e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.818224e+05 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.1011 [0.10112721766950902] fbridge_mode=1 + [UNWEIGHT] Wrote 386 events (found 1179 events) + [COUNTERS] PROGRAM TOTAL : 0.4122s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3704s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0417s for 8192 events => throughput is 1.96E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112721766950902) differ by less than 4E-4 (2.654154597325764e-06) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.07924 [7.9238453735016964E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 2.0774s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6180s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4593s for 90112 events => throughput is 1.96E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238453735016964E-002) differ by less than 4E-4 (3.5585866953180556e-07) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.016267e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.459554e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.024583e+05 ) sec^-1 -*** (2-512y) WARNING! 
SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.1011 [0.10112723387847480] fbridge_mode=1 + [UNWEIGHT] Wrote 386 events (found 1179 events) + [COUNTERS] PROGRAM TOTAL : 0.4393s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3831s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0562s for 8192 events => throughput is 1.46E+05 events/s -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112723387847480) differ by less than 4E-4 (2.4938721023826105e-06) -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.07924 [7.9238464410949921E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 2.2433s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6283s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6150s for 90112 events => throughput is 1.47E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238464410949921E-002) differ by less than 4E-4 (2.211270000440635e-07) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.471448e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.496104e+05 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -358,28 +504,28 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_f_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112292787307366] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112726034625694] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.5497s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5477s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0021s for 8192 events => throughput is 3.94E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7661s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7652s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0009s for 8192 events => throughput is 9.57E+06 events/s -*** (3-hip) Compare 
MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112317668354764) and hip (0.10112292787307366) differ by less than 4E-4 (2.4604693221741414e-06) +OK! xsec from fortran (0.10112748607749111) and cuda (0.10112726034625694) differ by less than 4E-4 (2.2321452152196386e-06) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical -*** (3-hip) EXECUTE MADEVENT_HIP x10 (create events.lhe) *** +*** (3-cuda) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -391,65 +537,67 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.hip_f_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239222545537072E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 1.5151s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4928s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0223s for 90112 events => throughput is 4.05E+06 events/s + [XSECTION] Cross section = 0.07924 [7.9238473828077680E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 2.0217s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0116s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0101s for 90112 events => throughput is 8.94E+06 events/s -*** (3-hip) Compare MADEVENT_HIP x10 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9239236471252555E-002) and hip (7.9239222545537072E-002) differ by less than 4E-4 (1.7574267630049434e-07) +OK! xsec from fortran (7.9238481932717722E-002) and cuda (7.9238473828077680E-002) differ by less than 4E-4 (1.0228161673175862e-07) -*** (3-hip) Compare MADEVENT_HIP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.10 and events.lhe.ref.10 are identical +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.616507e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.279804e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.525364e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.849139e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.469095e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.708780e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.085807e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.376255e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] 
[inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.437114e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.741880e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.635195e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.526731e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.395969e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.576787e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.280775e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.628936e+07 ) sec^-1 + +*** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index b81dae3d11..06cc385635 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -1,41 +1,41 @@ -Working directory (build): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -make USEBUILDDIR=1 BACKEND=hip + +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' OMP_NUM_THREADS= -DATE: 2024-05-15_17:58:20 +DATE: 2024-05-16_01:29:27 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -49,18 +49,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112317668354764] fbridge_mode=0 + [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 365 events (found 1496 events) - [COUNTERS] PROGRAM TOTAL : 0.4823s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2798s - [COUNTERS] Fortran MEs ( 1 ) : 0.2025s for 8192 events => throughput is 4.05E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7042s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3668s + [COUNTERS] Fortran MEs ( 1 ) : 0.3374s for 8192 events => throughput is 2.43E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -74,18 +74,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112317668354764] fbridge_mode=0 + [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4570s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2547s - [COUNTERS] Fortran MEs ( 1 ) : 0.2023s for 8192 events => throughput is 4.05E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6678s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3296s + [COUNTERS] Fortran MEs ( 1 ) : 0.3383s for 8192 events => throughput is 2.42E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -99,18 +99,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x10_fortran > /tmp/valassia/output_ggttg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/avalassi/output_ggttg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239236471252555E-002] fbridge_mode=0 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 3.4267s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2006s - [COUNTERS] Fortran MEs ( 1 ) : 2.2261s for 90112 events => throughput is 4.05E+04 events/s + [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=0 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 5.3035s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5927s + [COUNTERS] Fortran MEs ( 1 ) : 3.7109s for 90112 events => throughput is 2.43E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -124,22 +124,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112317761225882] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112748700702684] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.8338s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5435s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2903s for 8192 events => throughput is 2.82E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.0211s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6702s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3509s for 8192 events => throughput is 2.33E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112317668354764) and cpp (0.10112317761225882) differ by less than 2E-4 (9.183959592817814e-09) +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748700702684) differ by less than 2E-4 (9.191721828116783e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -157,36 +157,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239237217958461E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 4.6852s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4916s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.1935s for 90112 events => throughput is 2.82E+04 events/s + [XSECTION] Cross section = 0.07924 [7.9238482679400354E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 5.7816s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9205s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.8611s for 90112 events => throughput is 2.33E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239237217958461E-002) differ by less than 2E-4 (9.4234364755863e-09) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482679400354E-002) differ by less than 2E-4 (9.423232416594374e-09) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.908465e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.398467e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.909870e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.396772e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -200,22 +200,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112317763556192] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112748702805033] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.5282s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3911s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1371s for 8192 events => throughput is 5.97E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6812s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5038s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1774s for 8192 events => throughput is 4.62E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112317668354764) and cpp (0.10112317763556192) differ by less than 2E-4 (9.41440236879032e-09) +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748702805033) differ by less than 2E-4 (9.399612865834683e-09) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -233,36 +233,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239237221421968E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 2.8489s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3381s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.5108s for 90112 events => throughput is 5.96E+04 events/s + [XSECTION] Cross section = 0.07924 [7.9238482683055667E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 3.6988s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7524s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.9464s for 90112 events => throughput is 4.63E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239237221421968E-002) differ by less than 2E-4 (9.467145956065792e-09) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482683055667E-002) differ by less than 2E-4 (9.469362849401364e-09) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.149842e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.777911e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.133030e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.770421e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -276,22 +276,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112317741957558] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112748681415580] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.3885s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3223s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0662s for 8192 events => throughput is 1.24E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.5041s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4149s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0892s for 8192 events => throughput is 9.19E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112317668354764) and cpp (0.10112317741957558) differ by less than 2E-4 (7.278528668663853e-09) +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748681415580) differ by less than 2E-4 (7.284514991212632e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -309,44 +309,190 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239237072275287E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 1.9982s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2713s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7269s for 90112 events => throughput is 1.24E+05 events/s + [XSECTION] Cross section = 0.07924 [7.9238482534347232E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 2.6526s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6680s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9846s for 90112 events => throughput is 9.15E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239237072275287E-002) differ by less than 2E-4 (7.584913142011374e-09) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482534347232E-002) differ by less than 2E-4 (7.592642958798024e-09) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.271467e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.374488e+04 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.304457e+04 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.1011 [0.10112748681415580] fbridge_mode=1 + [UNWEIGHT] Wrote 386 events (found 1179 events) + [COUNTERS] PROGRAM TOTAL : 0.4867s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4079s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0788s for 8192 events => throughput is 1.04E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748681415580) differ by less than 2E-4 (7.284514991212632e-09) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.07924 [7.9238482534347232E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 2.5251s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6551s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8699s for 90112 events => throughput is 1.04E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482534347232E-002) differ by less than 2E-4 (7.592642958798024e-09) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.072957e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.275740e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.074127e+05 ) sec^-1 -*** (2-512y) WARNING! 
SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.1011 [0.10112748700265108] fbridge_mode=1 + [UNWEIGHT] Wrote 386 events (found 1179 events) + [COUNTERS] PROGRAM TOTAL : 0.5672s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4475s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1197s for 8192 events => throughput is 6.85E+04 events/s -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748700265108) differ by less than 2E-4 (9.148451995955043e-09) -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.07924 [7.9238482666076374E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 3.0147s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7041s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.3105s for 90112 events => throughput is 6.88E+04 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482666076374E-002) differ by less than 2E-4 (9.255082034087536e-09) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.810756e+04 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.935663e+04 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -358,28 +504,28 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_m_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112317662375726] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112748601943165] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.5606s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5530s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0076s for 8192 events => throughput is 1.08E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7757s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7703s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 8192 events => throughput is 1.51E+06 events/s -*** (3-hip) Compare 
MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112317668354764) and hip (0.10112317662375726) differ by less than 2E-4 (5.9126292750733e-10) +OK! xsec from fortran (0.10112748607749111) and cuda (0.10112748601943165) differ by less than 2E-4 (5.74121417074025e-10) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical -*** (3-hip) EXECUTE MADEVENT_HIP x10 (create events.lhe) *** +*** (3-cuda) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -391,65 +537,67 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.hip_m_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239236476482192E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 1.5836s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4997s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0839s for 90112 events => throughput is 1.07E+06 events/s + [XSECTION] Cross section = 0.07924 [7.9238481937154381E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 2.0407s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0176s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0230s for 90112 events => throughput is 3.91E+06 events/s -*** (3-hip) Compare MADEVENT_HIP x10 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9239236471252555E-002) and hip (7.9239236476482192E-002) differ by less than 2E-4 (6.599809587726213e-11) +OK! xsec from fortran (7.9238481932717722E-002) and cuda (7.9238481937154381E-002) differ by less than 2E-4 (5.5991211667105745e-11) -*** (3-hip) Compare MADEVENT_HIP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.10 and events.lhe.ref.10 are identical +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.119340e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.631069e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.150503e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.120692e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.676250e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.856212e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.301160e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.234939e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] 
[inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.678384e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.866138e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.842635e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.243613e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.663031e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.862499e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.428966e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.731505e+06 ) sec^-1 + +*** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index e420a06bd8..744dd47e66 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -1,41 +1,41 @@ -Working directory (build): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -make USEBUILDDIR=1 BACKEND=hip +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: 
Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' OMP_NUM_THREADS= -DATE: 2024-05-15_17:58:50 +DATE: 2024-05-16_01:30:11 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -49,18 +49,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704143122579712E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 [UNWEIGHT] Wrote 62 events (found 950 events) - [COUNTERS] PROGRAM TOTAL : 2.8398s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3457s - [COUNTERS] Fortran MEs ( 1 ) : 2.4942s for 8192 events => throughput is 3.28E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.8074s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3646s + [COUNTERS] Fortran MEs ( 1 ) : 4.4427s for 8192 events => throughput is 1.84E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -74,18 +74,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704143122579712E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 2.7685s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2715s - [COUNTERS] Fortran MEs ( 1 ) : 2.4969s for 8192 events => throughput is 3.28E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.8076s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3603s + [COUNTERS] Fortran MEs ( 1 ) : 4.4473s for 8192 events => throughput is 1.84E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -99,18 +99,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x10_fortran > /tmp/valassia/output_ggttgg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/avalassi/output_ggttgg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793532411914656E-004] fbridge_mode=0 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 29.0297s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5846s - [COUNTERS] Fortran MEs ( 1 ) : 27.4451s for 90112 events => throughput is 3.28E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=0 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 50.3676s + [COUNTERS] Fortran Overhead ( 0 ) : 2.1555s + [COUNTERS] Fortran MEs ( 1 ) : 48.2121s for 90112 events => throughput is 1.87E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -124,22 +124,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704143122579739E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703612510102372E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 7.8276s - [COUNTERS] Fortran Overhead ( 0 ) : 3.9995s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.8280s for 8192 events => throughput is 2.14E+03 events/s + [COUNTERS] PROGRAM TOTAL : 9.2196s + [COUNTERS] Fortran Overhead ( 0 ) : 4.7253s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.4943s for 8192 events => throughput is 1.82E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704143122579739E-004) differ by less than 3E-14 (6.661338147750939e-16) +OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612510102372E-004) differ by less than 3E-14 (4.440892098500626e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -157,36 +157,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793532411914648E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 47.4512s - [COUNTERS] Fortran Overhead ( 0 ) : 5.3002s - [COUNTERS] CudaCpp MEs ( 2 ) : 42.1511s for 90112 events => throughput is 2.14E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793438642451704E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 56.2683s + [COUNTERS] Fortran Overhead ( 0 ) : 6.5201s + [COUNTERS] CudaCpp MEs ( 2 ) : 49.7482s for 90112 events => throughput is 1.81E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793532411914648E-004) differ by less than 3E-14 (5.551115123125783e-16) +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451704E-004) differ by less than 3E-14 (5.551115123125783e-16) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.217290e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.868635e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.218509e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.874481e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -200,22 +200,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704143122579739E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 3.5861s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9118s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.6744s for 8192 events => throughput is 4.89E+03 events/s + [COUNTERS] PROGRAM TOTAL : 5.0549s + [COUNTERS] Fortran Overhead ( 0 ) : 2.6552s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.3997s for 8192 events => throughput is 3.41E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704143122579739E-004) differ by less than 3E-14 (6.661338147750939e-16) +OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612510102367E-004) differ by less than 3E-14 (2.220446049250313e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -233,36 +233,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793532411914648E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 21.5443s - [COUNTERS] Fortran Overhead ( 0 ) : 3.2158s - [COUNTERS] CudaCpp MEs ( 2 ) : 18.3285s for 90112 events => throughput is 4.92E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793438642451701E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 30.7729s + [COUNTERS] Fortran Overhead ( 0 ) : 4.4162s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.3568s for 90112 events => throughput is 3.42E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793532411914648E-004) differ by less than 3E-14 (5.551115123125783e-16) +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451701E-004) differ by less than 3E-14 (6.661338147750939e-16) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.055979e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.598914e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.052931e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.606768e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -276,22 +276,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704143122579728E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 1.6632s - [COUNTERS] Fortran Overhead ( 0 ) : 0.9608s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7024s for 8192 events => throughput is 1.17E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.4142s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3728s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0413s for 8192 events => throughput is 7.87E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704143122579728E-004) differ by less than 3E-14 (4.440892098500626e-16) +OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612510102367E-004) differ by less than 3E-14 (2.220446049250313e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -309,44 +309,190 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793532411914648E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 9.9760s - [COUNTERS] Fortran Overhead ( 0 ) : 2.2687s - [COUNTERS] CudaCpp MEs ( 2 ) : 7.7073s for 90112 events => throughput is 1.17E+04 events/s + [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 14.6025s + [COUNTERS] Fortran Overhead ( 0 ) : 3.1268s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.4757s for 90112 events => throughput is 7.85E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793532411914648E-004) differ by less than 3E-14 (5.551115123125783e-16) +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451712E-004) differ by less than 3E-14 (0.0) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.207701e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.082204e+03 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.087591e+03 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 121 events (found 923 events) + [COUNTERS] PROGRAM TOTAL : 2.1594s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2438s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9156s for 8192 events => throughput is 8.95E+03 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612510102367E-004) differ by less than 3E-14 (2.220446049250313e-16) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 13.1097s + [COUNTERS] Fortran Overhead ( 0 ) : 3.0095s + [COUNTERS] CudaCpp MEs ( 2 ) : 10.1002s for 90112 events => throughput is 8.92E+03 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451712E-004) differ by less than 3E-14 (0.0) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.157056e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.203855e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.190937e+03 ) sec^-1 -*** (2-512y) WARNING! 
SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 121 events (found 923 events) + [COUNTERS] PROGRAM TOTAL : 2.7205s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5345s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1860s for 8192 events => throughput is 6.91E+03 events/s -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612510102367E-004) differ by less than 3E-14 (2.220446049250313e-16) -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 16.4021s + [COUNTERS] Fortran Overhead ( 0 ) : 3.3045s + [COUNTERS] CudaCpp MEs ( 2 ) : 13.0976s for 90112 events => throughput is 6.88E+03 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451712E-004) differ by less than 3E-14 (0.0) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.943395e+03 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.968493e+03 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -358,28 +504,28 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_d_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704143122579723E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 0.9652s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8508s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1144s for 8192 events => throughput is 7.16E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.9037s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8707s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0330s for 8192 events => throughput is 2.48E+05 events/s -*** 
(3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8704143122579712E-004) and hip (3.8704143122579723E-004) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (3.8703612510102356E-004) and cuda (3.8703612510102367E-004) differ by less than 3E-14 (2.220446049250313e-16) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical -*** (3-hip) EXECUTE MADEVENT_HIP x10 (create events.lhe) *** +*** (3-cuda) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -391,65 +537,67 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.hip_d_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793532411914653E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 3.3262s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0704s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.2558s for 90112 events => throughput is 7.18E+04 events/s + [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 2.9957s + [COUNTERS] Fortran Overhead ( 0 ) : 2.6318s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3638s for 90112 events => throughput is 2.48E+05 events/s -*** (3-hip) Compare MADEVENT_HIP x10 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793532411914656E-004) and hip (1.5793532411914653E-004) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (1.5793438642451712E-004) and cuda (1.5793438642451712E-004) differ by less than 3E-14 (0.0) -*** (3-hip) Compare MADEVENT_HIP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.10 and events.lhe.ref.10 are identical +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.211429e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.275863e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.433723e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.513394e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.244345e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.126700e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.035029e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.163753e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 
14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.238048e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.128674e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.230999e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.183392e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.246450e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.130320e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.390998e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.455396e+05 ) sec^-1 + +*** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index 31f43ccc9a..97726609cd 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -1,41 +1,41 @@ -Working directory (build): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -make USEBUILDDIR=1 BACKEND=hip +make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' OMP_NUM_THREADS= -DATE: 2024-05-15_18:02:31 +DATE: 2024-05-16_01:34:39 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -49,18 +49,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704143122579712E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 [UNWEIGHT] Wrote 62 events (found 950 events) - [COUNTERS] PROGRAM TOTAL : 2.7732s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2741s - [COUNTERS] Fortran MEs ( 1 ) : 2.4991s for 8192 events => throughput is 3.28E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7506s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3591s + [COUNTERS] Fortran MEs ( 1 ) : 4.3916s for 8192 events => throughput is 1.87E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -74,18 +74,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704143122579712E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 2.7716s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2746s - [COUNTERS] Fortran MEs ( 1 ) : 2.4970s for 8192 events => throughput is 3.28E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7327s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3562s + [COUNTERS] Fortran MEs ( 1 ) : 4.3765s for 8192 events => throughput is 1.87E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -99,18 +99,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x10_fortran > /tmp/valassia/output_ggttgg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/avalassi/output_ggttgg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793532411914656E-004] fbridge_mode=0 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 29.0468s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5824s - [COUNTERS] Fortran MEs ( 1 ) : 27.4644s for 90112 events => throughput is 3.28E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=0 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 50.4568s + [COUNTERS] Fortran Overhead ( 0 ) : 2.1771s + [COUNTERS] Fortran MEs ( 1 ) : 48.2797s for 90112 events => throughput is 1.87E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -124,22 +124,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704259755238570E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703729438336302E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 7.0056s - [COUNTERS] Fortran Overhead ( 0 ) : 3.5999s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.4056s for 8192 events => throughput is 2.41E+03 events/s + [COUNTERS] PROGRAM TOTAL : 8.9135s + [COUNTERS] Fortran Overhead ( 0 ) : 4.5714s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.3421s for 8192 events => throughput is 1.89E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704259755238570E-004) differ by less than 4E-4 (3.0134411834747965e-06) +OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703729438336302E-004) differ by less than 4E-4 (3.021119383106452e-06) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -157,36 +157,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793580182117605E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 42.3763s - [COUNTERS] Fortran Overhead ( 0 ) : 4.9273s - [COUNTERS] CudaCpp MEs ( 2 ) : 37.4490s for 90112 events => throughput is 2.41E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793486626492658E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 54.1190s + [COUNTERS] Fortran Overhead ( 0 ) : 6.3214s + [COUNTERS] CudaCpp MEs ( 2 ) : 47.7976s for 90112 events => throughput is 1.89E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793580182117605E-004) differ by less than 4E-4 (3.024668687290344e-06) +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793486626492658E-004) differ by less than 4E-4 (3.0382263187522796e-06) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.466583e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.947180e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.476652e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.947353e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -200,22 +200,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704254541054809E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703722581317850E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 1.9705s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1203s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8502s for 8192 events => throughput is 9.64E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7184s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5261s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1923s for 8192 events => throughput is 6.87E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704254541054809E-004) differ by less than 4E-4 (2.8787221757475834e-06) +OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703722581317850E-004) differ by less than 4E-4 (2.843951981690296e-06) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -233,36 +233,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793578161882866E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 11.7274s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4156s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.3119s for 90112 events => throughput is 9.68E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793483759856148E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 16.5068s + [COUNTERS] Fortran Overhead ( 0 ) : 3.3022s + [COUNTERS] CudaCpp MEs ( 2 ) : 13.2046s for 90112 events => throughput is 6.82E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793578161882866E-004) differ by less than 4E-4 (2.896753368286653e-06) +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793483759856148E-004) differ by less than 4E-4 (2.856718252175483e-06) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.935669e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.978396e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.946862e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.983793e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -276,22 +276,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704254166302247E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703722425602170E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 0.9852s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6291s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3561s for 8192 events => throughput is 2.30E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.4122s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8788s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5334s for 8192 events => throughput is 1.54E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704254166302247E-004) differ by less than 4E-4 (2.8690396836061893e-06) +OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703722425602170E-004) differ by less than 4E-4 (2.8399286962077497e-06) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -309,44 +309,190 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793578009696313E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 5.8306s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9292s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.9014s for 90112 events => throughput is 2.31E+04 events/s + [XSECTION] Cross section = 0.0001579 [1.5793483698376133E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 8.5565s + [COUNTERS] Fortran Overhead ( 0 ) : 2.6677s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.8888s for 90112 events => throughput is 1.53E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793578009696313E-004) differ by less than 4E-4 (2.887117363403746e-06) +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793483698376133E-004) differ by less than 4E-4 (2.852825495613942e-06) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.387091e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.577584e+04 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.577489e+04 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.000387 [3.8703722425602170E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 121 events (found 923 events) + [COUNTERS] PROGRAM TOTAL : 1.2916s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8193s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4723s for 8192 events => throughput is 1.73E+04 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703722425602170E-004) differ by less than 4E-4 (2.8399286962077497e-06) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.0001579 [1.5793483698376133E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 7.7633s + [COUNTERS] Fortran Overhead ( 0 ) : 2.5957s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.1676s for 90112 events => throughput is 1.74E+04 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793483698376133E-004) differ by less than 4E-4 (2.852825495613942e-06) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.818661e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.366035e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.824534e+04 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! 
Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.000387 [3.8703728658657426E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 121 events (found 923 events) + [COUNTERS] PROGRAM TOTAL : 1.5274s + [COUNTERS] Fortran Overhead ( 0 ) : 0.9418s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5856s for 8192 events => throughput is 1.40E+04 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703728658657426E-004) differ by less than 4E-4 (3.0009745224379714e-06) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! 
Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.0001579 [1.5793486977281547E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 9.1749s + [COUNTERS] Fortran Overhead ( 0 ) : 2.6987s + [COUNTERS] CudaCpp MEs ( 2 ) : 6.4761s for 90112 events => throughput is 1.39E+04 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793486977281547E-004) differ by less than 4E-4 (3.0604373708609245e-06) -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.413533e+04 ) sec^-1 -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.415193e+04 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -358,28 +504,28 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_f_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704261630635685E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703736267486325E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 0.8375s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7812s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0563s for 8192 events => throughput is 1.46E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8657s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8443s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0214s 
for 8192 events => throughput is 3.83E+05 events/s -*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8704143122579712E-004) and hip (3.8704261630635685E-004) differ by less than 4E-4 (3.0618958697381515e-06) +OK! xsec from fortran (3.8703612510102356E-004) and cuda (3.8703736267486325E-004) differ by less than 4E-4 (3.1975667371675343e-06) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical -*** (3-hip) EXECUTE MADEVENT_HIP x10 (create events.lhe) *** +*** (3-cuda) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -391,65 +537,67 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.hip_f_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793580869662166E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 2.6159s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9951s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6208s for 90112 events => throughput is 1.45E+05 events/s + [XSECTION] Cross section = 0.0001579 [1.5793489323670813E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 2.8423s + [COUNTERS] Fortran Overhead ( 0 ) : 2.6065s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2358s for 90112 events => throughput is 3.82E+05 events/s -*** (3-hip) Compare MADEVENT_HIP x10 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793532411914656E-004) and hip (1.5793580869662166E-004) differ by less than 4E-4 (3.0682019858119247e-06) +OK! xsec from fortran (1.5793438642451712E-004) and cuda (1.5793489323670813E-004) differ by less than 4E-4 (3.20900471706409e-06) -*** (3-hip) Compare MADEVENT_HIP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.10 and events.lhe.ref.10 are identical +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.470415e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.583101e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.699093e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.931306e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.711466e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.570292e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.298693e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.724382e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 
14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.704023e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.573894e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.029112e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.720622e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.702723e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.518798e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.377197e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.538253e+05 ) sec^-1 + +*** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index 4ef822ce19..9161616d22 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -1,41 +1,41 @@ -Working directory (build): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -make USEBUILDDIR=1 BACKEND=hip +make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' OMP_NUM_THREADS= -DATE: 2024-05-15_18:05:30 +DATE: 2024-05-16_01:38:10 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -49,18 +49,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704143122579712E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 [UNWEIGHT] Wrote 62 events (found 950 events) - [COUNTERS] PROGRAM TOTAL : 2.7628s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2725s - [COUNTERS] Fortran MEs ( 1 ) : 2.4903s for 8192 events => throughput is 3.29E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7461s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3596s + [COUNTERS] Fortran MEs ( 1 ) : 4.3864s for 8192 events => throughput is 1.87E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -74,18 +74,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704143122579712E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 2.7729s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2757s - [COUNTERS] Fortran MEs ( 1 ) : 2.4972s for 8192 events => throughput is 3.28E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7384s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3570s + [COUNTERS] Fortran MEs ( 1 ) : 4.3814s for 8192 events => throughput is 1.87E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -99,18 +99,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x10_fortran > /tmp/valassia/output_ggttgg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/avalassi/output_ggttgg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793532411914656E-004] fbridge_mode=0 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 29.0273s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5864s - [COUNTERS] Fortran MEs ( 1 ) : 27.4409s for 90112 events => throughput is 3.28E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=0 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 50.4226s + [COUNTERS] Fortran Overhead ( 0 ) : 2.1630s + [COUNTERS] Fortran MEs ( 1 ) : 48.2596s for 90112 events => throughput is 1.87E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -124,22 +124,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704143272044121E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703612659176674E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 7.9143s - [COUNTERS] Fortran Overhead ( 0 ) : 4.0687s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.8455s for 8192 events => throughput is 2.13E+03 events/s + [COUNTERS] PROGRAM TOTAL : 9.4283s + [COUNTERS] Fortran Overhead ( 0 ) : 4.8187s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.6095s for 8192 events => throughput is 1.78E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704143272044121E-004) differ by less than 2E-4 (3.861716058040088e-09) +OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612659176674E-004) differ by less than 2E-4 (3.851690077993908e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -157,36 +157,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793532474032691E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 47.6831s - [COUNTERS] Fortran Overhead ( 0 ) : 5.3827s - [COUNTERS] CudaCpp MEs ( 2 ) : 42.3004s for 90112 events => throughput is 2.13E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793438704534934E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 56.9207s + [COUNTERS] Fortran Overhead ( 0 ) : 6.6165s + [COUNTERS] CudaCpp MEs ( 2 ) : 50.3042s for 90112 events => throughput is 1.79E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793532474032691E-004) differ by less than 2E-4 (3.933131154099101e-09) +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438704534934E-004) differ by less than 2E-4 (3.930950231989527e-09) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.166106e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.845749e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.162276e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.855084e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -200,22 +200,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704143304774347E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703612692816703E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 3.5538s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8968s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.6570s for 8192 events => throughput is 4.94E+03 events/s + [COUNTERS] PROGRAM TOTAL : 5.0818s + [COUNTERS] Fortran Overhead ( 0 ) : 2.6854s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.3963s for 8192 events => throughput is 3.42E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704143304774347E-004) differ by less than 2E-4 (4.707367828871156e-09) +OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612692816703E-004) differ by less than 2E-4 (4.720860369289426e-09) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -233,36 +233,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793532476698221E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 21.4115s - [COUNTERS] Fortran Overhead ( 0 ) : 3.1899s - [COUNTERS] CudaCpp MEs ( 2 ) : 18.2217s for 90112 events => throughput is 4.95E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793438707226035E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 30.7891s + [COUNTERS] Fortran Overhead ( 0 ) : 4.4469s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.3422s for 90112 events => throughput is 3.42E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793532476698221E-004) differ by less than 2E-4 (4.101904815811963e-09) +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438707226035E-004) differ by less than 2E-4 (4.1013439311399225e-09) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.129046e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.522464e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.113622e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.523204e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -276,22 +276,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704143287857844E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703612675240517E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 1.6365s - [COUNTERS] Fortran Overhead ( 0 ) : 0.9462s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6903s for 8192 events => throughput is 1.19E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.3962s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3623s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0339s for 8192 events => throughput is 7.92E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704143287857844E-004) differ by less than 2E-4 (4.2702956726259345e-09) +OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612675240517E-004) differ by less than 2E-4 (4.266737629876616e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -309,44 +309,190 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793532473043530E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 9.8255s - [COUNTERS] Fortran Overhead ( 0 ) : 2.2612s - [COUNTERS] CudaCpp MEs ( 2 ) : 7.5644s for 90112 events => throughput is 1.19E+04 events/s + [XSECTION] Cross section = 0.0001579 [1.5793438703631775E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 14.5036s + [COUNTERS] Fortran Overhead ( 0 ) : 3.1328s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.3708s for 90112 events => throughput is 7.92E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793532473043530E-004) differ by less than 2E-4 (3.870500364655527e-09) +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438703631775E-004) differ by less than 2E-4 (3.873764420347925e-09) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.230886e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.125066e+03 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.074324e+03 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.000387 [3.8703612675240517E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 121 events (found 923 events) + [COUNTERS] PROGRAM TOTAL : 2.1502s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2343s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9159s for 8192 events => throughput is 8.94E+03 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612675240517E-004) differ by less than 2E-4 (4.266737629876616e-09) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.0001579 [1.5793438703631775E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 12.9975s + [COUNTERS] Fortran Overhead ( 0 ) : 3.0012s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.9962s for 90112 events => throughput is 9.01E+03 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438703631775E-004) differ by less than 2E-4 (3.873764420347925e-09) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.351869e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.229330e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.208773e+03 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! 
Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.000387 [3.8703612675240517E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 121 events (found 923 events) + [COUNTERS] PROGRAM TOTAL : 2.7414s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5394s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.2020s for 8192 events => throughput is 6.82E+03 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612675240517E-004) differ by less than 2E-4 (4.266737629876616e-09) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! 
Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.0001579 [1.5793438703631775E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 16.5337s + [COUNTERS] Fortran Overhead ( 0 ) : 3.3003s + [COUNTERS] CudaCpp MEs ( 2 ) : 13.2334s for 90112 events => throughput is 6.81E+03 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438703631775E-004) differ by less than 2E-4 (3.873764420347925e-09) -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.881370e+03 ) sec^-1 -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.943865e+03 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -358,28 +504,28 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_m_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704143124638075E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703612512203166E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 0.8776s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7626s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1149s for 8192 events => throughput is 7.13E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.9014s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8685s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0329s 
for 8192 events => throughput is 2.49E+05 events/s -*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8704143122579712E-004) and hip (3.8704143124638075E-004) differ by less than 2E-4 (5.318190332559425e-11) +OK! xsec from fortran (3.8703612510102356E-004) and cuda (3.8703612512203166E-004) differ by less than 2E-4 (5.427946980773868e-11) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical -*** (3-hip) EXECUTE MADEVENT_HIP x10 (create events.lhe) *** +*** (3-cuda) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -391,65 +537,67 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.hip_m_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793532411887058E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 3.3524s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0921s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.2604s for 90112 events => throughput is 7.15E+04 events/s + [XSECTION] Cross section = 0.0001579 [1.5793438642387717E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 2.9899s + [COUNTERS] Fortran Overhead ( 0 ) : 2.6262s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3637s for 90112 events => throughput is 2.48E+05 events/s -*** (3-hip) Compare MADEVENT_HIP x10 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793532411914656E-004) and hip (1.5793532411887058E-004) differ by less than 2E-4 (1.7474910407599964e-12) +OK! xsec from fortran (1.5793438642451712E-004) and cuda (1.5793438642387717E-004) differ by less than 2E-4 (4.051980972974434e-12) -*** (3-hip) Compare MADEVENT_HIP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.10 and events.lhe.ref.10 are identical +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.190579e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.280457e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.438917e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.523385e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.245869e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.121733e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.024461e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.162091e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 
14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.240706e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.125438e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.227786e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.168234e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.245800e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.125929e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.380658e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.451563e+05 ) sec^-1 + +*** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index a037fad221..f87c8c9cf1 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -1,41 +1,41 @@ -Working directory (build): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -make USEBUILDDIR=1 BACKEND=hip +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cppsse4 -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' OMP_NUM_THREADS= -DATE: 2024-05-15_17:49:11 +DATE: 2024-05-16_01:43:56 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -49,18 +49,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403628942014972E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 54.5801s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4570s - [COUNTERS] Fortran MEs ( 1 ) : 54.1232s for 8192 events => throughput is 1.51E+02 events/s + [COUNTERS] PROGRAM TOTAL : 101.7235s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5049s + [COUNTERS] Fortran MEs ( 1 ) : 101.2186s for 8192 events => throughput is 8.09E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -74,18 +74,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403628942014972E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 54.4417s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3799s - [COUNTERS] Fortran MEs ( 1 ) : 54.0618s for 8192 events => throughput is 1.52E+02 events/s + [COUNTERS] PROGRAM TOTAL : 101.7703s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5109s + [COUNTERS] Fortran MEs ( 1 ) : 101.2594s for 8192 events => throughput is 8.09E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -99,18 +99,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x10_fortran > /tmp/valassia/output_ggttggg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322783648085419E-007] fbridge_mode=0 + [XSECTION] Cross section = 2.332e-07 [2.3322993086655967E-007] fbridge_mode=0 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 596.9350s - [COUNTERS] Fortran Overhead ( 0 ) : 3.0436s - [COUNTERS] Fortran MEs ( 1 ) : 593.8914s for 90112 events => throughput is 1.52E+02 events/s + [COUNTERS] PROGRAM TOTAL : 1119.4272s + [COUNTERS] Fortran Overhead ( 0 ) : 4.4297s + [COUNTERS] Fortran MEs ( 1 ) : 1114.9976s for 90112 events => throughput is 8.08E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -124,22 +124,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403628942015001E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939193E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 175.8690s - [COUNTERS] Fortran Overhead ( 0 ) : 80.1913s - [COUNTERS] CudaCpp MEs ( 2 ) : 95.6777s for 8192 events => throughput is 8.56E+01 events/s + [COUNTERS] PROGRAM TOTAL : 236.4243s + [COUNTERS] Fortran Overhead ( 0 ) : 108.7156s + [COUNTERS] CudaCpp MEs ( 2 ) : 127.7087s for 8192 events => throughput is 6.41E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2403628942015001E-006) differ by less than 3E-14 (2.4424906541753444e-15) +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985227939193E-006) differ by less than 3E-14 (1.5543122344752192e-15) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -157,36 +157,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322783648085453E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.332e-07 [2.3322993086656014E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1135.3523s - [COUNTERS] Fortran Overhead ( 0 ) : 83.0183s - [COUNTERS] CudaCpp MEs ( 2 ) : 1052.3340s for 90112 events => throughput is 8.56E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1507.2346s + [COUNTERS] Fortran Overhead ( 0 ) : 112.2012s + [COUNTERS] CudaCpp MEs ( 2 ) : 1395.0334s for 90112 events => throughput is 6.46E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3322783648085453E-007) differ by less than 3E-14 (1.5543122344752192e-15) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993086656014E-007) differ by less than 3E-14 (1.9984014443252818e-15) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.027176e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.611342e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.001904e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.612518e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -200,22 +200,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403628942015003E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939197E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 90.5736s - [COUNTERS] Fortran Overhead ( 0 ) : 41.2684s - [COUNTERS] CudaCpp MEs ( 2 ) : 49.3052s for 8192 events => throughput is 1.66E+02 events/s + [COUNTERS] PROGRAM TOTAL : 114.8253s + [COUNTERS] Fortran Overhead ( 0 ) : 52.9129s + [COUNTERS] CudaCpp MEs ( 2 ) : 61.9124s for 8192 events => throughput is 1.32E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2403628942015003E-006) differ by less than 3E-14 (2.6645352591003757e-15) +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985227939197E-006) differ by less than 3E-14 (1.7763568394002505e-15) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -233,36 +233,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322783648085448E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.332e-07 [2.3322993086656017E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 538.9163s - [COUNTERS] Fortran Overhead ( 0 ) : 40.6550s - [COUNTERS] CudaCpp MEs ( 2 ) : 498.2612s for 90112 events => throughput is 1.81E+02 events/s + [COUNTERS] PROGRAM TOTAL : 734.6001s + [COUNTERS] Fortran Overhead ( 0 ) : 56.8950s + [COUNTERS] CudaCpp MEs ( 2 ) : 677.7051s for 90112 events => throughput is 1.33E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3322783648085448E-007) differ by less than 3E-14 (1.3322676295501878e-15) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993086656017E-007) differ by less than 3E-14 (2.220446049250313e-15) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.251988e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.573216e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.245083e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.570652e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -276,22 +276,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403628942015001E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939191E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 35.5321s - [COUNTERS] Fortran Overhead ( 0 ) : 16.1409s - [COUNTERS] CudaCpp MEs ( 2 ) : 19.3912s for 8192 events => throughput is 4.22E+02 events/s + [COUNTERS] PROGRAM TOTAL : 53.5594s + [COUNTERS] Fortran Overhead ( 0 ) : 24.8692s + [COUNTERS] CudaCpp MEs ( 2 ) : 28.6902s for 8192 events => throughput is 2.86E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2403628942015001E-006) differ by less than 3E-14 (2.4424906541753444e-15) +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985227939191E-006) differ by less than 3E-14 (1.3322676295501878e-15) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -309,44 +309,190 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322783648085445E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.332e-07 [2.3322993086656014E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 233.9561s - [COUNTERS] Fortran Overhead ( 0 ) : 18.6633s - [COUNTERS] CudaCpp MEs ( 2 ) : 215.2929s for 90112 events => throughput is 4.19E+02 events/s + [COUNTERS] PROGRAM TOTAL : 345.8816s + [COUNTERS] Fortran Overhead ( 0 ) : 28.6165s + [COUNTERS] CudaCpp MEs ( 2 ) : 317.2651s for 90112 events => throughput is 2.84E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3322783648085445E-007) differ by less than 3E-14 (1.1102230246251565e-15) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993086656014E-007) differ by less than 3E-14 (1.9984014443252818e-15) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.255758e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.346027e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.397864e+02 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939191E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 70 events (found 407 events) + [COUNTERS] PROGRAM TOTAL : 47.5433s + [COUNTERS] Fortran Overhead ( 0 ) : 21.7991s + [COUNTERS] CudaCpp MEs ( 2 ) : 25.7442s for 8192 events => throughput is 3.18E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985227939191E-006) differ by less than 3E-14 (1.3322676295501878e-15) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.332e-07 [2.3322993086656014E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 303 events (found 1531 events) + [COUNTERS] PROGRAM TOTAL : 310.2994s + [COUNTERS] Fortran Overhead ( 0 ) : 25.7446s + [COUNTERS] CudaCpp MEs ( 2 ) : 284.5548s for 90112 events => throughput is 3.17E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993086656014E-007) differ by less than 3E-14 (1.9984014443252818e-15) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.866314e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.252702e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.859864e+02 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! 
Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939191E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 70 events (found 407 events) + [COUNTERS] PROGRAM TOTAL : 50.4926s + [COUNTERS] Fortran Overhead ( 0 ) : 24.7479s + [COUNTERS] CudaCpp MEs ( 2 ) : 25.7447s for 8192 events => throughput is 3.18E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985227939191E-006) differ by less than 3E-14 (1.3322676295501878e-15) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! 
Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.332e-07 [2.3322993086656014E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 303 events (found 1531 events) + [COUNTERS] PROGRAM TOTAL : 313.6701s + [COUNTERS] Fortran Overhead ( 0 ) : 28.5625s + [COUNTERS] CudaCpp MEs ( 2 ) : 285.1075s for 90112 events => throughput is 3.16E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993086656014E-007) differ by less than 3E-14 (1.9984014443252818e-15) -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.394651e+02 ) sec^-1 -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.384790e+02 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -358,28 +504,28 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_d_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403628942015003E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939195E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 11.5992s - [COUNTERS] Fortran Overhead ( 0 ) : 7.7374s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.8618s for 8192 events => throughput is 2.12E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.2708s + [COUNTERS] Fortran Overhead ( 0 ) : 3.1879s + [COUNTERS] CudaCpp MEs ( 2 ) : 
1.0829s for 8192 events => throughput is 7.56E+03 events/s -*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403628942014972E-006) and hip (1.2403628942015003E-006) differ by less than 3E-14 (2.6645352591003757e-15) +OK! xsec from fortran (1.2403985227939174E-006) and cuda (1.2403985227939195E-006) differ by less than 3E-14 (1.7763568394002505e-15) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical -*** (3-hip) EXECUTE MADEVENT_HIP x10 (create events.lhe) *** +*** (3-cuda) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -391,65 +537,67 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.hip_d_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322783648085437E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.332e-07 [2.3322993086656006E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 52.7480s - [COUNTERS] Fortran Overhead ( 0 ) : 10.1979s - [COUNTERS] CudaCpp MEs ( 2 ) : 42.5501s for 90112 events => throughput is 2.12E+03 events/s + [COUNTERS] PROGRAM TOTAL : 19.1407s + [COUNTERS] Fortran Overhead ( 0 ) : 7.1896s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.9511s for 90112 events => throughput is 7.54E+03 events/s -*** (3-hip) Compare MADEVENT_HIP x10 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322783648085419E-007) and hip (2.3322783648085437E-007) differ by less than 3E-14 (8.881784197001252e-16) +OK! xsec from fortran (2.3322993086655967E-007) and cuda (2.3322993086656006E-007) differ by less than 3E-14 (1.7763568394002505e-15) -*** (3-hip) Compare MADEVENT_HIP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.10 and events.lhe.ref.10 are identical +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.140157e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.518899e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.190115e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.266687e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.507331e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.285867e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.458846e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.577065e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 
14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.517358e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.302340e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.499481e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.485177e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.517207e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.239249e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.115383e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.236704e+03 ) sec^-1 + +*** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index 5df6ece655..9938780c0a 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -1,41 +1,41 @@ -Working directory (build): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -make USEBUILDDIR=1 BACKEND=hip -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' OMP_NUM_THREADS= -DATE: 2024-05-15_18:44:54 +DATE: 2024-05-16_03:16:41 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -49,18 +49,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403628942014972E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 55.3069s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3661s - [COUNTERS] Fortran MEs ( 1 ) : 54.9408s for 8192 events => throughput is 1.49E+02 events/s + [COUNTERS] PROGRAM TOTAL : 101.6786s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5071s + [COUNTERS] Fortran MEs ( 1 ) : 101.1715s for 8192 events => throughput is 8.10E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -74,18 +74,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403628942014972E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 54.5783s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3949s - [COUNTERS] Fortran MEs ( 1 ) : 54.1834s for 8192 events => throughput is 1.51E+02 events/s + [COUNTERS] PROGRAM TOTAL : 102.1420s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5108s + [COUNTERS] Fortran MEs ( 1 ) : 101.6312s for 8192 events => throughput is 8.06E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -99,18 +99,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x10_fortran > /tmp/valassia/output_ggttggg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322783648085419E-007] fbridge_mode=0 + [XSECTION] Cross section = 2.332e-07 [2.3322993086655967E-007] fbridge_mode=0 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 630.0786s - [COUNTERS] Fortran Overhead ( 0 ) : 3.1885s - [COUNTERS] Fortran MEs ( 1 ) : 626.8901s for 90112 events => throughput is 1.44E+02 events/s + [COUNTERS] PROGRAM TOTAL : 1119.6489s + [COUNTERS] Fortran Overhead ( 0 ) : 4.4200s + [COUNTERS] Fortran MEs ( 1 ) : 1115.2289s for 90112 events => throughput is 8.08E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -124,23 +124,23 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.241e-06 [1.2405363572559468E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.241e-06 [1.2405719957040752E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 162.5285s - [COUNTERS] Fortran Overhead ( 0 ) : 74.4601s - [COUNTERS] CudaCpp MEs ( 2 ) : 88.0684s for 8192 events => throughput is 9.30E+01 events/s + [COUNTERS] PROGRAM TOTAL : 207.9761s + [COUNTERS] Fortran Overhead ( 0 ) : 95.5518s + [COUNTERS] CudaCpp MEs ( 2 ) : 112.4243s for 8192 events => throughput is 7.29E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2405363572559468E-006) differ by less than 4E-4 (0.00013984863241267576) +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2405719957040752E-006) differ by less than 4E-4 (0.00013985256106807675) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,37 +158,37 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.333e-07 [2.3326080615569212E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.333e-07 [2.3326290771198648E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1044.5931s - [COUNTERS] Fortran Overhead ( 0 ) : 77.5798s - [COUNTERS] CudaCpp MEs ( 2 ) : 967.0134s for 90112 events => throughput is 9.32E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1342.0233s + [COUNTERS] Fortran Overhead ( 0 ) : 99.5419s + [COUNTERS] CudaCpp MEs ( 2 ) : 1242.4814s for 90112 events => throughput is 7.25E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3326080615569212E-007) differ by less than 4E-4 (0.00014136252059526733) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326290771198648E-007) differ by less than 4E-4 (0.00014139199589124907) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.108837e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.627892e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.107427e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.617246e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,23 +202,23 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.241e-06 [1.2405361288903015E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.241e-06 [1.2405717007921116E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 39.5468s - [COUNTERS] Fortran Overhead ( 0 ) : 18.1923s - [COUNTERS] CudaCpp MEs ( 2 ) : 21.3545s for 8192 events => throughput is 3.84E+02 events/s + [COUNTERS] PROGRAM TOTAL : 52.4912s + [COUNTERS] Fortran Overhead ( 0 ) : 24.8093s + [COUNTERS] CudaCpp MEs ( 2 ) : 27.6819s for 8192 events => throughput is 2.96E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2405361288903015E-006) differ by less than 4E-4 (0.0001396645204514435) +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2405717007921116E-006) differ by less than 4E-4 (0.00013961480525170877) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -236,37 +236,37 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.333e-07 [2.3326076878598447E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.333e-07 [2.3326284900828787E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 254.9926s - [COUNTERS] Fortran Overhead ( 0 ) : 20.7416s - [COUNTERS] CudaCpp MEs ( 2 ) : 234.2510s for 90112 events => throughput is 3.85E+02 events/s + [COUNTERS] PROGRAM TOTAL : 333.5578s + [COUNTERS] Fortran Overhead ( 0 ) : 28.7441s + [COUNTERS] CudaCpp MEs ( 2 ) : 304.8137s for 90112 events => throughput is 2.96E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3326076878598447E-007) differ by less than 4E-4 (0.00014120229226155523) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326284900828787E-007) differ by less than 4E-4 (0.00014114029707035236) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.743171e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.354565e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.623247e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.352919e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -280,23 +280,23 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.241e-06 [1.2405360895331841E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.241e-06 [1.2405716659252656E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 18.1778s - [COUNTERS] Fortran Overhead ( 0 ) : 8.4615s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.7163s for 8192 events => throughput is 8.43E+02 events/s + [COUNTERS] PROGRAM TOTAL : 26.9353s + [COUNTERS] Fortran Overhead ( 0 ) : 12.5805s + [COUNTERS] CudaCpp MEs ( 2 ) : 14.3549s for 8192 events => throughput is 5.71E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2405360895331841E-006) differ by less than 4E-4 (0.00013963279012663143) +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2405716659252656E-006) differ by less than 4E-4 (0.00013958669586155992) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -314,45 +314,195 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.333e-07 [2.3326069099562333E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.333e-07 [2.3326277036840957E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 117.9708s - [COUNTERS] Fortran Overhead ( 0 ) : 10.8812s - [COUNTERS] CudaCpp MEs ( 2 ) : 107.0896s for 90112 events => throughput is 8.41E+02 events/s + [COUNTERS] PROGRAM TOTAL : 175.5244s + [COUNTERS] Fortran Overhead ( 0 ) : 16.6137s + [COUNTERS] CudaCpp MEs ( 2 ) : 158.9107s for 90112 events => throughput is 5.67E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3326069099562333E-007) differ by less than 4E-4 (0.00014086875419705436) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326277036840957E-007) differ by less than 4E-4 (0.00014080311959907554) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.035611e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.796719e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.813418e+02 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' +INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.241e-06 [1.2405716659252656E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 70 events (found 407 events) + [COUNTERS] PROGRAM TOTAL : 24.0035s + [COUNTERS] Fortran Overhead ( 0 ) : 11.1136s + [COUNTERS] CudaCpp MEs ( 2 ) : 12.8899s for 8192 events => throughput is 6.36E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2405716659252656E-006) differ by less than 4E-4 (0.00013958669586155992) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' +INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.333e-07 [2.3326277036840957E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 303 events (found 1531 events) + [COUNTERS] PROGRAM TOTAL : 155.2962s + [COUNTERS] Fortran Overhead ( 0 ) : 15.0900s + [COUNTERS] CudaCpp MEs ( 2 ) : 140.2063s for 90112 events => throughput is 6.43E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326277036840957E-007) differ by less than 4E-4 (0.00014080311959907554) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.795563e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.043163e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.781198e+02 ) sec^-1 -*** (2-512y) WARNING! 
SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' +INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.241e-06 [1.2405719306052570E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 70 events (found 407 events) + [COUNTERS] PROGRAM TOTAL : 25.3167s + [COUNTERS] Fortran Overhead ( 0 ) : 12.6125s + [COUNTERS] CudaCpp MEs ( 2 ) : 12.7041s for 8192 events => throughput is 6.45E+02 events/s -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2405719306052570E-006) differ by less than 4E-4 (0.00013980007888836354) -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' +INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.333e-07 [2.3326283660088769E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 303 events (found 1531 events) + [COUNTERS] PROGRAM TOTAL : 157.3576s + [COUNTERS] Fortran Overhead ( 0 ) : 16.6484s + [COUNTERS] CudaCpp MEs ( 2 ) : 140.7092s for 90112 events => throughput is 6.40E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326283660088769E-007) differ by less than 4E-4 (0.00014108709892313165) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.841550e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.830427e+02 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -364,28 +514,28 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_f_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.241e-06 [1.2405363557292459E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.241e-06 [1.2405722175509512E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 6.2175s - [COUNTERS] Fortran Overhead ( 0 ) : 4.3549s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.8626s for 8192 events => throughput is 4.40E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.5511s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0591s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4920s for 8192 events => throughput is 1.66E+04 events/s 
-*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403628942014972E-006) and hip (1.2405363557292459E-006) differ by less than 4E-4 (0.00013984740156258724) +OK! xsec from fortran (1.2403985227939174E-006) and cuda (1.2405722175509512E-006) differ by less than 4E-4 (0.00014003141235829908) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical -*** (3-hip) EXECUTE MADEVENT_HIP x10 (create events.lhe) *** +*** (3-cuda) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -397,65 +547,67 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.hip_f_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.333e-07 [2.3326074784076956E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.333e-07 [2.3326296967941821E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 27.3553s - [COUNTERS] Fortran Overhead ( 0 ) : 7.0476s - [COUNTERS] CudaCpp MEs ( 2 ) : 20.3077s for 90112 events => throughput is 4.44E+03 events/s + [COUNTERS] PROGRAM TOTAL : 11.4353s + [COUNTERS] Fortran Overhead ( 0 ) : 6.0032s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.4321s for 90112 events => throughput is 1.66E+04 events/s -*** (3-hip) Compare MADEVENT_HIP x10 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322783648085419E-007) and hip (2.3326074784076956E-007) differ by less than 4E-4 (0.00014111248645076735) +OK! xsec from fortran (2.3322993086655967E-007) and cuda (2.3326296967941821E-007) differ by less than 4E-4 (0.0001416576883412901) -*** (3-hip) Compare MADEVENT_HIP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.10 and events.lhe.ref.10 are identical +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.444985e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.630624e+04 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.492484e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.646596e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.263002e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.329013e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.464674e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.359221e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 
14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.314870e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.329144e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.016933e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.339287e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.259649e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.285838e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.069920e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.423096e+03 ) sec^-1 + +*** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 10d4f0ce63..9cddd5fe7c 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,41 +1,41 @@ -Working directory (build): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -make USEBUILDDIR=1 BACKEND=hip +make USEBUILDDIR=1 BACKEND=cuda + make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: 
Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' OMP_NUM_THREADS= -DATE: 2024-05-15_19:29:44 +DATE: 2024-05-16_04:26:13 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -49,18 +49,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403628942014972E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 54.4825s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3679s - [COUNTERS] Fortran MEs ( 1 ) : 54.1146s for 8192 events => throughput is 1.51E+02 events/s + [COUNTERS] PROGRAM TOTAL : 101.9892s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5028s + [COUNTERS] Fortran MEs ( 1 ) : 101.4864s for 8192 events => throughput is 8.07E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -74,18 +74,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403628942014972E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 54.3980s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3673s - [COUNTERS] Fortran MEs ( 1 ) : 54.0306s for 8192 events => throughput is 1.52E+02 events/s + [COUNTERS] PROGRAM TOTAL : 101.7400s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5055s + [COUNTERS] Fortran MEs ( 1 ) : 101.2345s for 8192 events => throughput is 8.09E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -99,18 +99,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x10_fortran > /tmp/valassia/output_ggttggg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322783648085419E-007] fbridge_mode=0 + [XSECTION] Cross section = 2.332e-07 [2.3322993086655967E-007] fbridge_mode=0 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 597.9891s - [COUNTERS] Fortran Overhead ( 0 ) : 3.0543s - [COUNTERS] Fortran MEs ( 1 ) : 594.9349s for 90112 events => throughput is 1.51E+02 events/s + [COUNTERS] PROGRAM TOTAL : 1119.6356s + [COUNTERS] Fortran Overhead ( 0 ) : 4.4254s + [COUNTERS] Fortran MEs ( 1 ) : 1115.2102s for 90112 events => throughput is 8.08E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -124,22 +124,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403629013416990E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.24e-06 [1.2403985299359844E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 174.8815s - [COUNTERS] Fortran Overhead ( 0 ) : 80.3539s - [COUNTERS] CudaCpp MEs ( 2 ) : 94.5275s for 8192 events => throughput is 8.67E+01 events/s + [COUNTERS] PROGRAM TOTAL : 229.4020s + [COUNTERS] Fortran Overhead ( 0 ) : 103.2152s + [COUNTERS] CudaCpp MEs ( 2 ) : 126.1868s for 8192 events => throughput is 6.49E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2403629013416990E-006) differ by less than 2E-4 (5.7565425759520394e-09) +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985299359844E-006) differ by less than 2E-4 (5.7578810608305275e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -157,36 +157,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322783773791503E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.332e-07 [2.3322993212353001E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1131.1467s - [COUNTERS] Fortran Overhead ( 0 ) : 83.5012s - [COUNTERS] CudaCpp MEs ( 2 ) : 1047.6455s for 90112 events => throughput is 8.60E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1528.1049s + [COUNTERS] Fortran Overhead ( 0 ) : 113.9982s + [COUNTERS] CudaCpp MEs ( 2 ) : 1414.1067s for 90112 events => throughput is 6.37E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3322783773791503E-007) differ by less than 2E-4 (5.389840573855054e-09) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993212353001E-007) differ by less than 2E-4 (5.389404034161771e-09) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.026996e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.425842e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.025921e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.948869e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -200,22 +200,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403629009850969E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.24e-06 [1.2403985295828471E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 79.7722s - [COUNTERS] Fortran Overhead ( 0 ) : 35.7094s - [COUNTERS] CudaCpp MEs ( 2 ) : 44.0629s for 8192 events => throughput is 1.86E+02 events/s + [COUNTERS] PROGRAM TOTAL : 117.4242s + [COUNTERS] Fortran Overhead ( 0 ) : 53.6967s + [COUNTERS] CudaCpp MEs ( 2 ) : 63.7275s for 8192 events => throughput is 1.29E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2403629009850969E-006) differ by less than 2E-4 (5.469044328521022e-09) +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985295828471E-006) differ by less than 2E-4 (5.473184350179849e-09) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -233,36 +233,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322783784120318E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.332e-07 [2.3322993222645653E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 520.7034s - [COUNTERS] Fortran Overhead ( 0 ) : 38.1340s - [COUNTERS] CudaCpp MEs ( 2 ) : 482.5694s for 90112 events => throughput is 1.87E+02 events/s + [COUNTERS] PROGRAM TOTAL : 756.7451s + [COUNTERS] Fortran Overhead ( 0 ) : 57.7650s + [COUNTERS] CudaCpp MEs ( 2 ) : 698.9802s for 90112 events => throughput is 1.29E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3322783784120318E-007) differ by less than 2E-4 (5.832704319530535e-09) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993222645653E-007) differ by less than 2E-4 (5.830713245558172e-09) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.332150e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.540886e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.336891e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.526888e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -276,22 +276,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403629007633195E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.24e-06 [1.2403985293629285E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 34.3120s - [COUNTERS] Fortran Overhead ( 0 ) : 15.4276s - [COUNTERS] CudaCpp MEs ( 2 ) : 18.8844s for 8192 events => throughput is 4.34E+02 events/s + [COUNTERS] PROGRAM TOTAL : 51.3666s + [COUNTERS] Fortran Overhead ( 0 ) : 23.6472s + [COUNTERS] CudaCpp MEs ( 2 ) : 27.7194s for 8192 events => throughput is 2.96E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2403629007633195E-006) differ by less than 2E-4 (5.290244020628165e-09) +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985293629285E-006) differ by less than 2E-4 (5.29588750630694e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -309,44 +309,190 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322783783946155E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.332e-07 [2.3322993222447204E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 225.6923s - [COUNTERS] Fortran Overhead ( 0 ) : 17.9551s - [COUNTERS] CudaCpp MEs ( 2 ) : 207.7372s for 90112 events => throughput is 4.34E+02 events/s + [COUNTERS] PROGRAM TOTAL : 334.4015s + [COUNTERS] Fortran Overhead ( 0 ) : 27.7321s + [COUNTERS] CudaCpp MEs ( 2 ) : 306.6693s for 90112 events => throughput is 2.94E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3322783783946155E-007) differ by less than 2E-4 (5.825236737422301e-09) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993222447204E-007) differ by less than 2E-4 (5.822204496297445e-09) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.503499e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.517938e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.525606e+02 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.24e-06 [1.2403985293629285E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 70 events (found 407 events) + [COUNTERS] PROGRAM TOTAL : 44.9641s + [COUNTERS] Fortran Overhead ( 0 ) : 20.5328s + [COUNTERS] CudaCpp MEs ( 2 ) : 24.4313s for 8192 events => throughput is 3.35E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985293629285E-006) differ by less than 2E-4 (5.29588750630694e-09) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.332e-07 [2.3322993222447204E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 303 events (found 1531 events) + [COUNTERS] PROGRAM TOTAL : 294.2131s + [COUNTERS] Fortran Overhead ( 0 ) : 24.5260s + [COUNTERS] CudaCpp MEs ( 2 ) : 269.6871s for 90112 events => throughput is 3.34E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993222447204E-007) differ by less than 2E-4 (5.822204496297445e-09) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.118767e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.531529e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.099496e+02 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! 
Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.24e-06 [1.2403985293629285E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 70 events (found 407 events) + [COUNTERS] PROGRAM TOTAL : 49.6272s + [COUNTERS] Fortran Overhead ( 0 ) : 23.9735s + [COUNTERS] CudaCpp MEs ( 2 ) : 25.6537s for 8192 events => throughput is 3.19E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985293629285E-006) differ by less than 2E-4 (5.29588750630694e-09) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! 
Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.332e-07 [2.3322993222447204E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 303 events (found 1531 events) + [COUNTERS] PROGRAM TOTAL : 308.1385s + [COUNTERS] Fortran Overhead ( 0 ) : 28.1125s + [COUNTERS] CudaCpp MEs ( 2 ) : 280.0260s for 90112 events => throughput is 3.22E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993222447204E-007) differ by less than 2E-4 (5.822204496297445e-09) -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.494217e+02 ) sec^-1 -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.498492e+02 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -358,28 +504,28 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_m_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403628931370709E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.24e-06 [1.2403985217419736E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 12.5547s - [COUNTERS] Fortran Overhead ( 0 ) : 8.2984s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.2563s for 8192 events => throughput is 1.92E+03 events/s + [COUNTERS] PROGRAM TOTAL : 3.6127s + [COUNTERS] Fortran Overhead ( 0 ) : 2.7479s + [COUNTERS] CudaCpp MEs ( 2 ) : 
0.8648s for 8192 events => throughput is 9.47E+03 events/s -*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403628942014972E-006) and hip (1.2403628931370709E-006) differ by less than 2E-4 (8.581571009358413e-10) +OK! xsec from fortran (1.2403985227939174E-006) and cuda (1.2403985217419736E-006) differ by less than 2E-4 (8.480691704448873e-10) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical -*** (3-hip) EXECUTE MADEVENT_HIP x10 (create events.lhe) *** +*** (3-cuda) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -391,65 +537,67 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.hip_m_inl0_hrd0/madevent_hip < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322783640044522E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.332e-07 [2.3322993078576733E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 57.4092s - [COUNTERS] Fortran Overhead ( 0 ) : 10.6449s - [COUNTERS] CudaCpp MEs ( 2 ) : 46.7643s for 90112 events => throughput is 1.93E+03 events/s + [COUNTERS] PROGRAM TOTAL : 16.2177s + [COUNTERS] Fortran Overhead ( 0 ) : 6.7356s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.4821s for 90112 events => throughput is 9.50E+03 events/s -*** (3-hip) Compare MADEVENT_HIP x10 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322783648085419E-007) and hip (2.3322783640044522E-007) differ by less than 2E-4 (3.447657714872321e-10) +OK! xsec from fortran (2.3322993086655967E-007) and cuda (2.3322993078576733E-007) differ by less than 2E-4 (3.464063480507207e-10) -*** (3-hip) Compare MADEVENT_HIP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.10 and events.lhe.ref.10 are identical +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.942273e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.422089e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.976894e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.074505e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.293371e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.108350e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.389005e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.160591e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 
14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.289349e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.110190e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.258286e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.113742e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.285627e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.112799e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.050475e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.647292e+03 ) sec^-1 + +*** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 7219562cec..c909267a2d 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -1,41 +1,41 @@ -Working directory (build): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu -make USEBUILDDIR=1 BACKEND=hip +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 BACKEND=cpp512z 
-make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' OMP_NUM_THREADS= -DATE: 2024-05-15_18:09:11 +DATE: 2024-05-16_01:42:38 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -49,18 +49,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110226551166922] fbridge_mode=0 + [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1817 events) - [COUNTERS] PROGRAM TOTAL : 0.4916s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4435s - [COUNTERS] Fortran MEs ( 1 ) : 0.0482s for 8192 events => throughput is 1.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4817s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4067s + [COUNTERS] Fortran MEs ( 1 ) : 0.0751s for 8192 events => throughput is 1.09E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -74,18 +74,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110226551166922] fbridge_mode=0 + [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3086s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2604s - [COUNTERS] Fortran MEs ( 1 ) : 0.0482s for 8192 events => throughput is 1.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4101s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3363s + [COUNTERS] Fortran MEs ( 1 ) : 0.0738s for 8192 events => throughput is 1.11E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -99,18 +99,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x10_fortran > /tmp/valassia/output_gqttq_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/avalassi/output_gqttq_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510679754343823] fbridge_mode=0 + [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=0 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.7352s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2090s - [COUNTERS] Fortran MEs ( 1 ) : 0.5262s for 90112 events => throughput is 1.71E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.3997s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5919s + [COUNTERS] Fortran MEs ( 1 ) : 0.8079s for 90112 events => throughput is 1.12E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -124,22 +124,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110226551166922] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110539351263335] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.4010s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3306s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0703s for 8192 events => throughput is 1.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4922s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4133s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0789s for 8192 events => throughput is 1.04E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110226551166922) and cpp (0.27110226551166922) differ by less than 3E-14 (0.0) +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539351263335) differ by less than 3E-14 (2.220446049250313e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -157,36 +157,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510679754343820] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510686556561293] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.0619s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2914s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7705s for 90112 events => throughput is 1.17E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.5284s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6490s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8794s for 90112 events => throughput is 1.02E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510679754343823) and cpp (0.21510679754343820) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686556561293) differ by less than 3E-14 (1.1102230246251565e-16) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.207564e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.038604e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.207048e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.042402e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -200,22 +200,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110226551166122] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110539351262530] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3332s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2993s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0339s for 8192 events => throughput is 2.41E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4203s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3775s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0428s for 8192 events => throughput is 1.91E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110226551166922) and cpp (0.27110226551166122) differ by less than 3E-14 (2.9531932455029164e-14) +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539351262530) differ by less than 3E-14 (2.9531932455029164e-14) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -233,36 +233,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510679754343823] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510686556561281] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.6320s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2568s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3753s for 90112 events => throughput is 2.40E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.0947s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6149s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4799s for 90112 events => throughput is 1.88E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510679754343823) and cpp (0.21510679754343823) differ by less than 3E-14 (0.0) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686556561281) differ by less than 3E-14 (6.661338147750939e-16) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.431395e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.949373e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.437646e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.919896e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -276,22 +276,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110226551166922] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110539351263341] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.2970s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2796s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0173s for 8192 events => throughput is 4.73E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3846s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3601s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0244s for 8192 events => throughput is 3.35E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110226551166922) and cpp (0.27110226551166922) differ by less than 3E-14 (0.0) +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539351263341) differ by less than 3E-14 (4.440892098500626e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -309,44 +309,114 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510679754343823] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.4290s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2386s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1904s for 90112 events => throughput is 4.73E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8777s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6020s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2756s for 90112 events => throughput is 3.27E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510679754343823) and cpp (0.21510679754343823) differ by less than 3E-14 (0.0) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686556561295) differ by less than 3E-14 (0.0) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.773661e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.197689e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.791928e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.341628e+05 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2711 [0.27110539351263341] fbridge_mode=1 + [UNWEIGHT] Wrote 404 events (found 1228 events) + [COUNTERS] PROGRAM TOTAL : 0.3821s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3591s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0230s for 8192 events => throughput is 3.56E+05 events/s -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539351263341) differ by less than 3E-14 (4.440892098500626e-16) -*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=1 + [UNWEIGHT] Wrote 1939 events (found 1944 events) + [COUNTERS] PROGRAM TOTAL : 1.8414s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5939s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2475s for 90112 events => throughput is 3.64E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686556561295) differ by less than 3E-14 (0.0) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.543777e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.667139e+05 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -358,30 +428,176 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.hip_d_inl0_hrd0/madevent_hip < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' -Memory access fault by GPU node-4 (Agent handle: 0x63ef3d0) on address 0x14ecb03fa000. Reason: Page not present or supervisor privilege. - -Program received signal SIGABRT: Process abort signal. - -Backtrace for this error: -#0 0x14ef468cb90f in ??? -#1 0x14ef46708d2b in ??? -#2 0x14ef4670a3e4 in ??? -#3 0x14ef3dcd1b64 in ??? -#4 0x14ef3dcceb38 in ??? -#5 0x14ef3dc8c496 in ??? -#6 0x14ef468bf6e9 in ??? -#7 0x14ef467d649e in ??? -#8 0xffffffffffffffff in ??? -./madX.sh: line 377: 76073 Aborted (core dumped) $timecmd $cmd < ${tmpin} > ${tmp} -ERROR! ' ./build.hip_d_inl0_hrd0/madevent_hip < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' failed - PDF set = nn23lo1 - alpha_s(Mz)= 0.1300 running at 2 loops. - alpha_s(Mz)= 0.1300 running at 2 loops. - Renormalization scale set on event-by-event basis - Factorization scale set on event-by-event basis - - - getting user params -Enter number of events and max and min iterations: - Number of events and iterations 8192 1 1 +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2711 [0.27110539351263341] fbridge_mode=1 + [UNWEIGHT] Wrote 404 events (found 1228 events) + [COUNTERS] PROGRAM TOTAL : 0.4059s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3719s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0340s for 8192 events => throughput is 2.41E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! 
xsec from fortran (0.27110539351263330) and cpp (0.27110539351263341) differ by less than 3E-14 (4.440892098500626e-16) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=1 + [UNWEIGHT] Wrote 1939 events (found 1944 events) + [COUNTERS] PROGRAM TOTAL : 2.0051s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6219s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3832s for 90112 events => throughput is 2.35E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686556561295) differ by less than 3E-14 (0.0) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.349902e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.363260e+05 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2711 [0.27110539351263363] fbridge_mode=1 + [UNWEIGHT] Wrote 404 events (found 1228 events) + [COUNTERS] PROGRAM TOTAL : 0.7762s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7756s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.23E+07 events/s + +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! 
xsec from fortran (0.27110539351263330) and cuda (0.27110539351263363) differ by less than 3E-14 (1.3322676295501878e-15) + +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3-cuda) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2151 [0.21510686556561304] fbridge_mode=1 + [UNWEIGHT] Wrote 1939 events (found 1944 events) + [COUNTERS] PROGRAM TOTAL : 2.0116s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0036s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0080s for 90112 events => throughput is 1.13E+07 events/s + +*** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21510686556561295) and cuda (0.21510686556561304) differ by less than 3E-14 (4.440892098500626e-16) + +*** (3-cuda) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.582112e+07 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.121699e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.529721e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.531629e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.538677e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.807286e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.530045e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA 
[nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.783374e+07 ) sec^-1 + +*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** + +TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index fc013f36c8..4ac5ec3dc1 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -1,41 +1,41 @@ -Working directory (build): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu -make USEBUILDDIR=1 BACKEND=hip + +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' OMP_NUM_THREADS= -DATE: 2024-05-15_18:09:25 +DATE: 2024-05-16_13:45:22 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -49,18 +49,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110226551166922] fbridge_mode=0 + [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1817 events) - [COUNTERS] PROGRAM TOTAL : 0.3546s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3065s - [COUNTERS] Fortran MEs ( 1 ) : 0.0481s for 8192 events => throughput is 1.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4857s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4107s + [COUNTERS] Fortran MEs ( 1 ) : 0.0749s for 8192 events => throughput is 1.09E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -74,18 +74,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110226551166922] fbridge_mode=0 + [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3097s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2615s - [COUNTERS] Fortran MEs ( 1 ) : 0.0482s for 8192 events => throughput is 1.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4161s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3412s + [COUNTERS] Fortran MEs ( 1 ) : 0.0749s for 8192 events => throughput is 1.09E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -99,18 +99,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x10_fortran > /tmp/valassia/output_gqttq_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/avalassi/output_gqttq_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510679754343823] fbridge_mode=0 + [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=0 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.7391s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2129s - [COUNTERS] Fortran MEs ( 1 ) : 0.5263s for 90112 events => throughput is 1.71E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.4361s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6155s + [COUNTERS] Fortran MEs ( 1 ) : 0.8206s for 90112 events => throughput is 1.10E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -124,22 +124,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110149549279866] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110463093540638] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3747s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3180s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0568s for 8192 events => throughput is 1.44E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4907s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4145s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0762s for 8192 events => throughput is 1.07E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110226551166922) and cpp (0.27110149549279866) differ by less than 4E-4 (2.840326210895583e-06) +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110463093540638) differ by less than 4E-4 (2.812844174915341e-06) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -157,36 +157,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510678843355344] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510686273216112] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.9925s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3638s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6287s for 90112 events => throughput is 1.43E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.5059s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6664s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8394s for 90112 events => throughput is 1.07E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510679754343823) and cpp (0.21510678843355344) differ by less than 4E-4 (4.2350520312872675e-08) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686273216112) differ by less than 4E-4 (1.3172298474195543e-08) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.473999e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.092440e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.470734e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.094603e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -200,22 +200,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110146988852984] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110459152958460] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3018s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2817s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0201s for 8192 events => throughput is 4.07E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3934s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3663s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0270s for 8192 events => throughput is 3.03E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110226551166922) and cpp (0.27110146988852984) differ by less than 4E-4 (2.934771267448788e-06) +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110459152958460) differ by less than 4E-4 (2.9581965829139634e-06) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -233,36 +233,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510676993136629] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510683016166510] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.4615s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2398s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2218s for 90112 events => throughput is 4.06E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9232s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6228s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3004s for 90112 events => throughput is 3.00E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510679754343823) and cpp (0.21510676993136629) differ by less than 4E-4 (1.2836447871311663e-07) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510683016166510) differ by less than 4E-4 (1.6458771667782202e-07) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.153768e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.039638e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.171796e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.034015e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -276,22 +276,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110148793566186] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110460595003461] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.2818s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2721s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0097s for 8192 events => throughput is 8.46E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3713s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3577s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0136s for 8192 events => throughput is 6.04E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110226551166922) and cpp (0.27110148793566186) differ by less than 4E-4 (2.8682018052839098e-06) +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110460595003461) differ by less than 4E-4 (2.9050052766654844e-06) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -309,44 +309,114 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510676419088856] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510682502089912] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.3330s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2269s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1061s for 90112 events => throughput is 8.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7502s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6011s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1490s for 90112 events => throughput is 6.05E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510679754343823) and cpp (0.21510676419088856) differ by less than 4E-4 (1.5505111905511626e-07) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510682502089912) differ by less than 4E-4 (1.8848637739488083e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.684251e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.923571e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.747759e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.853443e+05 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2711 [0.27110460595003461] fbridge_mode=1 + [UNWEIGHT] Wrote 404 events (found 1228 events) + [COUNTERS] PROGRAM TOTAL : 0.3638s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3514s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0124s for 8192 events => throughput is 6.61E+05 events/s -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110460595003461) differ by less than 4E-4 (2.9050052766654844e-06) -*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2151 [0.21510682502089912] fbridge_mode=1 + [UNWEIGHT] Wrote 1939 events (found 1944 events) + [COUNTERS] PROGRAM TOTAL : 1.7444s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6060s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1385s for 90112 events => throughput is 6.51E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510682502089912) differ by less than 4E-4 (1.8848637739488083e-07) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.334346e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.476144e+05 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -358,30 +428,178 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.hip_f_inl0_hrd0/madevent_hip < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' -Memory access fault by GPU node-4 (Agent handle: 0x63ef3f0) on address 0x1484e4b04000. Reason: Unknown. - -Program received signal SIGABRT: Process abort signal. - -Backtrace for this error: -#0 0x14887af9890f in ??? -#1 0x14887add5d2b in ??? -#2 0x14887add73e4 in ??? -#3 0x14887239eb64 in ??? -#4 0x14887239bb38 in ??? -#5 0x148872359496 in ??? -#6 0x14887af8c6e9 in ??? -#7 0x14887aea349e in ??? -#8 0xffffffffffffffff in ??? -./madX.sh: line 377: 77263 Aborted $timecmd $cmd < ${tmpin} > ${tmp} -ERROR! ' ./build.hip_f_inl0_hrd0/madevent_hip < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' failed - PDF set = nn23lo1 - alpha_s(Mz)= 0.1300 running at 2 loops. - alpha_s(Mz)= 0.1300 running at 2 loops. - Renormalization scale set on event-by-event basis - Factorization scale set on event-by-event basis - - - getting user params -Enter number of events and max and min iterations: - Number of events and iterations 8192 1 1 +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' +INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2711 [0.27110464176080312] fbridge_mode=1 + [UNWEIGHT] Wrote 404 events (found 1228 events) + [COUNTERS] PROGRAM TOTAL : 0.3762s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3583s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0178s for 8192 events => throughput is 4.59E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! 
xsec from fortran (0.27110539351263330) and cpp (0.27110464176080312) differ by less than 4E-4 (2.772913590631809e-06) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' +INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2151 [0.21510685411522326] fbridge_mode=1 + [UNWEIGHT] Wrote 1939 events (found 1944 events) + [COUNTERS] PROGRAM TOTAL : 1.8096s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6129s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1967s for 90112 events => throughput is 4.58E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! 
xsec from fortran (0.21510686556561295) and cpp (0.21510685411522326) differ by less than 4E-4 (5.3231167917999755e-08) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.750606e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.766894e+05 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2711 [0.27110478167944563] fbridge_mode=1 + [UNWEIGHT] Wrote 404 events (found 1228 events) + [COUNTERS] PROGRAM TOTAL : 0.7793s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7788s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.48E+07 events/s + +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.27110539351263330) and cuda (0.27110478167944563) differ by less than 4E-4 (2.2568093527297606e-06) + +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3-cuda) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2151 [0.21510689885789414] fbridge_mode=1 + [UNWEIGHT] Wrote 1939 events (found 1944 events) + [COUNTERS] PROGRAM TOTAL : 2.0413s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0348s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0065s for 90112 events => throughput is 1.38E+07 events/s + +*** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21510686556561295) and cuda (0.21510689885789414) differ by less than 4E-4 (1.547708907700951e-07) + +*** (3-cuda) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.566939e+07 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.326602e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.593547e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.720103e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.619232e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.806222e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.144615e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA 
[nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.016256e+07 ) sec^-1 + +*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** + +TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index dd4212fa37..23f8d1233a 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -1,41 +1,41 @@ -Working directory (build): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu + +make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=hip make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' - make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' OMP_NUM_THREADS= -DATE: 2024-05-15_18:09:37 +DATE: 2024-05-16_01:43:25 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -49,18 +49,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110226551166922] fbridge_mode=0 + [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1817 events) - [COUNTERS] PROGRAM TOTAL : 0.3544s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3063s - [COUNTERS] Fortran MEs ( 1 ) : 0.0481s for 8192 events => throughput is 1.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4870s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4120s + [COUNTERS] Fortran MEs ( 1 ) : 0.0750s for 8192 events => throughput is 1.09E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -74,18 +74,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110226551166922] fbridge_mode=0 + [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3092s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2610s - [COUNTERS] Fortran MEs ( 1 ) : 0.0482s for 8192 events => throughput is 1.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4171s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3424s + [COUNTERS] Fortran MEs ( 1 ) : 0.0747s for 8192 events => throughput is 1.10E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -99,18 +99,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x10_fortran > /tmp/valassia/output_gqttq_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/avalassi/output_gqttq_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510679754343823] fbridge_mode=0 + [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=0 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.7403s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2134s - [COUNTERS] Fortran MEs ( 1 ) : 0.5269s for 90112 events => throughput is 1.71E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.4382s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6232s + [COUNTERS] Fortran MEs ( 1 ) : 0.8150s for 90112 events => throughput is 1.11E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -124,22 +124,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110226549005623] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110539348916002] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.4014s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3320s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0694s for 8192 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.5038s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4232s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0806s for 8192 events => throughput is 1.02E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110226551166922) and cpp (0.27110226549005623) differ by less than 2E-4 (7.972267290767832e-11) +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539348916002) differ by less than 2E-4 (8.658362915525686e-11) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -157,36 +157,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510679758658835] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510686560794337] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.0549s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2899s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7649s for 90112 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.5722s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6823s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8899s for 90112 events => throughput is 1.01E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510679754343823) and cpp (0.21510679758658835) differ by less than 2E-4 (2.0059864880295208e-10) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686560794337) differ by less than 2E-4 (1.967879192932287e-10) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.202899e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.030982e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.204038e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.031715e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -200,22 +200,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110226549005628] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110539348916002] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3295s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2963s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0332s for 8192 events => throughput is 2.47E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4234s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3811s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0422s for 8192 events => throughput is 1.94E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110226551166922) and cpp (0.27110226549005628) differ by less than 2E-4 (7.972245086307339e-11) +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539348916002) differ by less than 2E-4 (8.658362915525686e-11) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -233,36 +233,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510679758658832] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510686560794334] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.6204s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2552s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3652s for 90112 events => throughput is 2.47E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.1135s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6453s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4682s for 90112 events => throughput is 1.92E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510679754343823) and cpp (0.21510679758658832) differ by less than 2E-4 (2.0059842675834716e-10) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686560794334) differ by less than 2E-4 (1.9678769724862377e-10) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.464056e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.925046e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.471288e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.926756e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -276,22 +276,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110226530029391] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110539330272815] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.2991s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2816s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0175s for 8192 events => throughput is 4.69E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3913s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3660s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0253s for 8192 events => throughput is 3.24E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110226551166922) and cpp (0.27110226530029391) differ by less than 2E-4 (7.796884249344771e-10) +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539330272815) differ by less than 2E-4 (7.742566587864985e-10) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -309,44 +309,114 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510679756340242] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510686558551748] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.4284s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2366s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1918s for 90112 events => throughput is 4.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8962s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6193s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2769s for 90112 events => throughput is 3.25E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510679754343823) and cpp (0.21510679756340242) differ by less than 2E-4 (9.281064805577444e-11) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686558551748) differ by less than 2E-4 (9.253309229961815e-11) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.783049e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.266154e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.797269e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.269502e+05 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2711 [0.27110539330272815] fbridge_mode=1 + [UNWEIGHT] Wrote 404 events (found 1228 events) + [COUNTERS] PROGRAM TOTAL : 0.3831s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3610s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0221s for 8192 events => throughput is 3.70E+05 events/s -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539330272815) differ by less than 2E-4 (7.742566587864985e-10) -*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2151 [0.21510686558551748] fbridge_mode=1 + [UNWEIGHT] Wrote 1939 events (found 1944 events) + [COUNTERS] PROGRAM TOTAL : 1.8418s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5992s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2426s for 90112 events => throughput is 3.71E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686558551748) differ by less than 2E-4 (9.253309229961815e-11) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.789065e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.854777e+05 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -358,30 +428,177 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.hip_m_inl0_hrd0/madevent_hip < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' -Memory access fault by GPU node-4 (Agent handle: 0x63ef3d0) on address 0x1460b1baa000. Reason: Page not present or supervisor privilege. - -Program received signal SIGABRT: Process abort signal. - -Backtrace for this error: -#0 0x14634807990f in ??? -#1 0x146347eb6d2b in ??? -#2 0x146347eb83e4 in ??? -#3 0x14633f47fb64 in ??? -#4 0x14633f47cb38 in ??? -#5 0x14633f43a496 in ??? -#6 0x14634806d6e9 in ??? -#7 0x146347f8449e in ??? -#8 0xffffffffffffffff in ??? -./madX.sh: line 377: 78444 Aborted $timecmd $cmd < ${tmpin} > ${tmp} -ERROR! ' ./build.hip_m_inl0_hrd0/madevent_hip < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' failed - PDF set = nn23lo1 - alpha_s(Mz)= 0.1300 running at 2 loops. - alpha_s(Mz)= 0.1300 running at 2 loops. - Renormalization scale set on event-by-event basis - Factorization scale set on event-by-event basis - - - getting user params -Enter number of events and max and min iterations: - Number of events and iterations 8192 1 1 +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' +INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2711 [0.27110539330272815] fbridge_mode=1 + [UNWEIGHT] Wrote 404 events (found 1228 events) + [COUNTERS] PROGRAM TOTAL : 0.4096s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3737s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0358s for 8192 events => throughput is 2.29E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! 
xsec from fortran (0.27110539351263330) and cpp (0.27110539330272815) differ by less than 2E-4 (7.742566587864985e-10) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2151 [0.21510686558551748] fbridge_mode=1 + [UNWEIGHT] Wrote 1939 events (found 1944 events) + [COUNTERS] PROGRAM TOTAL : 2.0275s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6170s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4105s for 90112 events => throughput is 2.20E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686558551748) differ by less than 2E-4 (9.253309229961815e-11) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.335204e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.335548e+05 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2711 [0.27110539343558537] fbridge_mode=1 + [UNWEIGHT] Wrote 404 events (found 1228 events) + [COUNTERS] PROGRAM TOTAL : 0.7731s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7724s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.22E+07 events/s + +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! 
xsec from fortran (0.27110539351263330) and cuda (0.27110539343558537) differ by less than 2E-4 (2.8419910869104115e-10) + +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3-cuda) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2151 [0.21510686553631395] fbridge_mode=1 + [UNWEIGHT] Wrote 1939 events (found 1944 events) + [COUNTERS] PROGRAM TOTAL : 2.0091s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0010s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0081s for 90112 events => throughput is 1.12E+07 events/s + +*** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21510686556561295) and cuda (0.21510686553631395) differ by less than 2E-4 (1.3620671257541517e-10) + +*** (3-cuda) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.632055e+07 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.046951e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.534300e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.533151e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.529948e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.832656e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.532773e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA 
[nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.792603e+07 ) sec^-1 + +*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** + +TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt index e5a1b85b65..7b51bb9221 100644 --- a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt @@ -1,41 +1,41 @@ -Working directory (build): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx -make USEBUILDDIR=1 BACKEND=hip +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' + make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' - make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' OMP_NUM_THREADS= -DATE: 2024-05-15_20:25:01 +DATE: 2024-05-16_05:58:47 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -49,18 +49,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_heftggbb_x1_fortran > /tmp/valassia/output_heftggbb_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/avalassi/output_heftggbb_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0162955499256290] fbridge_mode=0 + [XSECTION] Cross section = 2.016 [2.0162955499256161] fbridge_mode=0 [UNWEIGHT] Wrote 3321 events (found 6423 events) - [COUNTERS] PROGRAM TOTAL : 1.1089s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0764s - [COUNTERS] Fortran MEs ( 1 ) : 0.0325s for 8192 events => throughput is 2.52E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.9450s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8961s + [COUNTERS] Fortran MEs ( 1 ) : 0.0489s for 8192 events => throughput is 1.68E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -74,18 +74,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_heftggbb_x1_fortran > /tmp/valassia/output_heftggbb_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/avalassi/output_heftggbb_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0162955499256290] fbridge_mode=0 + [XSECTION] Cross section = 2.016 [2.0162955499256161] fbridge_mode=0 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.6289s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5964s - [COUNTERS] Fortran MEs ( 1 ) : 0.0325s for 8192 events => throughput is 2.52E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4283s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3794s + [COUNTERS] Fortran MEs ( 1 ) : 0.0489s for 8192 events => throughput is 1.67E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -99,18 +99,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_heftggbb_x10_fortran > /tmp/valassia/output_heftggbb_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x10_fortran > /tmp/avalassi/output_heftggbb_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.043 [2.0434895240377502] fbridge_mode=0 + [XSECTION] Cross section = 2.043 [2.0434895240377569] fbridge_mode=0 [UNWEIGHT] Wrote 1818 events (found 1823 events) - [COUNTERS] PROGRAM TOTAL : 4.7741s - [COUNTERS] Fortran Overhead ( 0 ) : 4.4191s - [COUNTERS] Fortran MEs ( 1 ) : 0.3550s for 90112 events => throughput is 2.54E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8593s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3225s + [COUNTERS] Fortran MEs ( 1 ) : 0.5368s for 90112 events => throughput is 1.68E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -124,22 +124,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_heftggbb_x1_cudacpp > /tmp/valassia/output_heftggbb_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0162955499256197] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0162955499256148] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.6739s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6384s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0355s for 8192 events => throughput is 2.31E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4564s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4132s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0431s for 8192 events => throughput is 1.90E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0162955499256290) and cpp (2.0162955499256197) differ by less than 3E-14 (4.6629367034256575e-15) +OK! xsec from fortran (2.0162955499256161) and cpp (2.0162955499256148) differ by less than 3E-14 (6.661338147750939e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -157,36 +157,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_heftggbb_x10_cudacpp > /tmp/valassia/output_heftggbb_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x10_cudacpp > /tmp/avalassi/output_heftggbb_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.043 [2.0434895240377569] fbridge_mode=1 + [XSECTION] Cross section = 2.043 [2.0434895240377564] fbridge_mode=1 [UNWEIGHT] Wrote 1818 events (found 1823 events) - [COUNTERS] PROGRAM TOTAL : 4.8268s - [COUNTERS] Fortran Overhead ( 0 ) : 4.4394s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3874s for 90112 events => throughput is 2.33E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8316s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3535s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4781s for 90112 events => throughput is 1.88E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0434895240377502) and cpp (2.0434895240377569) differ by less than 3E-14 (3.3306690738754696e-15) +OK! xsec from fortran (2.0434895240377569) and cpp (2.0434895240377564) differ by less than 3E-14 (2.220446049250313e-16) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.414149e+05 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.935091e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.396996e+05 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.955688e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -200,22 +200,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_heftggbb_x1_cudacpp > /tmp/valassia/output_heftggbb_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0162955499256205] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0162955499256161] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.6353s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6159s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0195s for 8192 events => throughput is 4.21E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4208s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3966s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0242s for 8192 events => throughput is 3.38E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0162955499256290) and cpp (2.0162955499256205) differ by less than 3E-14 (4.218847493575595e-15) +OK! xsec from fortran (2.0162955499256161) and cpp (2.0162955499256161) differ by less than 3E-14 (0.0) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -233,36 +233,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_heftggbb_x10_cudacpp > /tmp/valassia/output_heftggbb_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x10_cudacpp > /tmp/avalassi/output_heftggbb_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.043 [2.0434895240377569] fbridge_mode=1 + [XSECTION] Cross section = 2.043 [2.0434895240377564] fbridge_mode=1 [UNWEIGHT] Wrote 1818 events (found 1823 events) - [COUNTERS] PROGRAM TOTAL : 4.6389s - [COUNTERS] Fortran Overhead ( 0 ) : 4.4246s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2143s for 90112 events => throughput is 4.20E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6048s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3350s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2698s for 90112 events => throughput is 3.34E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0434895240377502) and cpp (2.0434895240377569) differ by less than 3E-14 (3.3306690738754696e-15) +OK! xsec from fortran (2.0434895240377569) and cpp (2.0434895240377564) differ by less than 3E-14 (2.220446049250313e-16) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.270507e+05 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.396840e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.282088e+05 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.435183e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -276,22 +276,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_heftggbb_x1_cudacpp > /tmp/valassia/output_heftggbb_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0162955499256205] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0162955499256152] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.6180s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6072s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0108s for 8192 events => throughput is 7.59E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4008s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3863s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0145s for 8192 events => throughput is 5.65E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0162955499256290) and cpp (2.0162955499256205) differ by less than 3E-14 (4.218847493575595e-15) +OK! xsec from fortran (2.0162955499256161) and cpp (2.0162955499256152) differ by less than 3E-14 (4.440892098500626e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -309,44 +309,114 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_heftggbb_x10_cudacpp > /tmp/valassia/output_heftggbb_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x10_cudacpp > /tmp/avalassi/output_heftggbb_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.043 [2.0434895240377564] fbridge_mode=1 + [XSECTION] Cross section = 2.043 [2.0434895240377560] fbridge_mode=1 [UNWEIGHT] Wrote 1818 events (found 1823 events) - [COUNTERS] PROGRAM TOTAL : 4.5526s - [COUNTERS] Fortran Overhead ( 0 ) : 4.4345s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1181s for 90112 events => throughput is 7.63E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5134s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3430s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1705s for 90112 events => throughput is 5.29E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0434895240377502) and cpp (2.0434895240377564) differ by less than 3E-14 (3.1086244689504383e-15) +OK! xsec from fortran (2.0434895240377569) and cpp (2.0434895240377560) differ by less than 3E-14 (4.440892098500626e-16) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.920662e+05 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.293625e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.950346e+05 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.214602e+05 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.016 [2.0162955499256152] fbridge_mode=1 + [UNWEIGHT] Wrote 1617 events (found 1622 events) + [COUNTERS] PROGRAM TOTAL : 0.3999s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3865s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0134s for 8192 events => throughput is 6.10E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.0162955499256161) and cpp (2.0162955499256152) differ by less than 3E-14 (4.440892098500626e-16) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x10_cudacpp > /tmp/avalassi/output_heftggbb_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.043 [2.0434895240377560] fbridge_mode=1 + [UNWEIGHT] Wrote 1818 events (found 1823 events) + [COUNTERS] PROGRAM TOTAL : 1.4652s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3183s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1469s for 90112 events => throughput is 6.13E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (2.0434895240377569) and cpp (2.0434895240377560) differ by less than 3E-14 (4.440892098500626e-16) -*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.964595e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.873725e+05 ) sec^-1 -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -358,28 +428,28 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! 
Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_d_inl0_hrd0/madevent_hip < /tmp/valassia/input_heftggbb_x1_cudacpp > /tmp/valassia/output_heftggbb_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0162955499256223] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0162955499256152] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.9050s - [COUNTERS] Fortran Overhead ( 0 ) : 0.9040s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0010s for 8192 events => throughput is 8.28E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4159s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3941s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0218s for 8192 events => throughput is 3.76E+05 events/s -*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0162955499256290) and hip (2.0162955499256223) differ by less than 3E-14 (3.3306690738754696e-15) +OK! xsec from fortran (2.0162955499256161) and cpp (2.0162955499256152) differ by less than 3E-14 (4.440892098500626e-16) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (3-hip) EXECUTE MADEVENT_HIP x10 (create events.lhe) *** +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -391,65 +461,143 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_d_inl0_hrd0/madevent_hip < /tmp/valassia/input_heftggbb_x10_cudacpp > /tmp/valassia/output_heftggbb_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x10_cudacpp > /tmp/avalassi/output_heftggbb_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.043 [2.0434895240377577] fbridge_mode=1 + [XSECTION] Cross section = 2.043 [2.0434895240377560] fbridge_mode=1 [UNWEIGHT] Wrote 1818 events (found 1823 events) - [COUNTERS] PROGRAM TOTAL : 4.6984s - [COUNTERS] Fortran Overhead ( 0 ) : 4.6873s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0111s for 90112 events => throughput is 8.14E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.5767s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3320s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2446s for 90112 events => throughput is 3.68E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.0434895240377569) and cpp (2.0434895240377560) differ by less than 3E-14 (4.440892098500626e-16) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (3-hip) Compare MADEVENT_HIP x10 xsec to MADEVENT_FORTRAN xsec *** +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.692127e+05 ) sec^-1 -OK! xsec from fortran (2.0434895240377502) and hip (2.0434895240377577) differ by less than 3E-14 (3.774758283725532e-15) +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.804832e+05 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.016 [2.0162955499256165] fbridge_mode=1 + [UNWEIGHT] Wrote 1617 events (found 1622 events) + [COUNTERS] PROGRAM TOTAL : 0.8067s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8061s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.36E+07 events/s -*** (3-hip) Compare MADEVENT_HIP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! events.lhe.hip.10 and events.lhe.ref.10 are identical +OK! xsec from fortran (2.0162955499256161) and cuda (2.0162955499256165) differ by less than 3E-14 (2.220446049250313e-16) + +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3-cuda) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_heftggbb_x10_cudacpp > /tmp/avalassi/output_heftggbb_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.043 [2.0434895240377573] fbridge_mode=1 + [UNWEIGHT] Wrote 1818 events (found 1823 events) + [COUNTERS] PROGRAM TOTAL : 1.7545s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7475s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0070s for 90112 events => throughput is 1.29E+07 events/s + +*** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.0434895240377569) and cuda (2.0434895240377573) differ by less than 3E-14 (2.220446049250313e-16) + +*** (3-cuda) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.264685e+06 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.804334e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_HEFT_GG_BBX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.969946e+06 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.230995e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.540722e+07 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.136468e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_HEFT_GG_BBX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.487275e+07 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.810257e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_HIP [clang 
14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.521984e+07 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.115441e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_HEFT_GG_BBX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.481497e+07 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.048562e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.517096e+07 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.115501e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_HEFT_GG_BBX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.024626e+07 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.748994e+07 ) sec^-1 + +*** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt index 5e349cad30..d09b81d7d3 100644 --- a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt @@ -1,41 +1,41 @@ -Working directory (build): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx -make USEBUILDDIR=1 BACKEND=hip +make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' OMP_NUM_THREADS= -DATE: 2024-05-15_20:25:39 +DATE: 2024-05-16_05:59:14 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -49,18 +49,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_heftggbb_x1_fortran > /tmp/valassia/output_heftggbb_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/avalassi/output_heftggbb_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0162955499256290] fbridge_mode=0 + [XSECTION] Cross section = 2.016 [2.0162955499256161] fbridge_mode=0 [UNWEIGHT] Wrote 3321 events (found 6423 events) - [COUNTERS] PROGRAM TOTAL : 0.9801s - [COUNTERS] Fortran Overhead ( 0 ) : 0.9477s - [COUNTERS] Fortran MEs ( 1 ) : 0.0324s for 8192 events => throughput is 2.53E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.9433s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8939s + [COUNTERS] Fortran MEs ( 1 ) : 0.0495s for 8192 events => throughput is 1.66E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -74,18 +74,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_heftggbb_x1_fortran > /tmp/valassia/output_heftggbb_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/avalassi/output_heftggbb_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0162955499256290] fbridge_mode=0 + [XSECTION] Cross section = 2.016 [2.0162955499256161] fbridge_mode=0 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.6309s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5985s - [COUNTERS] Fortran MEs ( 1 ) : 0.0324s for 8192 events => throughput is 2.53E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4256s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3772s + [COUNTERS] Fortran MEs ( 1 ) : 0.0484s for 8192 events => throughput is 1.69E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -99,18 +99,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_heftggbb_x10_fortran > /tmp/valassia/output_heftggbb_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x10_fortran > /tmp/avalassi/output_heftggbb_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.043 [2.0434895240377502] fbridge_mode=0 + [XSECTION] Cross section = 2.043 [2.0434895240377569] fbridge_mode=0 [UNWEIGHT] Wrote 1818 events (found 1823 events) - [COUNTERS] PROGRAM TOTAL : 4.7604s - [COUNTERS] Fortran Overhead ( 0 ) : 4.4051s - [COUNTERS] Fortran MEs ( 1 ) : 0.3553s for 90112 events => throughput is 2.54E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8568s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3199s + [COUNTERS] Fortran MEs ( 1 ) : 0.5369s for 90112 events => throughput is 1.68E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -124,43 +124,43 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_heftggbb_x1_cudacpp > /tmp/valassia/output_heftggbb_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0162897089316618] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0162897371946169] fbridge_mode=1 [UNWEIGHT] Wrote 1620 events (found 1625 events) - [COUNTERS] PROGRAM TOTAL : 0.6585s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6278s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0307s for 8192 events => throughput is 2.67E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4525s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4113s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0413s for 8192 events => throughput is 1.99E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0162955499256290) and cpp (2.0162897089316618) differ by less than 4E-4 (2.8968937452189536e-06) +OK! xsec from fortran (2.0162955499256161) and cpp (2.0162897371946169) differ by less than 4E-4 (2.8828764708777044e-06) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** ERROR! events.lhe.cpp.1 and events.lhe.ref.1 differ! 
-diff /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/events.lhe.cpp.1 /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/events.lhe.ref.1 | head -20 +diff /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/events.lhe.cpp.1 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/events.lhe.ref.1 | head -20 6206,6207c6206,6207 -< 21 -1 0 0 502 503 -0.00000000000E+00 -0.00000000000E+00 -0.59936081269E+01 0.59936081269E+01 0.00000000000E+00 0. -1. -< 5 1 1 2 501 0 0.45273385612E+02 -0.31131305296E+02 0.47763304668E+03 0.48080583909E+03 0.47000000000E+01 0. 1. +< 21 -1 0 0 502 503 -0.00000000000E+00 -0.00000000000E+00 -0.59936081260E+01 0.59936081260E+01 0.00000000000E+00 0. -1. +< 5 1 1 2 501 0 0.45273385612E+02 -0.31131305296E+02 0.47763304676E+03 0.48080583916E+03 0.47000000000E+01 0. 1. --- -> 21 -1 0 0 502 503 -0.00000000000E+00 -0.00000000000E+00 -0.59936081269E+01 0.59936081269E+01 0.00000000000E+00 0. 1. -> 5 1 1 2 501 0 0.45273385612E+02 -0.31131305296E+02 0.47763304668E+03 0.48080583909E+03 0.47000000000E+01 0. -1. +> 21 -1 0 0 502 503 -0.00000000000E+00 -0.00000000000E+00 -0.59936081260E+01 0.59936081260E+01 0.00000000000E+00 0. 1. +> 5 1 1 2 501 0 0.45273385612E+02 -0.31131305296E+02 0.47763304676E+03 0.48080583916E+03 0.47000000000E+01 0. -1. 8306,8307c8306,8307 -< 21 -1 0 0 502 503 -0.00000000000E+00 -0.00000000000E+00 -0.23857997230E+02 0.23857997230E+02 0.00000000000E+00 0. 1. -< 5 1 1 2 501 0 -0.34843521722E+02 0.35239303629E+02 0.13219496701E+02 0.51504607748E+02 0.47000000000E+01 0. -1. +< 21 -1 0 0 502 503 -0.00000000000E+00 -0.00000000000E+00 -0.23857997239E+02 0.23857997239E+02 0.00000000000E+00 0. 1. +< 5 1 1 2 501 0 -0.34843521722E+02 0.35239303629E+02 0.13219496682E+02 0.51504607743E+02 0.47000000000E+01 0. -1. 
--- -> 21 -1 0 0 502 503 -0.00000000000E+00 -0.00000000000E+00 -0.23857997230E+02 0.23857997230E+02 0.00000000000E+00 0. -1. -> 5 1 1 2 501 0 -0.34843521722E+02 0.35239303629E+02 0.13219496701E+02 0.51504607748E+02 0.47000000000E+01 0. 1. +> 21 -1 0 0 502 503 -0.00000000000E+00 -0.00000000000E+00 -0.23857997239E+02 0.23857997239E+02 0.00000000000E+00 0. -1. +> 5 1 1 2 501 0 -0.34843521722E+02 0.35239303629E+02 0.13219496682E+02 0.51504607743E+02 0.47000000000E+01 0. 1. 9606,9619d9605 < 4 1 1E-03 0.1250139E+03 0.7546771E-02 0.1235066E+00 -< 21 -1 0 0 503 502 0.00000000000E+00 0.00000000000E+00 0.94948249861E+03 0.94948249861E+03 0.00000000000E+00 0. 1. -< 21 -1 0 0 502 503 -0.00000000000E+00 -0.00000000000E+00 -0.41149990064E+01 0.41149990064E+01 0.00000000000E+00 0. -1. -< 5 1 1 2 501 0 -0.96459450317E+01 -0.34409175043E+02 0.83136584828E+02 0.90613560351E+02 0.47000000000E+01 0. -1. -< -5 1 1 2 0 501 0.96459450317E+01 0.34409175043E+02 0.86223091477E+03 0.86298393726E+03 0.47000000000E+01 0. 1. +< 21 -1 0 0 503 502 0.00000000000E+00 0.00000000000E+00 0.94948250004E+03 0.94948250004E+03 0.00000000000E+00 0. 1. +< 21 -1 0 0 502 503 -0.00000000000E+00 -0.00000000000E+00 -0.41149990002E+01 0.41149990002E+01 0.00000000000E+00 0. -1. +< 5 1 1 2 501 0 -0.96459450317E+01 -0.34409175043E+02 0.83136584965E+02 0.90613560477E+02 0.47000000000E+01 0. -1. +< -5 1 1 2 0 501 0.96459450317E+01 0.34409175043E+02 0.86223091608E+03 0.86298393857E+03 0.47000000000E+01 0. 1. 
< < 0 0.12501391E+03 diff --git a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt index cb821d2765..291c38991b 100644 --- a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt @@ -1,41 +1,41 @@ -Working directory (build): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx -make USEBUILDDIR=1 BACKEND=hip + +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory 
'/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' OMP_NUM_THREADS= -DATE: 2024-05-15_20:25:48 +DATE: 2024-05-16_05:59:20 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -49,18 +49,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_heftggbb_x1_fortran > /tmp/valassia/output_heftggbb_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/avalassi/output_heftggbb_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0162955499256290] fbridge_mode=0 + [XSECTION] Cross section = 2.016 [2.0162955499256161] fbridge_mode=0 [UNWEIGHT] Wrote 3321 events (found 6423 events) - [COUNTERS] PROGRAM TOTAL : 0.9793s - [COUNTERS] Fortran Overhead ( 0 ) : 0.9469s - [COUNTERS] Fortran MEs ( 1 ) : 0.0324s for 8192 events => throughput is 2.53E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.9570s + [COUNTERS] Fortran Overhead ( 0 ) : 0.9073s + [COUNTERS] Fortran MEs ( 1 ) : 0.0498s for 8192 events => throughput is 1.65E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -74,18 +74,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_heftggbb_x1_fortran > /tmp/valassia/output_heftggbb_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/avalassi/output_heftggbb_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0162955499256290] fbridge_mode=0 + [XSECTION] Cross section = 2.016 [2.0162955499256161] fbridge_mode=0 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.6302s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5978s - [COUNTERS] Fortran MEs ( 1 ) : 0.0324s for 8192 events => throughput is 2.53E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4201s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3715s + [COUNTERS] Fortran MEs ( 1 ) : 0.0486s for 8192 events => throughput is 1.68E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -99,18 +99,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_heftggbb_x10_fortran > /tmp/valassia/output_heftggbb_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x10_fortran > /tmp/avalassi/output_heftggbb_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.043 [2.0434895240377502] fbridge_mode=0 + [XSECTION] Cross section = 2.043 [2.0434895240377569] fbridge_mode=0 [UNWEIGHT] Wrote 1818 events (found 1823 events) - [COUNTERS] PROGRAM TOTAL : 4.7775s - [COUNTERS] Fortran Overhead ( 0 ) : 4.4225s - [COUNTERS] Fortran MEs ( 1 ) : 0.3550s for 90112 events => throughput is 2.54E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8553s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3195s + [COUNTERS] Fortran MEs ( 1 ) : 0.5358s for 90112 events => throughput is 1.68E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -124,23 +124,23 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_heftggbb_x1_cudacpp > /tmp/valassia/output_heftggbb_x1_cudacpp' +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0162955975931003] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0162955975930954] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.6670s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6324s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0346s for 8192 events => throughput is 2.37E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4604s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4161s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0443s for 8192 events => throughput is 1.85E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0162955499256290) and cpp (2.0162955975931003) differ by less than 2E-4 (2.364111328923002e-08) +OK! xsec from fortran (2.0162955499256161) and cpp (2.0162955975930954) differ by less than 2E-4 (2.3641117063988304e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,23 +158,23 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_heftggbb_x10_cudacpp > /tmp/valassia/output_heftggbb_x10_cudacpp' +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x10_cudacpp > /tmp/avalassi/output_heftggbb_x10_cudacpp' INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.043 [2.0434895706383704] fbridge_mode=1 + [XSECTION] Cross section = 2.043 [2.0434895706383660] fbridge_mode=1 [UNWEIGHT] Wrote 1818 events (found 1823 events) - [COUNTERS] PROGRAM TOTAL : 4.8220s - [COUNTERS] Fortran Overhead ( 0 ) : 4.4411s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3808s for 90112 events => throughput is 2.37E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8327s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3534s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4793s for 90112 events => throughput is 1.88E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0434895240377502) and cpp (2.0434895706383704) differ by less than 2E-4 (2.2804433230660948e-08) +OK! xsec from fortran (2.0434895240377569) and cpp (2.0434895706383660) differ by less than 2E-4 (2.2804427679545825e-08) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -182,15 +182,15 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.395962e+05 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.817766e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.379165e+05 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.799752e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -204,23 +204,23 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_heftggbb_x1_cudacpp > /tmp/valassia/output_heftggbb_x1_cudacpp' +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0162955975931007] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0162955975930958] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.6312s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6126s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0187s for 8192 events => throughput is 4.38E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4227s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3979s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0248s for 8192 events => throughput is 3.30E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0162955499256290) and cpp (2.0162955975931007) differ by less than 2E-4 (2.3641113511274625e-08) +OK! xsec from fortran (2.0162955499256161) and cpp (2.0162955975930958) differ by less than 2E-4 (2.364111728603291e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -238,23 +238,23 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_heftggbb_x10_cudacpp > /tmp/valassia/output_heftggbb_x10_cudacpp' +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x10_cudacpp > /tmp/avalassi/output_heftggbb_x10_cudacpp' INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.043 [2.0434895706383709] fbridge_mode=1 + [XSECTION] Cross section = 2.043 [2.0434895706383669] fbridge_mode=1 [UNWEIGHT] Wrote 1818 events (found 1823 events) - [COUNTERS] PROGRAM TOTAL : 4.6210s - [COUNTERS] Fortran Overhead ( 0 ) : 4.4150s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2059s for 90112 events => throughput is 4.38E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6020s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3321s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2699s for 90112 events => throughput is 3.34E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0434895240377502) and cpp (2.0434895706383709) differ by less than 2E-4 (2.2804433452705553e-08) +OK! xsec from fortran (2.0434895240377569) and cpp (2.0434895706383669) differ by less than 2E-4 (2.2804428123635034e-08) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -262,15 +262,15 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.433005e+05 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.208317e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.453282e+05 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.242147e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -284,23 +284,23 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_heftggbb_x1_cudacpp > /tmp/valassia/output_heftggbb_x1_cudacpp' +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0162955953691122] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0162955953691082] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.6190s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6084s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0107s for 8192 events => throughput is 7.67E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4018s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3865s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0153s for 8192 events => throughput is 5.35E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0162955499256290) and cpp (2.0162955953691122) differ by less than 2E-4 (2.2538106270175717e-08) +OK! xsec from fortran (2.0162955499256161) and cpp (2.0162955953691082) differ by less than 2E-4 (2.253811048902321e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -318,23 +318,23 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_heftggbb_x10_cudacpp > /tmp/valassia/output_heftggbb_x10_cudacpp' +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x10_cudacpp > /tmp/avalassi/output_heftggbb_x10_cudacpp' INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.043 [2.0434895701243891] fbridge_mode=1 + [XSECTION] Cross section = 2.043 [2.0434895701243878] fbridge_mode=1 [UNWEIGHT] Wrote 1818 events (found 1823 events) - [COUNTERS] PROGRAM TOTAL : 4.5386s - [COUNTERS] Fortran Overhead ( 0 ) : 4.4207s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1179s for 90112 events => throughput is 7.65E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4998s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3319s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1679s for 90112 events => throughput is 5.37E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0434895240377502) and cpp (2.0434895701243891) differ by less than 2E-4 (2.255291176034291e-08) +OK! xsec from fortran (2.0434895240377569) and cpp (2.0434895701243878) differ by less than 2E-4 (2.255290776354002e-08) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -342,23 +342,177 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.909911e+05 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.916130e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.911254e+05 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' +INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.016 [2.0162955953691082] fbridge_mode=1 + [UNWEIGHT] Wrote 1617 events (found 1622 events) + [COUNTERS] PROGRAM TOTAL : 0.4012s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3876s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0136s for 8192 events => throughput is 6.01E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.0162955499256161) and cpp (2.0162955953691082) differ by less than 2E-4 (2.253811048902321e-08) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x10_cudacpp > /tmp/avalassi/output_heftggbb_x10_cudacpp' +INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.043 [2.0434895701243878] fbridge_mode=1 + [UNWEIGHT] Wrote 1818 events (found 1823 events) + [COUNTERS] PROGRAM TOTAL : 1.4747s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3226s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1520s for 90112 events => throughput is 5.93E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.0434895240377569) and cpp (2.0434895701243878) differ by less than 2E-4 (2.255290776354002e-08) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.307270e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.967731e+05 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.162265e+05 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' +INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.016 [2.0162955953691082] fbridge_mode=1 + [UNWEIGHT] Wrote 1617 events (found 1622 events) + [COUNTERS] PROGRAM TOTAL : 0.4179s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3948s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0231s for 8192 events => throughput is 3.55E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.0162955499256161) and cpp (2.0162955953691082) differ by less than 2E-4 (2.253811048902321e-08) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x10_cudacpp > /tmp/avalassi/output_heftggbb_x10_cudacpp' +INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.043 [2.0434895701243878] fbridge_mode=1 + [UNWEIGHT] Wrote 1818 events (found 1823 events) + [COUNTERS] PROGRAM TOTAL : 1.5949s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3393s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2556s for 90112 events => throughput is 3.53E+05 events/s -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (2.0434895240377569) and cpp (2.0434895701243878) differ by less than 2E-4 (2.255290776354002e-08) -*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.332654e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.349365e+05 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -370,28 +524,28 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.hip_m_inl0_hrd0/madevent_hip < /tmp/valassia/input_heftggbb_x1_cudacpp > /tmp/valassia/output_heftggbb_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.016 [2.0162955503257880] fbridge_mode=1 + [XSECTION] Cross section = 2.016 [2.0162955503257827] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.8865s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8854s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0011s for 8192 events => throughput is 7.58E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.8124s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8118s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.37E+07 events/s -*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0162955499256290) and hip (2.0162955503257880) differ by less than 2E-4 (1.9846257970357328e-10) +OK! xsec from fortran (2.0162955499256161) and cuda (2.0162955503257827) differ by less than 2E-4 (1.9846613241725208e-10) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical -*** (3-hip) EXECUTE MADEVENT_HIP x10 (create events.lhe) *** +*** (3-cuda) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -403,65 +557,67 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_m_inl0_hrd0/madevent_hip < /tmp/valassia/input_heftggbb_x10_cudacpp > /tmp/valassia/output_heftggbb_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_heftggbb_x10_cudacpp > /tmp/avalassi/output_heftggbb_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.043 [2.0434895242795763] fbridge_mode=1 + [XSECTION] Cross section = 2.043 [2.0434895242795732] fbridge_mode=1 [UNWEIGHT] Wrote 1818 events (found 1823 events) - [COUNTERS] PROGRAM TOTAL : 4.7094s - [COUNTERS] Fortran Overhead ( 0 ) : 4.6983s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0111s for 90112 events => throughput is 8.11E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.7514s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7444s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0070s for 90112 events => throughput is 1.29E+07 events/s -*** (3-hip) Compare MADEVENT_HIP x10 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.0434895240377502) and hip (2.0434895242795763) differ by less than 2E-4 (1.1833978241782006e-10) +OK! 
xsec from fortran (2.0434895240377569) and cuda (2.0434895242795732) differ by less than 2E-4 (1.183348974365117e-10) -*** (3-hip) Compare MADEVENT_HIP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.10 and events.lhe.ref.10 are identical +OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.209592e+06 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.815756e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_HEFT_GG_BBX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.960419e+06 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.247313e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.541807e+07 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.109600e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_HEFT_GG_BBX_HIP [clang 14.0.0] [inlineHel=0] 
[hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.490798e+07 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.657132e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.541143e+07 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.111219e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_HEFT_GG_BBX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.479881e+07 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.039865e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_HEFT_GG_BBX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.505237e+07 ) sec^-1 +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.111200e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_HEFT_GG_BBX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.033287e+07 ) 
sec^-1 +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.771983e+07 ) sec^-1 + +*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt index 660bc06eac..80269e77b1 100644 --- a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt @@ -1,41 +1,41 @@ -Working directory (build): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx + +make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=hip make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y - +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' OMP_NUM_THREADS= -DATE: 2024-05-15_20:26:52 +DATE: 2024-05-16_06:00:18 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -49,18 +49,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_smeftggtttt_x1_fortran > /tmp/valassia/output_smeftggtttt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /tmp/avalassi/output_smeftggtttt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.99e-07 [7.9896697955084370E-007] fbridge_mode=0 + [XSECTION] Cross section = 7.99e-07 [7.9896697955084444E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 1041 events) - [COUNTERS] PROGRAM TOTAL : 1.7225s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3366s - [COUNTERS] Fortran MEs ( 1 ) : 1.3858s for 8192 events => throughput is 5.91E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.6807s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3392s + [COUNTERS] Fortran MEs ( 1 ) : 2.3415s for 8192 events => throughput is 3.50E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -74,18 +74,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_smeftggtttt_x1_fortran > /tmp/valassia/output_smeftggtttt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /tmp/avalassi/output_smeftggtttt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.99e-07 [7.9896697955084370E-007] fbridge_mode=0 + [XSECTION] Cross section = 7.99e-07 [7.9896697955084444E-007] fbridge_mode=0 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 1.6429s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2567s - [COUNTERS] Fortran MEs ( 1 ) : 1.3862s for 8192 events => throughput is 5.91E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.6731s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3387s + [COUNTERS] Fortran MEs ( 1 ) : 2.3344s for 8192 events => throughput is 3.51E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -99,18 +99,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_smeftggtttt_x10_fortran > /tmp/valassia/output_smeftggtttt_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x10_fortran > /tmp/avalassi/output_smeftggtttt_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.667e-07 [7.6668083551438261E-007] fbridge_mode=0 + [XSECTION] Cross section = 7.667e-07 [7.6668083551438230E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 16.5858s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3354s - [COUNTERS] Fortran MEs ( 1 ) : 15.2503s for 90112 events => throughput is 5.91E+03 events/s + [COUNTERS] PROGRAM TOTAL : 27.7347s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8315s + [COUNTERS] Fortran MEs ( 1 ) : 25.9031s for 90112 events => throughput is 3.48E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -124,22 +124,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_smeftggtttt_x1_cudacpp > /tmp/valassia/output_smeftggtttt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.99e-07 [7.9896697955084349E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.99e-07 [7.9896697955084454E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 4.3367s - [COUNTERS] Fortran Overhead ( 0 ) : 2.3009s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.0358s for 8192 events => throughput is 4.02E+03 events/s + [COUNTERS] PROGRAM TOTAL : 5.3198s + [COUNTERS] Fortran Overhead ( 0 ) : 2.7672s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.5526s for 8192 events => throughput is 3.21E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9896697955084370E-007) and cpp (7.9896697955084349E-007) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (7.9896697955084444E-007) and cpp (7.9896697955084454E-007) differ by less than 3E-14 (2.220446049250313e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -157,36 +157,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_smeftggtttt_x10_cudacpp > /tmp/valassia/output_smeftggtttt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x10_cudacpp > /tmp/avalassi/output_smeftggtttt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.667e-07 [7.6668083551438230E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.667e-07 [7.6668083551438187E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 25.7402s - [COUNTERS] Fortran Overhead ( 0 ) : 3.3444s - [COUNTERS] CudaCpp MEs ( 2 ) : 22.3958s for 90112 events => throughput is 4.02E+03 events/s + [COUNTERS] PROGRAM TOTAL : 31.9550s + [COUNTERS] Fortran Overhead ( 0 ) : 4.2173s + [COUNTERS] CudaCpp MEs ( 2 ) : 27.7377s for 90112 events => throughput is 3.25E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6668083551438261E-007) and cpp (7.6668083551438230E-007) differ by less than 3E-14 (4.440892098500626e-16) +OK! xsec from fortran (7.6668083551438230E-007) and cpp (7.6668083551438187E-007) differ by less than 3E-14 (5.551115123125783e-16) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.107270e+03 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.410579e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.064003e+03 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.412797e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -200,22 +200,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_smeftggtttt_x1_cudacpp > /tmp/valassia/output_smeftggtttt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.99e-07 [7.9896697955084317E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.99e-07 [7.9896697955084412E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 2.0924s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1907s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9017s for 8192 events => throughput is 9.08E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.9225s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6149s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.3076s for 8192 events => throughput is 6.27E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9896697955084370E-007) and cpp (7.9896697955084317E-007) differ by less than 3E-14 (6.661338147750939e-16) +OK! xsec from fortran (7.9896697955084444E-007) and cpp (7.9896697955084412E-007) differ by less than 3E-14 (4.440892098500626e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -233,36 +233,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_smeftggtttt_x10_cudacpp > /tmp/valassia/output_smeftggtttt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x10_cudacpp > /tmp/avalassi/output_smeftggtttt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.667e-07 [7.6668083551438272E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.667e-07 [7.6668083551438230E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 12.1607s - [COUNTERS] Fortran Overhead ( 0 ) : 2.2304s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.9302s for 90112 events => throughput is 9.07E+03 events/s + [COUNTERS] PROGRAM TOTAL : 17.4719s + [COUNTERS] Fortran Overhead ( 0 ) : 3.0716s + [COUNTERS] CudaCpp MEs ( 2 ) : 14.4004s for 90112 events => throughput is 6.26E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6668083551438261E-007) and cpp (7.6668083551438272E-007) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (7.6668083551438230E-007) and cpp (7.6668083551438230E-007) differ by less than 3E-14 (0.0) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.284013e+03 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.486190e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.297424e+03 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.480385e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -276,22 +276,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_smeftggtttt_x1_cudacpp > /tmp/valassia/output_smeftggtttt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.99e-07 [7.9896697955084359E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.99e-07 [7.9896697955084454E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 1.0703s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6717s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3985s for 8192 events => throughput is 2.06E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.4971s + [COUNTERS] Fortran Overhead ( 0 ) : 0.9161s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5810s for 8192 events => throughput is 1.41E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9896697955084370E-007) and cpp (7.9896697955084359E-007) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (7.9896697955084444E-007) and cpp (7.9896697955084454E-007) differ by less than 3E-14 (2.220446049250313e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -309,44 +309,190 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_smeftggtttt_x10_cudacpp > /tmp/valassia/output_smeftggtttt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x10_cudacpp > /tmp/avalassi/output_smeftggtttt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.667e-07 [7.6668083551438261E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.667e-07 [7.6668083551438198E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 6.0923s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7291s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.3632s for 90112 events => throughput is 2.07E+04 events/s + [COUNTERS] PROGRAM TOTAL : 8.7948s + [COUNTERS] Fortran Overhead ( 0 ) : 2.3698s + [COUNTERS] CudaCpp MEs ( 2 ) : 6.4250s for 90112 events => throughput is 1.40E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6668083551438261E-007) and cpp (7.6668083551438261E-007) differ by less than 3E-14 (0.0) +OK! xsec from fortran (7.6668083551438230E-007) and cpp (7.6668083551438198E-007) differ by less than 3E-14 (4.440892098500626e-16) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.120327e+04 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.446474e+04 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.453779e+04 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 7.99e-07 [7.9896697955084454E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 215 events (found 963 events) + [COUNTERS] PROGRAM TOTAL : 1.3443s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8339s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5104s for 8192 events => throughput is 1.60E+04 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (7.9896697955084444E-007) and cpp (7.9896697955084454E-007) differ by less than 3E-14 (2.220446049250313e-16) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x10_cudacpp > /tmp/avalassi/output_smeftggtttt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 7.667e-07 [7.6668083551438198E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 1700 events (found 1705 events) + [COUNTERS] PROGRAM TOTAL : 8.0840s + [COUNTERS] Fortran Overhead ( 0 ) : 2.3189s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.7651s for 90112 events => throughput is 1.56E+04 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (7.6668083551438230E-007) and cpp (7.6668083551438198E-007) differ by less than 3E-14 (4.440892098500626e-16) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.657111e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.118513e+04 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.653778e+04 ) sec^-1 -*** (2-512y) WARNING! 
SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 7.99e-07 [7.9896697955084454E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 215 events (found 963 events) + [COUNTERS] PROGRAM TOTAL : 1.7114s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0187s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6926s for 8192 events => throughput is 1.18E+04 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (7.9896697955084444E-007) and cpp (7.9896697955084454E-007) differ by less than 3E-14 (2.220446049250313e-16) -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x10_cudacpp > /tmp/avalassi/output_smeftggtttt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 7.667e-07 [7.6668083551438198E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 1700 events (found 1705 events) + [COUNTERS] PROGRAM TOTAL : 10.1275s + [COUNTERS] Fortran Overhead ( 0 ) : 2.5284s + [COUNTERS] CudaCpp MEs ( 2 ) : 7.5992s for 90112 events => throughput is 1.19E+04 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (7.6668083551438230E-007) and cpp (7.6668083551438198E-007) differ by less than 3E-14 (4.440892098500626e-16) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.225114e+04 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.220944e+04 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -358,28 +504,28 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_d_inl0_hrd0/madevent_hip < /tmp/valassia/input_smeftggtttt_x1_cudacpp > /tmp/valassia/output_smeftggtttt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.99e-07 [7.9896697955084359E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.99e-07 [7.9896697955084454E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 0.8518s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8019s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0499s for 8192 events => throughput is 1.64E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8326s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8156s + [COUNTERS] CudaCpp MEs ( 
2 ) : 0.0171s for 8192 events => throughput is 4.80E+05 events/s -*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9896697955084370E-007) and hip (7.9896697955084359E-007) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (7.9896697955084444E-007) and cuda (7.9896697955084454E-007) differ by less than 3E-14 (2.220446049250313e-16) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical -*** (3-hip) EXECUTE MADEVENT_HIP x10 (create events.lhe) *** +*** (3-cuda) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -391,65 +537,67 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.hip_d_inl0_hrd0/madevent_hip < /tmp/valassia/input_smeftggtttt_x10_cudacpp > /tmp/valassia/output_smeftggtttt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.cuda_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_smeftggtttt_x10_cudacpp > /tmp/avalassi/output_smeftggtttt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.667e-07 [7.6668083551438272E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.667e-07 [7.6668083551438198E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 2.3012s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7529s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5484s for 90112 events => throughput is 1.64E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.4753s + [COUNTERS] Fortran Overhead ( 0 ) : 2.2868s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1885s for 90112 events => throughput is 4.78E+05 events/s -*** (3-hip) Compare MADEVENT_HIP x10 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6668083551438261E-007) and hip (7.6668083551438272E-007) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (7.6668083551438230E-007) and cuda (7.6668083551438198E-007) differ by less than 3E-14 (4.440892098500626e-16) -*** (3-hip) Compare MADEVENT_HIP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.10 and events.lhe.ref.10 are identical +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.671302e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.843300e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.658395e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.218089e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.213705e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.155842e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.753070e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] 
+Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.421655e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.214795e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.190444e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.223896e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.415946e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.210300e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.148629e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.134102e+05 ) sec^-1 +Process = 
SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.764175e+05 ) sec^-1 + +*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt index e510c954ab..45b154f6da 100644 --- a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt @@ -1,41 +1,41 @@ -Working directory (build): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx -make USEBUILDDIR=1 BACKEND=hip + +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' OMP_NUM_THREADS= -DATE: 2024-05-15_20:28:46 +DATE: 2024-05-16_06:02:57 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -49,18 +49,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_smeftggtttt_x1_fortran > /tmp/valassia/output_smeftggtttt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /tmp/avalassi/output_smeftggtttt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.99e-07 [7.9896697955084370E-007] fbridge_mode=0 + [XSECTION] Cross section = 7.99e-07 [7.9896697955084444E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 1041 events) - [COUNTERS] PROGRAM TOTAL : 1.6441s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2590s - [COUNTERS] Fortran MEs ( 1 ) : 1.3851s for 8192 events => throughput is 5.91E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.6694s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3400s + [COUNTERS] Fortran MEs ( 1 ) : 2.3294s for 8192 events => throughput is 3.52E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -74,18 +74,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_smeftggtttt_x1_fortran > /tmp/valassia/output_smeftggtttt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /tmp/avalassi/output_smeftggtttt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.99e-07 [7.9896697955084370E-007] fbridge_mode=0 + [XSECTION] Cross section = 7.99e-07 [7.9896697955084444E-007] fbridge_mode=0 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 1.6534s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2614s - [COUNTERS] Fortran MEs ( 1 ) : 1.3921s for 8192 events => throughput is 5.88E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.6748s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3390s + [COUNTERS] Fortran MEs ( 1 ) : 2.3358s for 8192 events => throughput is 3.51E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -99,18 +99,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_smeftggtttt_x10_fortran > /tmp/valassia/output_smeftggtttt_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x10_fortran > /tmp/avalassi/output_smeftggtttt_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.667e-07 [7.6668083551438261E-007] fbridge_mode=0 + [XSECTION] Cross section = 7.667e-07 [7.6668083551438230E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 16.6000s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3362s - [COUNTERS] Fortran MEs ( 1 ) : 15.2638s for 90112 events => throughput is 5.90E+03 events/s + [COUNTERS] PROGRAM TOTAL : 27.4771s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8059s + [COUNTERS] Fortran MEs ( 1 ) : 25.6712s for 90112 events => throughput is 3.51E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -124,22 +124,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_smeftggtttt_x1_cudacpp > /tmp/valassia/output_smeftggtttt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.99e-07 [7.9896781657409323E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.99e-07 [7.9896785213255034E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 3.9769s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0958s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.8811s for 8192 events => throughput is 4.35E+03 events/s + [COUNTERS] PROGRAM TOTAL : 5.0871s + [COUNTERS] Fortran Overhead ( 0 ) : 2.6795s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.4076s for 8192 events => throughput is 3.40E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9896697955084370E-007) and cpp (7.9896781657409323E-007) differ by less than 4E-4 (1.0476318432761644e-06) +OK! xsec from fortran (7.9896697955084444E-007) and cpp (7.9896785213255034E-007) differ by less than 4E-4 (1.0921373827521563e-06) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -157,36 +157,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_smeftggtttt_x10_cudacpp > /tmp/valassia/output_smeftggtttt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x10_cudacpp > /tmp/avalassi/output_smeftggtttt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.667e-07 [7.6668135917139758E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.667e-07 [7.6668138359550833E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 23.8633s - [COUNTERS] Fortran Overhead ( 0 ) : 3.1760s - [COUNTERS] CudaCpp MEs ( 2 ) : 20.6873s for 90112 events => throughput is 4.36E+03 events/s + [COUNTERS] PROGRAM TOTAL : 30.6384s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1272s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.5112s for 90112 events => throughput is 3.40E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6668083551438261E-007) and cpp (7.6668135917139758E-007) differ by less than 4E-4 (6.830182661676787e-07) +OK! xsec from fortran (7.6668083551438230E-007) and cpp (7.6668138359550833E-007) differ by less than 4E-4 (7.148752136920677e-07) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.500510e+03 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.518646e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.505177e+03 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.519140e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -200,22 +200,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_smeftggtttt_x1_cudacpp > /tmp/valassia/output_smeftggtttt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.99e-07 [7.9896768403674554E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.99e-07 [7.9896766542858863E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 1.2268s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7483s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4785s for 8192 events => throughput is 1.71E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.6928s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0131s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6797s for 8192 events => throughput is 1.21E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9896697955084370E-007) and cpp (7.9896768403674554E-007) differ by less than 4E-4 (8.817459542509454e-07) +OK! xsec from fortran (7.9896697955084444E-007) and cpp (7.9896766542858863E-007) differ by less than 4E-4 (8.584556829838164e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -233,36 +233,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_smeftggtttt_x10_cudacpp > /tmp/valassia/output_smeftggtttt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x10_cudacpp > /tmp/avalassi/output_smeftggtttt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.667e-07 [7.6668124704275563E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.667e-07 [7.6668121906848987E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 7.0749s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8123s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.2627s for 90112 events => throughput is 1.71E+04 events/s + [COUNTERS] PROGRAM TOTAL : 9.9205s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4645s + [COUNTERS] CudaCpp MEs ( 2 ) : 7.4560s for 90112 events => throughput is 1.21E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6668083551438261E-007) and cpp (7.6668124704275563E-007) differ by less than 4E-4 (5.367662185840061e-07) +OK! xsec from fortran (7.6668083551438230E-007) and cpp (7.6668121906848987E-007) differ by less than 4E-4 (5.002787206720427e-07) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.746537e+04 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.232964e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.753023e+04 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.231409e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -276,22 +276,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_smeftggtttt_x1_cudacpp > /tmp/valassia/output_smeftggtttt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.99e-07 [7.9896764767194441E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.99e-07 [7.9896764408326359E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 0.6877s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4818s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2059s for 8192 events => throughput is 3.98E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.9294s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6306s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2988s for 8192 events => throughput is 2.74E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9896697955084370E-007) and cpp (7.9896764767194441E-007) differ by less than 4E-4 (8.362311807452727e-07) +OK! xsec from fortran (7.9896697955084444E-007) and cpp (7.9896764408326359E-007) differ by less than 4E-4 (8.31739528805997e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -309,44 +309,190 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_smeftggtttt_x10_cudacpp > /tmp/valassia/output_smeftggtttt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x10_cudacpp > /tmp/avalassi/output_smeftggtttt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.667e-07 [7.6668127564727394E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.667e-07 [7.6668124799901306E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 3.8088s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5452s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.2636s for 90112 events => throughput is 3.98E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.3691s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0900s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.2791s for 90112 events => throughput is 2.75E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6668083551438261E-007) and cpp (7.6668127564727394E-007) differ by less than 4E-4 (5.740757704764121e-07) +OK! xsec from fortran (7.6668083551438230E-007) and cpp (7.6668124799901306E-007) differ by less than 4E-4 (5.380134884269694e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.111165e+04 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.797581e+04 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.799365e+04 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 7.99e-07 [7.9896764408326359E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 215 events (found 963 events) + [COUNTERS] PROGRAM TOTAL : 0.8582s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5958s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2624s for 8192 events => throughput is 3.12E+04 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (7.9896697955084444E-007) and cpp (7.9896764408326359E-007) differ by less than 4E-4 (8.31739528805997e-07) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x10_cudacpp > /tmp/avalassi/output_smeftggtttt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 7.667e-07 [7.6668124799901306E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 1700 events (found 1705 events) + [COUNTERS] PROGRAM TOTAL : 4.9443s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0456s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.8987s for 90112 events => throughput is 3.11E+04 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (7.6668083551438230E-007) and cpp (7.6668124799901306E-007) differ by less than 4E-4 (5.380134884269694e-07) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.232867e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.110104e+04 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.231460e+04 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 7.99e-07 [7.9896778056937195E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 215 events (found 963 events) + [COUNTERS] PROGRAM TOTAL : 1.0285s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6828s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3457s for 8192 events => throughput is 2.37E+04 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (7.9896697955084444E-007) and cpp (7.9896778056937195E-007) differ by less than 4E-4 (1.0025677505964836e-06) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x10_cudacpp > /tmp/avalassi/output_smeftggtttt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 7.667e-07 [7.6668139178203571E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 1700 events (found 1705 events) + [COUNTERS] PROGRAM TOTAL : 5.9990s + [COUNTERS] Fortran Overhead ( 0 ) : 2.1683s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.8307s for 90112 events => throughput is 2.35E+04 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (7.6668083551438230E-007) and cpp (7.6668139178203571E-007) differ by less than 4E-4 (7.255530953820255e-07) -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.399591e+04 ) sec^-1 -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.392653e+04 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -358,28 +504,28 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.hip_f_inl0_hrd0/madevent_hip < /tmp/valassia/input_smeftggtttt_x1_cudacpp > /tmp/valassia/output_smeftggtttt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.99e-07 [7.9896777191982386E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.99e-07 [7.9896805369365078E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 0.6564s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6304s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0260s for 8192 events => throughput is 3.15E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8276s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8136s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0140s for 8192 events => throughput is 5.86E+05 events/s -*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9896697955084370E-007) and hip (7.9896777191982386E-007) differ by less than 4E-4 (9.917418370974929e-07) +OK! xsec from fortran (7.9896697955084444E-007) and cuda (7.9896805369365078E-007) differ by less than 4E-4 (1.3444145174901223e-06) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical -*** (3-hip) EXECUTE MADEVENT_HIP x10 (create events.lhe) *** +*** (3-cuda) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -391,65 +537,67 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_f_inl0_hrd0/madevent_hip < /tmp/valassia/input_smeftggtttt_x10_cudacpp > /tmp/valassia/output_smeftggtttt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.cuda_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_smeftggtttt_x10_cudacpp > /tmp/avalassi/output_smeftggtttt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.667e-07 [7.6668141007936531E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.667e-07 [7.6668194616292154E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 1.9498s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6655s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2843s for 90112 events => throughput is 3.17E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.4309s + [COUNTERS] Fortran Overhead ( 0 ) : 2.2769s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1540s for 90112 events => throughput is 5.85E+05 events/s -*** (3-hip) Compare MADEVENT_HIP x10 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6668083551438261E-007) and hip (7.6668141007936531E-007) differ by less than 4E-4 (7.494187359569082e-07) +OK! 
xsec from fortran (7.6668083551438230E-007) and cuda (7.6668194616292154E-007) differ by less than 4E-4 (1.4486452351025747e-06) -*** (3-hip) Compare MADEVENT_HIP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.10 and events.lhe.ref.10 are identical +OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.236838e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.229528e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.235081e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.512458e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.176113e+06 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK 
+EvtsPerSec[MECalcOnly] (3a) = ( 1.376887e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.011828e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.385341e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.174356e+06 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.353397e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.963341e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.396782e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.170684e+06 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] 
[hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.372639e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.676487e+04 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.860961e+05 ) sec^-1 + +*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt index 39374e1be7..66daeb0e97 100644 --- a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt @@ -1,41 +1,41 @@ -Working directory (build): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx -make USEBUILDDIR=1 BACKEND=hip + +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory 
'/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' OMP_NUM_THREADS= -DATE: 2024-05-15_20:30:24 +DATE: 2024-05-16_06:05:05 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -49,18 +49,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_smeftggtttt_x1_fortran > /tmp/valassia/output_smeftggtttt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /tmp/avalassi/output_smeftggtttt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.99e-07 [7.9896697955084370E-007] fbridge_mode=0 + [XSECTION] Cross section = 7.99e-07 [7.9896697955084444E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 1041 events) - [COUNTERS] PROGRAM TOTAL : 1.6769s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2915s - [COUNTERS] Fortran MEs ( 1 ) : 1.3854s for 8192 events => throughput is 5.91E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7059s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3465s + [COUNTERS] Fortran MEs ( 1 ) : 2.3593s for 8192 events => throughput is 3.47E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -74,18 +74,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_smeftggtttt_x1_fortran > /tmp/valassia/output_smeftggtttt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /tmp/avalassi/output_smeftggtttt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.99e-07 [7.9896697955084370E-007] fbridge_mode=0 + [XSECTION] Cross section = 7.99e-07 [7.9896697955084444E-007] fbridge_mode=0 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 1.6679s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2823s - [COUNTERS] Fortran MEs ( 1 ) : 1.3856s for 8192 events => throughput is 5.91E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7035s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3438s + [COUNTERS] Fortran MEs ( 1 ) : 2.3597s for 8192 events => throughput is 3.47E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -99,18 +99,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_smeftggtttt_x10_fortran > /tmp/valassia/output_smeftggtttt_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x10_fortran > /tmp/avalassi/output_smeftggtttt_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.667e-07 [7.6668083551438261E-007] fbridge_mode=0 + [XSECTION] Cross section = 7.667e-07 [7.6668083551438230E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 16.5784s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3357s - [COUNTERS] Fortran MEs ( 1 ) : 15.2427s for 90112 events => throughput is 5.91E+03 events/s + [COUNTERS] PROGRAM TOTAL : 27.5770s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8193s + [COUNTERS] Fortran MEs ( 1 ) : 25.7577s for 90112 events => throughput is 3.50E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -124,22 +124,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_smeftggtttt_x1_cudacpp > /tmp/valassia/output_smeftggtttt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.99e-07 [7.9896696375074458E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.99e-07 [7.9896696375074447E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 4.3391s - [COUNTERS] Fortran Overhead ( 0 ) : 2.2879s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.0512s for 8192 events => throughput is 3.99E+03 events/s + [COUNTERS] PROGRAM TOTAL : 5.3005s + [COUNTERS] Fortran Overhead ( 0 ) : 2.7718s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.5286s for 8192 events => throughput is 3.24E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9896697955084370E-007) and cpp (7.9896696375074458E-007) differ by less than 2E-4 (1.9775659776399834e-08) +OK! xsec from fortran (7.9896697955084444E-007) and cpp (7.9896696375074447E-007) differ by less than 2E-4 (1.9775660775600556e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -157,36 +157,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_smeftggtttt_x10_cudacpp > /tmp/valassia/output_smeftggtttt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x10_cudacpp > /tmp/avalassi/output_smeftggtttt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.667e-07 [7.6668081976882384E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.667e-07 [7.6668081976882373E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 26.0217s - [COUNTERS] Fortran Overhead ( 0 ) : 3.3613s - [COUNTERS] CudaCpp MEs ( 2 ) : 22.6604s for 90112 events => throughput is 3.98E+03 events/s + [COUNTERS] PROGRAM TOTAL : 31.9857s + [COUNTERS] Fortran Overhead ( 0 ) : 4.2191s + [COUNTERS] CudaCpp MEs ( 2 ) : 27.7666s for 90112 events => throughput is 3.25E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6668083551438261E-007) and cpp (7.6668081976882384E-007) differ by less than 2E-4 (2.0537305855938826e-08) +OK! xsec from fortran (7.6668083551438230E-007) and cpp (7.6668081976882373E-007) differ by less than 2E-4 (2.0537305522871918e-08) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.091466e+03 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.386546e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.087797e+03 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.397002e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -200,22 +200,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_smeftggtttt_x1_cudacpp > /tmp/valassia/output_smeftggtttt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.99e-07 [7.9896696285825699E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.99e-07 [7.9896696285825688E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 2.0554s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1572s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8982s for 8192 events => throughput is 9.12E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.8744s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5934s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.2810s for 8192 events => throughput is 6.40E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9896697955084370E-007) and cpp (7.9896696285825699E-007) differ by less than 2E-4 (2.0892711671827158e-08) +OK! xsec from fortran (7.9896697955084444E-007) and cpp (7.9896696285825688E-007) differ by less than 2E-4 (2.089271267102788e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -233,36 +233,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_smeftggtttt_x10_cudacpp > /tmp/valassia/output_smeftggtttt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x10_cudacpp > /tmp/avalassi/output_smeftggtttt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.667e-07 [7.6668081890954439E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.667e-07 [7.6668081890954375E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 12.1084s - [COUNTERS] Fortran Overhead ( 0 ) : 2.2445s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.8639s for 90112 events => throughput is 9.14E+03 events/s + [COUNTERS] PROGRAM TOTAL : 17.1795s + [COUNTERS] Fortran Overhead ( 0 ) : 3.0414s + [COUNTERS] CudaCpp MEs ( 2 ) : 14.1381s for 90112 events => throughput is 6.37E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6668083551438261E-007) and cpp (7.6668081890954439E-007) differ by less than 2E-4 (2.1658084325970606e-08) +OK! xsec from fortran (7.6668083551438230E-007) and cpp (7.6668081890954375E-007) differ by less than 2E-4 (2.1658084770059816e-08) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.341538e+03 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.678262e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.375998e+03 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.743588e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -276,8 +276,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_smeftggtttt_x1_cudacpp > /tmp/valassia/output_smeftggtttt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -285,13 +285,13 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.99e-07 [7.9896696427369838E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 1.0608s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6632s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3976s for 8192 events => throughput is 2.06E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.4779s + [COUNTERS] Fortran Overhead ( 0 ) : 0.9015s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5764s for 8192 events => throughput is 1.42E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9896697955084370E-007) and cpp (7.9896696427369838E-007) differ by less than 2E-4 (1.912112224111695e-08) +OK! xsec from fortran (7.9896697955084444E-007) and cpp (7.9896696427369838E-007) differ by less than 2E-4 (1.9121123240317672e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -309,8 +309,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_smeftggtttt_x10_cudacpp > /tmp/valassia/output_smeftggtttt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x10_cudacpp > /tmp/avalassi/output_smeftggtttt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -318,35 +318,181 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_smeftggt [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.667e-07 [7.6668082030339872E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 6.0854s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7321s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.3533s for 90112 events => throughput is 2.07E+04 events/s + [COUNTERS] PROGRAM TOTAL : 8.7035s + [COUNTERS] Fortran Overhead ( 0 ) : 2.3540s + [COUNTERS] CudaCpp MEs ( 2 ) : 6.3495s for 90112 events => throughput is 1.42E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6668083551438261E-007) and cpp (7.6668082030339872E-007) differ by less than 2E-4 (1.984004716071297e-08) +OK! xsec from fortran (7.6668083551438230E-007) and cpp (7.6668082030339872E-007) differ by less than 2E-4 (1.984004671662376e-08) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.116124e+04 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.454724e+04 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.476512e+04 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 7.99e-07 [7.9896696427369838E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 215 events (found 963 events) + [COUNTERS] PROGRAM TOTAL : 1.3378s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8314s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5064s for 8192 events => throughput is 1.62E+04 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (7.9896697955084444E-007) and cpp (7.9896696427369838E-007) differ by less than 2E-4 (1.9121123240317672e-08) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x10_cudacpp > /tmp/avalassi/output_smeftggtttt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 7.667e-07 [7.6668082030339872E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 1700 events (found 1705 events) + [COUNTERS] PROGRAM TOTAL : 7.8877s + [COUNTERS] Fortran Overhead ( 0 ) : 2.3042s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.5835s for 90112 events => throughput is 1.61E+04 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (7.6668083551438230E-007) and cpp (7.6668082030339872E-007) differ by less than 2E-4 (1.984004671662376e-08) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.679497e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.120997e+04 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.670221e+04 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 7.99e-07 [7.9896696427369838E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 215 events (found 963 events) + [COUNTERS] PROGRAM TOTAL : 1.7201s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0244s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6957s for 8192 events => throughput is 1.18E+04 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (7.9896697955084444E-007) and cpp (7.9896696427369838E-007) differ by less than 2E-4 (1.9121123240317672e-08) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_smeftggtttt_x10_cudacpp > /tmp/avalassi/output_smeftggtttt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 7.667e-07 [7.6668082030339872E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 1700 events (found 1705 events) + [COUNTERS] PROGRAM TOTAL : 10.1095s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4855s + [COUNTERS] CudaCpp MEs ( 2 ) : 7.6240s for 90112 events => throughput is 1.18E+04 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (7.6668083551438230E-007) and cpp (7.6668082030339872E-007) differ by less than 2E-4 (1.984004671662376e-08) -*** (3-cuda) WARNING! SKIP MADEVENT_CUDA (cuda is not supported on this node) *** +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (3-hip) EXECUTE MADEVENT_HIP x1 (create events.lhe) *** +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.204025e+04 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.203510e+04 ) sec^-1 + +*** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -358,28 +504,28 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.hip_m_inl0_hrd0/madevent_hip < /tmp/valassia/input_smeftggtttt_x1_cudacpp > /tmp/valassia/output_smeftggtttt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_smeftggtttt_x1_cudacpp > /tmp/avalassi/output_smeftggtttt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.99e-07 [7.9896697918297697E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.99e-07 [7.9896697918297644E-007] fbridge_mode=1 [UNWEIGHT] Wrote 215 events (found 963 events) - [COUNTERS] PROGRAM TOTAL : 0.7505s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7005s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0500s for 8192 events => throughput is 1.64E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8365s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8192s + [COUNTERS] CudaCpp MEs ( 
2 ) : 0.0172s for 8192 events => throughput is 4.75E+05 events/s -*** (3-hip) Compare MADEVENT_HIP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9896697955084370E-007) and hip (7.9896697918297697E-007) differ by less than 2E-4 (4.604279180142612e-10) +OK! xsec from fortran (7.9896697955084444E-007) and cuda (7.9896697918297644E-007) differ by less than 2E-4 (4.6042958334879813e-10) -*** (3-hip) Compare MADEVENT_HIP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.1 and events.lhe.ref.1 are identical +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical -*** (3-hip) EXECUTE MADEVENT_HIP x10 (create events.lhe) *** +*** (3-cuda) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -391,65 +537,67 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.hip_m_inl0_hrd0/madevent_hip < /tmp/valassia/input_smeftggtttt_x10_cudacpp > /tmp/valassia/output_smeftggtttt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.cuda_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_smeftggtttt_x10_cudacpp > /tmp/avalassi/output_smeftggtttt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.667e-07 [7.6668083551547613E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.667e-07 [7.6668083551547592E-007] fbridge_mode=1 [UNWEIGHT] Wrote 1700 events (found 1705 events) - [COUNTERS] PROGRAM TOTAL : 2.2767s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7276s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5491s for 90112 events => throughput is 1.64E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.4662s + [COUNTERS] Fortran Overhead ( 0 ) : 2.2768s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1894s for 90112 events => throughput is 4.76E+05 events/s -*** (3-hip) Compare MADEVENT_HIP x10 xsec to MADEVENT_FORTRAN xsec *** +*** (3-cuda) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6668083551438261E-007) and hip (7.6668083551547613E-007) differ by less than 2E-4 (1.426192497433476e-12) +OK! xsec from fortran (7.6668083551438230E-007) and cuda (7.6668083551547592E-007) differ by less than 2E-4 (1.4264145420384011e-12) -*** (3-hip) Compare MADEVENT_HIP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (3-cuda) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -OK! events.lhe.hip.10 and events.lhe.ref.10 are identical +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.665614e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.814879e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.657467e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.185918e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.209272e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.154361e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.748491e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] 
+Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.382253e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.206625e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.160102e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.213329e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.387193e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.205557e+05 ) sec^-1 +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.108981e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.132251e+05 ) sec^-1 +Process = 
SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.750323e+05 ) sec^-1 + +*** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt index be43c59a71..059122dda6 100644 --- a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt @@ -1,41 +1,41 @@ -Working directory (build): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x -make USEBUILDDIR=1 BACKEND=hip -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: 
Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' OMP_NUM_THREADS= -DATE: 2024-05-15_20:26:40 +DATE: 2024-05-16_06:00:05 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -49,8 +49,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_susyggt1t1_x1_fortran > /tmp/valassia/output_susyggt1t1_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tmp/avalassi/output_susyggt1t1_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.4876 [0.48763077179780701] fbridge_mode=0 [UNWEIGHT] Wrote 685 events (found 2208 events) - [COUNTERS] PROGRAM TOTAL : 0.4893s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4835s - [COUNTERS] Fortran MEs ( 1 ) : 0.0059s for 8192 events => throughput is 1.40E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4148s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4054s + [COUNTERS] Fortran MEs ( 1 ) : 0.0094s for 8192 events => throughput is 8.76E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -74,8 +74,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_susyggt1t1_x1_fortran > /tmp/valassia/output_susyggt1t1_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tmp/avalassi/output_susyggt1t1_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.4876 [0.48763077179780701] fbridge_mode=0 [UNWEIGHT] Wrote 648 events (found 1275 events) - [COUNTERS] PROGRAM TOTAL : 0.2460s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2402s - [COUNTERS] Fortran MEs ( 1 ) : 0.0058s for 8192 events => throughput is 1.41E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3154s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3059s + [COUNTERS] Fortran MEs ( 1 ) : 0.0094s for 8192 events => throughput is 8.68E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -99,18 +99,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_susyggt1t1_x10_fortran > /tmp/valassia/output_susyggt1t1_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x10_fortran > /tmp/avalassi/output_susyggt1t1_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.4762 [0.47620722822826017] fbridge_mode=0 + [XSECTION] Cross section = 0.4762 [0.47620722822826000] fbridge_mode=0 [UNWEIGHT] Wrote 1784 events (found 1789 events) - [COUNTERS] PROGRAM TOTAL : 1.0929s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0305s - [COUNTERS] Fortran MEs ( 1 ) : 0.0624s for 90112 events => throughput is 1.44E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.3773s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2771s + [COUNTERS] Fortran MEs ( 1 ) : 0.1002s for 90112 events => throughput is 8.99E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -124,15 +124,15 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_susyggt1t1_x1_cudacpp > /tmp/valassia/output_susyggt1t1_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 [XSECTION] ERROR! 
No cross section in log file: - /tmp/valassia/output_susyggt1t1_x1_cudacpp + /tmp/avalassi/output_susyggt1t1_x1_cudacpp ... xqcutij # 3> 0.0 0.0 RESET CUMULATIVE VARIABLE diff --git a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt index 3b008415cf..01167da954 100644 --- a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt @@ -1,41 +1,41 @@ -Working directory (build): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x -make USEBUILDDIR=1 BACKEND=hip -make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' OMP_NUM_THREADS= -DATE: 2024-05-15_20:26:44 +DATE: 2024-05-16_06:00:09 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -49,8 +49,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_susyggt1t1_x1_fortran > /tmp/valassia/output_susyggt1t1_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tmp/avalassi/output_susyggt1t1_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.4876 [0.48763077179780701] fbridge_mode=0 [UNWEIGHT] Wrote 685 events (found 2208 events) - [COUNTERS] PROGRAM TOTAL : 0.3149s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3091s - [COUNTERS] Fortran MEs ( 1 ) : 0.0058s for 8192 events => throughput is 1.40E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4095s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4004s + [COUNTERS] Fortran MEs ( 1 ) : 0.0092s for 8192 events => throughput is 8.95E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -74,8 +74,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_susyggt1t1_x1_fortran > /tmp/valassia/output_susyggt1t1_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tmp/avalassi/output_susyggt1t1_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.4876 [0.48763077179780701] fbridge_mode=0 [UNWEIGHT] Wrote 648 events (found 1275 events) - [COUNTERS] PROGRAM TOTAL : 0.2480s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2421s - [COUNTERS] Fortran MEs ( 1 ) : 0.0058s for 8192 events => throughput is 1.41E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3199s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3104s + [COUNTERS] Fortran MEs ( 1 ) : 0.0095s for 8192 events => throughput is 8.61E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -99,18 +99,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_susyggt1t1_x10_fortran > /tmp/valassia/output_susyggt1t1_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x10_fortran > /tmp/avalassi/output_susyggt1t1_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.4762 [0.47620722822826017] fbridge_mode=0 + [XSECTION] Cross section = 0.4762 [0.47620722822826000] fbridge_mode=0 [UNWEIGHT] Wrote 1784 events (found 1789 events) - [COUNTERS] PROGRAM TOTAL : 1.1174s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0550s - [COUNTERS] Fortran MEs ( 1 ) : 0.0624s for 90112 events => throughput is 1.44E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.3992s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2973s + [COUNTERS] Fortran MEs ( 1 ) : 0.1018s for 90112 events => throughput is 8.85E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -124,15 +124,15 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_susyggt1t1_x1_cudacpp > /tmp/valassia/output_susyggt1t1_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 [XSECTION] ERROR! 
No cross section in log file: - /tmp/valassia/output_susyggt1t1_x1_cudacpp + /tmp/avalassi/output_susyggt1t1_x1_cudacpp ... xqcutij # 3> 0.0 0.0 RESET CUMULATIVE VARIABLE diff --git a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt index 41f5f3e87b..6c876298cd 100644 --- a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt @@ -1,41 +1,41 @@ -Working directory (build): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x -make USEBUILDDIR=1 BACKEND=hip +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppsse4 -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' OMP_NUM_THREADS= -DATE: 2024-05-15_20:26:48 +DATE: 2024-05-16_06:00:14 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -49,8 +49,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_susyggt1t1_x1_fortran > /tmp/valassia/output_susyggt1t1_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tmp/avalassi/output_susyggt1t1_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.4876 [0.48763077179780701] fbridge_mode=0 [UNWEIGHT] Wrote 685 events (found 2208 events) - [COUNTERS] PROGRAM TOTAL : 0.3111s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3053s - [COUNTERS] Fortran MEs ( 1 ) : 0.0058s for 8192 events => throughput is 1.40E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4113s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4020s + [COUNTERS] Fortran MEs ( 1 ) : 0.0094s for 8192 events => throughput is 8.76E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -74,8 +74,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_susyggt1t1_x1_fortran > /tmp/valassia/output_susyggt1t1_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tmp/avalassi/output_susyggt1t1_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.4876 [0.48763077179780701] fbridge_mode=0 [UNWEIGHT] Wrote 648 events (found 1275 events) - [COUNTERS] PROGRAM TOTAL : 0.2484s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2426s - [COUNTERS] Fortran MEs ( 1 ) : 0.0058s for 8192 events => throughput is 1.41E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3138s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3043s + [COUNTERS] Fortran MEs ( 1 ) : 0.0094s for 8192 events => throughput is 8.67E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -99,18 +99,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_susyggt1t1_x10_fortran > /tmp/valassia/output_susyggt1t1_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x10_fortran > /tmp/avalassi/output_susyggt1t1_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.4762 [0.47620722822826017] fbridge_mode=0 + [XSECTION] Cross section = 0.4762 [0.47620722822826000] fbridge_mode=0 [UNWEIGHT] Wrote 1784 events (found 1789 events) - [COUNTERS] PROGRAM TOTAL : 1.0995s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0371s - [COUNTERS] Fortran MEs ( 1 ) : 0.0624s for 90112 events => throughput is 1.44E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.3758s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2758s + [COUNTERS] Fortran MEs ( 1 ) : 0.1001s for 90112 events => throughput is 9.01E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -124,15 +124,15 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_susyggt1t1_x1_cudacpp > /tmp/valassia/output_susyggt1t1_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggt1t1_x1_cudacpp > /tmp/avalassi/output_susyggt1t1_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/4 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 [XSECTION] ERROR! 
No cross section in log file: - /tmp/valassia/output_susyggt1t1_x1_cudacpp + /tmp/avalassi/output_susyggt1t1_x1_cudacpp ... xqcutij # 3> 0.0 0.0 RESET CUMULATIVE VARIABLE diff --git a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt index 70943a347c..fd24a61552 100644 --- a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt @@ -1,41 +1,41 @@ -Working directory (build): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx -make USEBUILDDIR=1 BACKEND=hip + +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2024-05-15_20:26:25 +DATE: 2024-05-16_05:59:47 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -49,18 +49,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_susyggtt_x1_fortran > /tmp/valassia/output_susyggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/avalassi/output_susyggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.6 [44.598860065419849] fbridge_mode=0 + [XSECTION] Cross section = 44.6 [44.598860065419856] fbridge_mode=0 [UNWEIGHT] Wrote 2620 events (found 5403 events) - [COUNTERS] PROGRAM TOTAL : 0.6804s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6518s - [COUNTERS] Fortran MEs ( 1 ) : 0.0286s for 8192 events => throughput is 2.86E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8237s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7798s + [COUNTERS] Fortran MEs ( 1 ) : 0.0440s for 8192 events => throughput is 1.86E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -74,18 +74,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_susyggtt_x1_fortran > /tmp/valassia/output_susyggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/avalassi/output_susyggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.6 [44.598860065419849] fbridge_mode=0 + [XSECTION] Cross section = 44.6 [44.598860065419856] fbridge_mode=0 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3179s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2894s - [COUNTERS] Fortran MEs ( 1 ) : 0.0285s for 8192 events => throughput is 2.87E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4191s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3754s + [COUNTERS] Fortran MEs ( 1 ) : 0.0437s for 8192 events => throughput is 1.87E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -99,18 +99,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_susyggtt_x10_fortran > /tmp/valassia/output_susyggtt_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x10_fortran > /tmp/avalassi/output_susyggtt_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.58 [44.577523870256471] fbridge_mode=0 + [XSECTION] Cross section = 44.58 [44.577523870256456] fbridge_mode=0 [UNWEIGHT] Wrote 1743 events (found 1748 events) - [COUNTERS] PROGRAM TOTAL : 1.3369s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0254s - [COUNTERS] Fortran MEs ( 1 ) : 0.3115s for 90112 events => throughput is 2.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8130s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3278s + [COUNTERS] Fortran MEs ( 1 ) : 0.4852s for 90112 events => throughput is 1.86E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -124,19 +124,19 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_susyggtt_x1_cudacpp > /tmp/valassia/output_susyggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 171.8 [171.81273026311092] fbridge_mode=1 + [XSECTION] Cross section = 171.8 [171.81273026311101] fbridge_mode=1 [UNWEIGHT] Wrote 2338 events (found 3965 events) - [COUNTERS] PROGRAM TOTAL : 0.5519s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5198s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0322s for 8192 events => throughput is 2.55E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.7007s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6611s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0395s for 8192 events => throughput is 2.07E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -ERROR! xsec from fortran (44.598860065419849) and cpp (171.81273026311092) differ by more than 3E-14 (2.8524018329411867) +ERROR! 
xsec from fortran (44.598860065419856) and cpp (171.81273026311101) differ by more than 3E-14 (2.852401832941188) diff --git a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt index 425a8480ec..293718b73f 100644 --- a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt @@ -1,41 +1,41 @@ -Working directory (build): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx -make USEBUILDDIR=1 BACKEND=hip +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2024-05-15_20:26:30 +DATE: 2024-05-16_05:59:53 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -49,18 +49,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_susyggtt_x1_fortran > /tmp/valassia/output_susyggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/avalassi/output_susyggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.6 [44.598860065419849] fbridge_mode=0 + [XSECTION] Cross section = 44.6 [44.598860065419856] fbridge_mode=0 [UNWEIGHT] Wrote 2620 events (found 5403 events) - [COUNTERS] PROGRAM TOTAL : 0.5880s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5595s - [COUNTERS] Fortran MEs ( 1 ) : 0.0286s for 8192 events => throughput is 2.87E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8342s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7896s + [COUNTERS] Fortran MEs ( 1 ) : 0.0446s for 8192 events => throughput is 1.84E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -74,18 +74,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_susyggtt_x1_fortran > /tmp/valassia/output_susyggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/avalassi/output_susyggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.6 [44.598860065419849] fbridge_mode=0 + [XSECTION] Cross section = 44.6 [44.598860065419856] fbridge_mode=0 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3208s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2922s - [COUNTERS] Fortran MEs ( 1 ) : 0.0286s for 8192 events => throughput is 2.86E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4178s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3733s + [COUNTERS] Fortran MEs ( 1 ) : 0.0444s for 8192 events => throughput is 1.84E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -99,18 +99,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_susyggtt_x10_fortran > /tmp/valassia/output_susyggtt_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x10_fortran > /tmp/avalassi/output_susyggtt_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.58 [44.577523870256471] fbridge_mode=0 + [XSECTION] Cross section = 44.58 [44.577523870256456] fbridge_mode=0 [UNWEIGHT] Wrote 1743 events (found 1748 events) - [COUNTERS] PROGRAM TOTAL : 1.3417s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0294s - [COUNTERS] Fortran MEs ( 1 ) : 0.3122s for 90112 events => throughput is 2.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8125s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3299s + [COUNTERS] Fortran MEs ( 1 ) : 0.4826s for 90112 events => throughput is 1.87E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -124,19 +124,19 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_susyggtt_x1_cudacpp > /tmp/valassia/output_susyggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 171.8 [171.81269679287095] fbridge_mode=1 + [XSECTION] Cross section = 171.8 [171.81270286137041] fbridge_mode=1 [UNWEIGHT] Wrote 2338 events (found 3965 events) - [COUNTERS] PROGRAM TOTAL : 0.5181s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4904s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0276s for 8192 events => throughput is 2.96E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.7025s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6657s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0367s for 8192 events => throughput is 2.23E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -ERROR! xsec from fortran (44.598860065419849) and cpp (171.81269679287095) differ by more than 4E-4 (2.8524010824681945) +ERROR! 
xsec from fortran (44.598860065419856) and cpp (171.81270286137041) differ by more than 4E-4 (2.8524012185366816) diff --git a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt index 1003afbf74..f9ac9cdc3d 100644 --- a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt @@ -1,41 +1,41 @@ -Working directory (build): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx -make USEBUILDDIR=1 BACKEND=hip +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 BACKEND=cppsse4 -make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering 
directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2024-05-15_20:26:35 +DATE: 2024-05-16_05:59:59 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -49,18 +49,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_susyggtt_x1_fortran > /tmp/valassia/output_susyggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/avalassi/output_susyggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.6 [44.598860065419849] fbridge_mode=0 + [XSECTION] Cross section = 44.6 [44.598860065419856] fbridge_mode=0 [UNWEIGHT] Wrote 2620 events (found 5403 events) - [COUNTERS] PROGRAM TOTAL : 0.5896s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5608s - [COUNTERS] Fortran MEs ( 1 ) : 0.0288s for 8192 events => throughput is 2.84E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8401s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7957s + [COUNTERS] Fortran MEs ( 1 ) : 0.0444s for 8192 events => throughput is 1.84E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -74,18 +74,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_susyggtt_x1_fortran > /tmp/valassia/output_susyggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/avalassi/output_susyggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.6 [44.598860065419849] fbridge_mode=0 + [XSECTION] Cross section = 44.6 [44.598860065419856] fbridge_mode=0 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3195s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2910s - [COUNTERS] Fortran MEs ( 1 ) : 0.0285s for 8192 events => throughput is 2.87E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4268s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3820s + [COUNTERS] Fortran MEs ( 1 ) : 0.0448s for 8192 events => throughput is 1.83E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -99,18 +99,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_susyggtt_x10_fortran > /tmp/valassia/output_susyggtt_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x10_fortran > /tmp/avalassi/output_susyggtt_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.58 [44.577523870256471] fbridge_mode=0 + [XSECTION] Cross section = 44.58 [44.577523870256456] fbridge_mode=0 [UNWEIGHT] Wrote 1743 events (found 1748 events) - [COUNTERS] PROGRAM TOTAL : 1.3423s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0301s - [COUNTERS] Fortran MEs ( 1 ) : 0.3122s for 90112 events => throughput is 2.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8476s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3594s + [COUNTERS] Fortran MEs ( 1 ) : 0.4881s for 90112 events => throughput is 1.85E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -124,19 +124,19 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_susyggtt_x1_cudacpp > /tmp/valassia/output_susyggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_susyggtt_x1_cudacpp > /tmp/avalassi/output_susyggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 171.8 [171.81273490068895] fbridge_mode=1 + [XSECTION] Cross section = 171.8 [171.81273490068889] fbridge_mode=1 [UNWEIGHT] Wrote 2338 events (found 3965 events) - [COUNTERS] PROGRAM TOTAL : 0.5275s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4943s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0331s for 8192 events => throughput is 2.47E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.7032s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6624s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0408s for 8192 events => throughput is 2.01E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -ERROR! xsec from fortran (44.598860065419849) and cpp (171.81273490068895) differ by more than 2E-4 (2.8524019369254145) +ERROR! 
xsec from fortran (44.598860065419856) and cpp (171.81273490068889) differ by more than 2E-4 (2.8524019369254128) diff --git a/epochX/cudacpp/tmad/madX.sh b/epochX/cudacpp/tmad/madX.sh index 0926cf6223..eaeaf654fc 100755 --- a/epochX/cudacpp/tmad/madX.sh +++ b/epochX/cudacpp/tmad/madX.sh @@ -224,7 +224,7 @@ function getnevt() echo $nevt } -# Determine the appropriate CUDA/HIP grid dimension for the specific process (to run the fastest gcheck) +# Determine the appropriate CUDA/HIP grid dimension for the specific process (to run the fastest check_cuda or check_hip) function getgridmax() { if [ "${eemumu}" == "1" ]; then @@ -313,14 +313,14 @@ EOF echo ${tmp} } -# Run check.exe or gcheck.exe (depending on $1) and parse its output +# Run check_(cpp|cuda|hip).exe (depending on $1) and parse its output function runcheck() { - if [ "$1" == "" ] || [ "$2" != "" ]; then echo "Usage: runcheck "; exit 1; fi + if [ "$1" == "" ] || [ "$2" != "" ]; then echo "Usage: runcheck "; exit 1; fi cmd=$1 if [ "${cmd/gcheckmax128thr}" != "$cmd" ]; then txt="GCHECK(MAX128THR)" - cmd=${cmd/gcheckmax128thr/gcheck} # hack: run cuda/hip gcheck with tput fastest settings + cmd=${cmd/gcheckmax128thr/check_${backend}} # hack: run cuda/hip check with tput fastest settings cmd=${cmd/.\//.\/build.${backend}_${fptype}_inl0_hrd0\/} nblk=$(getgridmax | cut -d ' ' -f1) nthr=$(getgridmax | cut -d ' ' -f2) @@ -328,7 +328,7 @@ function runcheck() (( nevt = nblk*nthr )) elif [ "${cmd/gcheckmax8thr}" != "$cmd" ]; then txt="GCHECK(MAX8THR)" - cmd=${cmd/gcheckmax8thr/gcheck} # hack: run cuda/hip gcheck with tput fastest settings + cmd=${cmd/gcheckmax8thr/check_${backend}} # hack: run cuda/hip check with tput fastest settings cmd=${cmd/.\//.\/build.${backend}_${fptype}_inl0_hrd0\/} nblk=$(getgridmax | cut -d ' ' -f1) nthr=$(getgridmax | cut -d ' ' -f2) @@ -336,13 +336,14 @@ function runcheck() (( nevt = nblk*nthr )) elif [ "${cmd/gcheckmax}" != "$cmd" ]; then txt="GCHECK(MAX)" - cmd=${cmd/gcheckmax/gcheck} # 
hack: run cuda/hip gcheck with tput fastest settings + cmd=${cmd/gcheckmax/check_${backend}} # hack: run cuda/hip check with tput fastest settings cmd=${cmd/.\//.\/build.${backend}_${fptype}_inl0_hrd0\/} nblk=$(getgridmax | cut -d ' ' -f1) nthr=$(getgridmax | cut -d ' ' -f2) (( nevt = nblk*nthr )) elif [ "${cmd/gcheck}" != "$cmd" ]; then txt="GCHECK($NLOOP)" + cmd=${cmd/gcheck/check_${backend}} cmd=${cmd/.\//.\/build.${backend}_${fptype}_inl0_hrd0\/} nthr=32 (( nblk = NLOOP/nthr )) || true # integer division (NB: bash double parenthesis fails if the result is 0) @@ -351,6 +352,7 @@ function runcheck() nevt=$(getnevt) elif [ "${cmd/check}" != "$cmd" ]; then txt="CHECK($NLOOP)" + cmd=${cmd/check/check_cpp} cmd=${cmd/.\//.\/build.${backend}_${fptype}_inl0_hrd0\/} nthr=32 (( nblk = NLOOP/nthr )) || true # integer division (NB: bash double parenthesis fails if the result is 0) diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index 4bcfaca743..8f9275e4c9 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -1,175 +1,218 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
-make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-05-15_15:53:31 +DATE: 2024-05-16_14:32:11 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.335614e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.106638e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.335347e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371632e-02 +- 3.269165e-06 ) GeV^0 -TOTAL : 0.505437 sec +EvtsPerSec[Rmb+ME] (23) = ( 7.832141e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.963737e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.194654e+08 ) sec^-1 
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.663591 sec INFO: No Floating Point Exceptions have been reported - 1,351,686,749 cycles:u # 2.532 GHz (73.05%) - 2,234,086 stalled-cycles-frontend:u # 0.17% frontend cycles idle (73.19%) - 5,889,218 stalled-cycles-backend:u # 0.44% backend cycles idle (74.89%) - 2,060,192,671 instructions:u # 1.52 insn per cycle - # 0.00 stalled cycles per insn (75.94%) - 0.578165423 seconds time elapsed + 2,552,888,166 cycles # 2.852 GHz + 3,988,214,096 instructions # 1.56 insn per cycle + 0.955666761 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 -Avg ME (F77/GPU) = 1.2828039868165208E-002 -Relative difference = 1.0277079981222336e-08 +Avg ME (F77/GPU) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = 
SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.240241e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.415912e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.415912e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.805075 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.053499e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.235255e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.235255e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 6.389071 sec INFO: No Floating Point Exceptions have been reported - 19,600,879,999 cycles:u # 3.372 GHz (74.93%) - 50,975,682 stalled-cycles-frontend:u # 0.26% frontend cycles idle (75.02%) - 61,790,125 stalled-cycles-backend:u # 0.32% backend cycles idle (75.01%) - 46,990,521,493 instructions:u # 2.40 insn per cycle - # 0.00 stalled cycles per insn (74.96%) - 5.820541450 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) + 18,286,976,651 cycles # 2.860 GHz + 43,966,894,713 instructions # 2.40 insn per cycle + 6.394304827 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.927481e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.425900e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 
2.425900e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.992293 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.575090e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.062276e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.062276e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.417510 sec INFO: No Floating Point Exceptions have been reported - 13,244,208,331 cycles:u # 3.309 GHz (75.01%) - 51,613,494 stalled-cycles-frontend:u # 0.39% frontend cycles idle (75.02%) - 1,582,031,183 stalled-cycles-backend:u # 11.95% backend cycles idle (75.02%) - 31,158,458,120 instructions:u # 2.35 insn per cycle - # 0.05 stalled cycles per insn (75.02%) - 4.006583677 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) + 12,744,894,916 cycles # 2.883 GHz + 31,001,019,523 instructions # 2.43 insn per cycle + 4.422588286 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.643596e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.529991e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 
3.529991e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.106384 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.946862e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.717949e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.717949e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.650052 sec +INFO: No Floating Point Exceptions have been reported + 10,049,299,034 cycles # 2.750 GHz + 19,366,983,583 instructions # 1.93 insn per cycle + 3.655131055 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] 
('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.022084e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.853820e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.853820e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.526182 sec INFO: No Floating Point Exceptions have been reported - 10,162,252,294 cycles:u # 3.260 GHz (74.93%) - 48,687,080 stalled-cycles-frontend:u # 0.48% frontend cycles idle (75.06%) - 385,968,837 stalled-cycles-backend:u # 3.80% backend cycles idle (75.11%) - 19,353,357,901 instructions:u # 1.90 insn per cycle - # 0.02 stalled cycles per insn (75.11%) - 3.121076222 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) + 9,727,313,452 cycles # 2.755 GHz + 18,976,774,064 instructions # 1.95 insn per cycle + 3.531366474 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.695166e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.233065e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.233065e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.130967 sec +INFO: No Floating Point Exceptions have been reported + 8,580,931,991 cycles # 2.075 GHz + 15,727,945,386 instructions # 1.83 insn per cycle + 4.136130895 seconds time elapsed 
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt index bbb46d465f..556a164c58 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt @@ -1,182 +1,231 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' 
-HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-05-15_16:23:42 +DATE: 2024-05-16_14:59:51 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.504140e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.315586e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.315586e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.526875 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.482485e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.592798e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.592798e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 2.321030 sec INFO: No Floating Point Exceptions have been reported - 18,387,040,799 cycles:u # 3.306 GHz (74.99%) - 219,803,785 stalled-cycles-frontend:u # 1.20% frontend cycles idle (75.01%) - 6,863,615,759 stalled-cycles-backend:u # 37.33% backend cycles idle (74.99%) - 17,261,403,497 instructions:u # 0.94 insn per cycle - # 0.40 stalled cycles per insn (74.97%) - 5.586710375 seconds time elapsed + 7,329,722,152 cycles # 2.840 GHz + 13,178,162,400 instructions # 1.80 insn per cycle + 2.637544426 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 -Avg ME (F77/GPU) = 1.2828039868165208E-002 -Relative difference = 1.0277079981222336e-08 +Avg ME (F77/GPU) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.226101e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.395316e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.395316e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.960017 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.015910e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.185158e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.185158e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 6.819356 sec INFO: No Floating Point Exceptions have been reported - 19,972,737,786 cycles:u # 3.340 GHz (74.99%) - 53,101,436 stalled-cycles-frontend:u # 0.27% frontend cycles idle (74.99%) - 110,369,686 stalled-cycles-backend:u # 0.55% backend cycles idle (74.99%) - 47,264,809,914 instructions:u # 2.37 insn per cycle - # 0.00 stalled cycles per insn (74.99%) - 5.984037268 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) + 19,469,024,508 cycles # 2.853 GHz + 44,194,459,972 instructions # 2.27 insn per cycle + 6.826135735 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.873295e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.344627e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.344627e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.220034 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.484528e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.911785e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.911785e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.870031 sec INFO: No Floating Point Exceptions have been reported - 13,886,084,795 cycles:u # 3.276 GHz (74.90%) - 54,329,570 stalled-cycles-frontend:u # 0.39% frontend cycles idle (74.94%) - 1,624,045,452 stalled-cycles-backend:u # 11.70% backend cycles idle (75.03%) - 31,905,363,866 instructions:u # 2.30 insn per cycle - # 0.05 stalled cycles per insn (75.09%) - 4.243404225 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) + 13,928,289,994 cycles # 2.857 GHz + 31,840,505,402 instructions # 2.29 insn per cycle + 4.876819018 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.545906e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.342715e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.342715e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.341813 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.770728e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.386706e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.386706e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.194561 sec INFO: No Floating Point Exceptions have been reported - 10,768,607,979 cycles:u # 3.204 GHz (75.03%) - 51,098,851 stalled-cycles-frontend:u # 0.47% frontend cycles idle (75.01%) - 386,779,720 stalled-cycles-backend:u # 3.59% backend cycles idle (74.90%) - 20,765,206,333 instructions:u # 1.93 insn per cycle - # 0.02 stalled cycles per insn (74.90%) - 3.364940030 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) + 11,361,909,372 cycles # 2.705 GHz + 20,728,193,515 instructions # 1.82 insn per cycle + 4.201564491 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +WARNING! 
Instantiate host Bridge (nevt=524288) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.837895e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.511052e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.511052e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.060412 sec +INFO: No Floating Point Exceptions have been reported + 10,967,372,142 cycles # 2.697 GHz + 20,348,024,135 instructions # 1.86 insn per cycle + 4.067336299 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +WARNING! 
Instantiate host Bridge (nevt=524288) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.574771e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.036722e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.036722e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.642745 sec +INFO: No Floating Point Exceptions have been reported + 9,876,735,144 cycles # 2.125 GHz + 16,873,564,045 instructions # 1.71 insn per cycle + 4.649693422 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt index 079279b6ae..752636bf13 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt @@ -1,175 +1,218 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
-make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-05-15_16:30:49 +DATE: 2024-05-16_15:10:56 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 --common OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.261896e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.101320e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.330734e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.514734e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.592124e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.118434e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.650614 sec 
+TOTAL : 1.385277 sec INFO: No Floating Point Exceptions have been reported - 15,390,822,100 cycles:u # 3.291 GHz (74.96%) - 154,767,236 stalled-cycles-frontend:u # 1.01% frontend cycles idle (74.96%) - 6,896,164,043 stalled-cycles-backend:u # 44.81% backend cycles idle (75.04%) - 11,598,353,813 instructions:u # 0.75 insn per cycle - # 0.59 stalled cycles per insn (75.02%) - 4.701228878 seconds time elapsed + 4,585,820,337 cycles # 2.836 GHz + 7,177,605,134 instructions # 1.57 insn per cycle + 1.675534023 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 -Avg ME (F77/GPU) = 1.2828039868165208E-002 -Relative difference = 1.0277079981222336e-08 +Avg ME (F77/GPU) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads 
/ `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.241924e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.415026e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.415026e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.053610e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.236408e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.236408e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.784111 sec +TOTAL : 6.764205 sec INFO: No Floating Point Exceptions have been reported - 19,582,424,359 cycles:u # 3.380 GHz (75.01%) - 52,626,328 stalled-cycles-frontend:u # 0.27% frontend cycles idle (75.01%) - 62,580,532 stalled-cycles-backend:u # 0.32% backend cycles idle (75.01%) - 47,002,468,034 instructions:u # 2.40 insn per cycle - # 0.00 stalled cycles per insn (75.01%) - 5.794589010 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) + 19,353,708,257 cycles # 2.859 GHz + 44,070,957,602 instructions # 2.28 insn per cycle + 6.769682162 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.924408e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.423249e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.423249e+06 ) sec^-1 +OMP threads / `nproc --all` = 
1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.538542e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.015186e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.015186e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.991413 sec +TOTAL : 4.882143 sec INFO: No Floating Point Exceptions have been reported - 13,266,186,109 cycles:u # 3.315 GHz (75.01%) - 52,004,169 stalled-cycles-frontend:u # 0.39% frontend cycles idle (75.01%) - 1,576,241,201 stalled-cycles-backend:u # 11.88% backend cycles idle (75.01%) - 31,161,679,732 instructions:u # 2.35 insn per cycle - # 0.05 stalled cycles per insn (75.01%) - 4.003012510 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) + 13,858,733,837 cycles # 2.836 GHz + 31,001,638,282 instructions # 2.24 insn per cycle + 4.887574523 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.657137e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.541237e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.541237e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 
/ 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.916460e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.668694e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.668694e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 4.073472 sec +INFO: No Floating Point Exceptions have been reported + 11,156,313,503 cycles # 2.736 GHz + 19,267,334,271 instructions # 1.73 insn per cycle + 4.078862770 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.001009e+06 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 2.815270e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.815270e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.092372 sec +TOTAL : 3.939832 sec INFO: No Floating Point Exceptions have been reported - 10,117,398,066 cycles:u # 3.262 GHz (74.99%) - 49,699,237 stalled-cycles-frontend:u # 0.49% frontend cycles idle (74.98%) - 408,670,593 stalled-cycles-backend:u # 4.04% backend cycles idle (74.98%) - 19,310,695,694 instructions:u # 1.91 insn per cycle - # 0.02 stalled cycles per insn (74.98%) - 3.103403972 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) + 10,815,664,939 cycles # 2.742 GHz + 18,691,798,772 instructions # 1.73 insn per cycle + 3.945208768 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.659550e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.188571e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.188571e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 4.600299 sec +INFO: No Floating Point Exceptions have been reported + 9,711,022,403 cycles # 2.112 GHz + 15,432,876,214 instructions # 1.59 insn per cycle + 4.605929662 seconds time 
elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt index 6d168a519f..55f8e65d60 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt @@ -40,175 +40,175 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-05-15_09:26:00 +DATE: 2024-05-16_15:08:14 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 --curhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.533170e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.604050e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.151992e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.528786e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.598045e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.123971e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 1.007981 sec +TOTAL : 1.009724 sec INFO: No Floating Point Exceptions have been reported - 3,512,519,696 cycles # 2.832 GHz - 7,067,616,019 instructions # 2.01 insn per cycle - 1.297359256 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst + 3,524,520,771 cycles # 2.832 GHz + 6,986,852,382 instructions # 1.98 insn per cycle + 1.303213412 seconds time elapsed +runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 Avg ME (F77/GPU) = 1.2828039868165201E-002 Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +Not found: 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.055935e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.238269e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.238269e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.055666e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.238246e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.238246e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.378819 sec +TOTAL : 6.381413 sec INFO: No Floating Point Exceptions have been reported - 18,256,936,501 cycles # 2.860 GHz - 43,967,132,932 instructions # 2.41 insn per cycle - 6.384352391 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) + 18,267,573,257 cycles # 2.861 GHz + 43,966,026,516 instructions # 2.41 insn per cycle + 6.386884750 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe [ 
PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.561076e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.041870e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.041870e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.556382e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.037173e+06 
) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.037173e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.456789 sec +TOTAL : 4.468246 sec INFO: No Floating Point Exceptions have been reported - 12,749,121,543 cycles # 2.858 GHz - 30,998,815,432 instructions # 2.43 insn per cycle - 4.462190413 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) + 12,780,247,489 cycles # 2.858 GHz + 30,998,946,765 instructions # 2.43 insn per cycle + 4.473714210 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.929635e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.687321e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.687321e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.920501e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.673029e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.673029e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.680933 sec +TOTAL : 3.698478 sec INFO: No Floating Point Exceptions have been reported - 10,043,859,163 cycles # 2.725 GHz - 19,365,869,375 instructions # 1.93 insn per cycle - 3.686641064 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) + 10,077,708,073 cycles # 2.723 GHz + 19,366,955,499 instructions # 1.92 insn per cycle + 3.704491612 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.003881e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.819685e+06 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 2.819685e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.006263e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.826924e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.826924e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.556980 sec +TOTAL : 3.554344 sec INFO: No Floating Point Exceptions have been reported - 9,707,455,839 cycles # 2.726 GHz - 18,987,230,579 instructions # 1.96 insn per cycle - 3.562346845 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) + 9,708,097,650 cycles # 2.728 GHz + 18,987,540,468 instructions # 1.96 insn per cycle + 3.559725957 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.666494e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.194230e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.194230e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.673906e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.201263e+06 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 2.201263e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.197918 sec +TOTAL : 4.180852 sec INFO: No Floating Point Exceptions have been reported - 8,600,797,083 cycles # 2.047 GHz - 15,729,794,856 instructions # 1.83 insn per cycle - 4.203466995 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) + 8,572,399,884 cycles # 2.048 GHz + 15,727,509,673 instructions # 1.83 insn per cycle + 4.186463799 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt index 2e6e4bd7b0..8320028620 100644 --- 
a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt @@ -1,176 +1,220 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-05-15_16:28:39 +DATE: 2024-05-16_15:05:29 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 --rmbhst OMP= +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.475890e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.078088e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.308113e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.374542 sec +EvtsPerSec[Rmb+ME] (23) = ( 5.845001e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.545626e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.012755e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 1.945484 sec INFO: No Floating Point Exceptions have been reported - 17,912,998,427 cycles:u # 3.319 GHz (74.96%) - 220,020,020 stalled-cycles-frontend:u # 1.23% frontend cycles idle (74.94%) - 6,834,525,299 stalled-cycles-backend:u # 38.15% backend cycles idle (74.97%) - 16,837,830,655 instructions:u # 0.94 insn per cycle - # 0.41 stalled cycles per insn (75.01%) - 5.425169821 seconds time elapsed + 6,182,952,030 cycles # 2.840 GHz + 11,472,065,600 instructions # 1.86 insn per cycle + 2.234065267 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 -Avg ME (F77/GPU) = 1.2828039868165208E-002 -Relative difference = 1.0277079981222336e-08 +Avg ME (F77/GPU) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.240698e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.414463e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.414463e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.790648 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.053191e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.234940e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.234940e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 6.394281 sec INFO: No Floating Point Exceptions have been reported - 19,615,250,530 cycles:u # 3.382 GHz (74.95%) - 53,090,533 stalled-cycles-frontend:u # 0.27% frontend cycles idle (75.02%) - 60,472,852 stalled-cycles-backend:u # 0.31% backend cycles idle (75.03%) - 46,945,742,677 instructions:u # 2.39 insn per cycle - # 0.00 stalled cycles per insn (75.03%) - 5.801228335 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) + 18,294,474,856 cycles # 2.859 GHz + 43,971,000,114 instructions # 2.40 insn per cycle + 6.399562206 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe 
/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP 
precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.932274e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.434949e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.434949e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.976944 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.557791e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.040989e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.040989e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.463787 sec INFO: No Floating Point Exceptions have been reported - 13,228,523,302 cycles:u # 3.319 GHz (74.91%) - 52,456,504 stalled-cycles-frontend:u # 0.40% frontend cycles idle (74.92%) - 1,573,426,456 stalled-cycles-backend:u # 11.89% backend cycles idle (74.92%) - 31,201,132,149 instructions:u # 2.36 insn per cycle - # 0.05 stalled cycles per insn (74.97%) - 3.987491718 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) + 12,772,778,524 cycles # 2.859 GHz + 30,998,712,334 instructions # 2.43 insn per cycle + 4.469196075 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.649310e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.519301e+06 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 3.519301e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.107454 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.925684e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.680642e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.680642e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.688587 sec +INFO: No Floating Point Exceptions have been reported + 10,072,112,495 cycles # 2.727 GHz + 19,365,616,714 instructions # 1.92 insn per cycle + 3.694022814 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) 
+Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.999182e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.825411e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.825411e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.565445 sec INFO: No Floating Point Exceptions have been reported - 10,115,652,915 cycles:u # 3.247 GHz (74.88%) - 50,306,144 stalled-cycles-frontend:u # 0.50% frontend cycles idle (74.90%) - 372,482,324 stalled-cycles-backend:u # 3.68% backend cycles idle (74.92%) - 19,309,824,678 instructions:u # 1.91 insn per cycle - # 0.02 stalled cycles per insn (75.03%) - 3.120959363 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) + 9,737,532,909 cycles # 2.728 GHz + 18,976,607,709 instructions # 1.95 insn per cycle + 3.570830090 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.671566e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.197159e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.197159e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.185653 sec +INFO: No Floating Point Exceptions have been reported + 8,570,262,444 cycles # 2.045 GHz + 15,727,819,138 instructions # 1.84 insn per cycle + 4.191109662 seconds time 
elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt index 6a5d94131d..6aa3de3ecf 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt @@ -1,175 +1,218 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand 
+HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-05-15_15:53:50 +DATE: 2024-05-16_14:32:42 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 12 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.893824e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.586353e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.907783e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371632e-02 +- 3.269165e-06 ) GeV^0 -TOTAL : 0.481272 sec +EvtsPerSec[Rmb+ME] (23) = ( 7.832072e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.951586e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.177922e+08 ) sec^-1 
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.666506 sec INFO: No Floating Point Exceptions have been reported - 1,265,529,795 cycles:u # 2.501 GHz (74.81%) - 2,351,281 stalled-cycles-frontend:u # 0.19% frontend cycles idle (74.92%) - 5,913,872 stalled-cycles-backend:u # 0.47% backend cycles idle (74.84%) - 2,092,479,663 instructions:u # 1.65 insn per cycle - # 0.00 stalled cycles per insn (74.88%) - 0.532629891 seconds time elapsed + 2,570,661,306 cycles # 2.841 GHz + 3,994,547,928 instructions # 1.55 insn per cycle + 0.967516454 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 -Avg ME (F77/GPU) = 1.2828039868165216E-002 -Relative difference = 1.0277079305077159e-08 +Avg ME (F77/GPU) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = 
SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.323490e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.523841e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.523841e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.478000 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.115974e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.322257e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.322257e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 6.054676 sec INFO: No Floating Point Exceptions have been reported - 18,489,189,686 cycles:u # 3.368 GHz (74.94%) - 53,032,744 stalled-cycles-frontend:u # 0.29% frontend cycles idle (74.94%) - 829,160,138 stalled-cycles-backend:u # 4.48% backend cycles idle (74.94%) - 44,803,064,031 instructions:u # 2.42 insn per cycle - # 0.02 stalled cycles per insn (75.01%) - 5.492850186 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 486) (avx2: 0) (512y: 0) (512z: 0) + 17,515,565,744 cycles # 2.891 GHz + 41,813,477,100 instructions # 2.39 insn per cycle + 6.059803806 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 392) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.979258e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.524530e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 
2.524530e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.908576 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.620797e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.138518e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.138518e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.301844 sec INFO: No Floating Point Exceptions have been reported - 13,000,447,782 cycles:u # 3.317 GHz (74.92%) - 55,760,917 stalled-cycles-frontend:u # 0.43% frontend cycles idle (74.90%) - 1,631,053,608 stalled-cycles-backend:u # 12.55% backend cycles idle (74.92%) - 30,005,481,698 instructions:u # 2.31 insn per cycle - # 0.05 stalled cycles per insn (75.02%) - 3.934209277 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1569) (avx2: 0) (512y: 0) (512z: 0) + 12,450,766,554 cycles # 2.891 GHz + 30,161,114,565 instructions # 2.42 insn per cycle + 4.307292943 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1612) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.605362e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.438599e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 
3.438599e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.137948 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.953305e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.731201e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.731201e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.641578 sec +INFO: No Floating Point Exceptions have been reported + 9,958,194,708 cycles # 2.732 GHz + 19,097,340,022 instructions # 1.92 insn per cycle + 3.646748223 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1931) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': 
AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.030144e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.871070e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.871070e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.516336 sec INFO: No Floating Point Exceptions have been reported - 10,337,671,349 cycles:u # 3.283 GHz (74.86%) - 53,779,157 stalled-cycles-frontend:u # 0.52% frontend cycles idle (74.89%) - 332,090,936 stalled-cycles-backend:u # 3.21% backend cycles idle (75.02%) - 18,952,579,151 instructions:u # 1.83 insn per cycle - # 0.02 stalled cycles per insn (75.10%) - 3.152056886 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1902) (512y: 0) (512z: 0) + 9,615,329,857 cycles # 2.731 GHz + 18,757,197,169 instructions # 1.95 insn per cycle + 3.521486960 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1662) (512y: 178) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.716452e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.276447e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.276447e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.085042 sec +INFO: No Floating Point Exceptions have been reported + 8,419,965,935 cycles # 2.059 GHz + 15,604,092,420 instructions # 1.85 insn per cycle + 4.090208290 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 887) (512y: 156) (512z: 1239) 
------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt index 77b1a6a7a9..b8b45776b1 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt @@ -1,175 +1,218 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for 
tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-05-15_16:11:59 +DATE: 2024-05-16_14:50:17 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 12 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.335138e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.114607e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.343263e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371632e-02 +- 3.269165e-06 ) GeV^0 -TOTAL : 0.492323 sec +EvtsPerSec[Rmb+ME] (23) = ( 6.702175e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.710921e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.156854e+08 ) sec^-1 
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.694390 sec INFO: No Floating Point Exceptions have been reported - 1,312,289,309 cycles:u # 2.539 GHz (74.68%) - 2,303,283 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.68%) - 5,720,150 stalled-cycles-backend:u # 0.44% backend cycles idle (75.59%) - 2,095,079,145 instructions:u # 1.60 insn per cycle - # 0.00 stalled cycles per insn (74.41%) - 0.543465242 seconds time elapsed + 2,605,646,002 cycles # 2.805 GHz + 4,054,878,448 instructions # 1.56 insn per cycle + 0.990293563 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 -Avg ME (F77/GPU) = 1.2828039868165208E-002 -Relative difference = 1.0277079981222336e-08 +Avg ME (F77/GPU) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = 
SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.790759e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.180659e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.180659e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.236145 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.570616e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.013848e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.013848e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.427060 sec INFO: No Floating Point Exceptions have been reported - 14,126,513,604 cycles:u # 3.327 GHz (74.95%) - 52,243,783 stalled-cycles-frontend:u # 0.37% frontend cycles idle (74.94%) - 2,149,229,322 stalled-cycles-backend:u # 15.21% backend cycles idle (74.95%) - 36,817,782,552 instructions:u # 2.61 insn per cycle - # 0.06 stalled cycles per insn (74.95%) - 4.249944382 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 707) (avx2: 0) (512y: 0) (512z: 0) + 12,653,335,495 cycles # 2.855 GHz + 32,508,582,789 instructions # 2.57 insn per cycle + 4.432506316 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 296) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.407363e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.241561e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 
3.241561e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.333377 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.997942e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.867798e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.867798e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.572733 sec INFO: No Floating Point Exceptions have been reported - 10,975,506,316 cycles:u # 3.283 GHz (74.89%) - 52,565,684 stalled-cycles-frontend:u # 0.48% frontend cycles idle (74.92%) - 978,456,899 stalled-cycles-backend:u # 8.91% backend cycles idle (75.04%) - 24,693,614,092 instructions:u # 2.25 insn per cycle - # 0.04 stalled cycles per insn (75.12%) - 3.347285273 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2334) (avx2: 0) (512y: 0) (512z: 0) + 10,224,022,422 cycles # 2.858 GHz + 24,474,305,392 instructions # 2.39 insn per cycle + 3.578147466 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1251) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 3.027607e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.213793e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 
4.213793e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.800855 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.166030e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.174429e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.174429e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.325864 sec +INFO: No Floating Point Exceptions have been reported + 9,098,194,590 cycles # 2.732 GHz + 16,922,780,551 instructions # 1.86 insn per cycle + 3.331363940 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1631) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': 
AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.212975e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.265291e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.265291e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.263243 sec INFO: No Floating Point Exceptions have been reported - 9,080,636,256 cycles:u # 3.230 GHz (74.98%) - 49,418,393 stalled-cycles-frontend:u # 0.54% frontend cycles idle (74.96%) - 106,768,871 stalled-cycles-backend:u # 1.18% backend cycles idle (74.96%) - 16,891,831,755 instructions:u # 1.86 insn per cycle - # 0.01 stalled cycles per insn (74.97%) - 2.815004548 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1604) (512y: 0) (512z: 0) + 8,899,581,855 cycles # 2.723 GHz + 16,332,700,862 instructions # 1.84 insn per cycle + 3.268811314 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1370) (512y: 139) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.881513e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.573087e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.573087e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.766130 sec +INFO: No Floating Point Exceptions have been reported + 7,870,004,063 cycles # 2.087 GHz + 14,582,523,760 instructions # 1.85 insn per cycle + 3.771527980 seconds time elapsed 
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1015) (512y: 158) (512z: 955) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt index a7320ea899..36ca3a055a 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt @@ -1,175 +1,218 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand 
HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-05-15_16:12:14 +DATE: 2024-05-16_14:50:44 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 12 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.927470e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.583386e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.904218e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371632e-02 +- 3.269165e-06 ) GeV^0 -TOTAL : 0.493515 sec +EvtsPerSec[Rmb+ME] (23) = ( 6.703728e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.728666e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.213805e+08 ) sec^-1 
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.683308 sec INFO: No Floating Point Exceptions have been reported - 1,269,039,139 cycles:u # 2.526 GHz (74.60%) - 2,146,758 stalled-cycles-frontend:u # 0.17% frontend cycles idle (74.86%) - 5,981,286 stalled-cycles-backend:u # 0.47% backend cycles idle (74.86%) - 1,987,275,013 instructions:u # 1.57 insn per cycle - # 0.00 stalled cycles per insn (74.31%) - 0.544769433 seconds time elapsed + 2,600,218,993 cycles # 2.827 GHz + 4,020,842,023 instructions # 1.55 insn per cycle + 0.979103636 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 -Avg ME (F77/GPU) = 1.2828039868165216E-002 -Relative difference = 1.0277079305077159e-08 +Avg ME (F77/GPU) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = 
SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.441579e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.231307e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.231307e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.298601 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.089739e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.950650e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.950650e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.429960 sec INFO: No Floating Point Exceptions have been reported - 10,867,783,594 cycles:u # 3.284 GHz (74.87%) - 52,146,672 stalled-cycles-frontend:u # 0.48% frontend cycles idle (74.88%) - 50,308,407 stalled-cycles-backend:u # 0.46% backend cycles idle (75.00%) - 28,361,487,036 instructions:u # 2.61 insn per cycle - # 0.00 stalled cycles per insn (75.10%) - 3.312935168 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 600) (avx2: 0) (512y: 0) (512z: 0) + 9,811,818,087 cycles # 2.857 GHz + 25,388,363,151 instructions # 2.59 insn per cycle + 3.435408237 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 249) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.645291e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.690258e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 
3.690258e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.098793 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.308707e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.576242e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.576242e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.150608 sec INFO: No Floating Point Exceptions have been reported - 10,143,082,072 cycles:u # 3.263 GHz (75.04%) - 52,117,387 stalled-cycles-frontend:u # 0.51% frontend cycles idle (75.04%) - 57,943,710 stalled-cycles-backend:u # 0.57% backend cycles idle (75.04%) - 21,538,603,189 instructions:u # 2.12 insn per cycle - # 0.00 stalled cycles per insn (75.04%) - 3.112605117 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2117) (avx2: 0) (512y: 0) (512z: 0) + 9,017,040,298 cycles # 2.858 GHz + 21,483,572,468 instructions # 2.38 insn per cycle + 3.156151233 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1119) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 3.299536e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.779460e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 
4.779460e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.628612 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.317514e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.502934e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.502934e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.137476 sec +INFO: No Floating Point Exceptions have been reported + 8,591,075,632 cycles # 2.734 GHz + 15,811,134,800 instructions # 1.84 insn per cycle + 3.143116597 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1494) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': 
AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.364262e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.604788e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.604788e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.081738 sec INFO: No Floating Point Exceptions have been reported - 8,508,384,505 cycles:u # 3.225 GHz (74.85%) - 48,598,729 stalled-cycles-frontend:u # 0.57% frontend cycles idle (74.85%) - 67,466,002 stalled-cycles-backend:u # 0.79% backend cycles idle (74.95%) - 15,817,480,134 instructions:u # 1.86 insn per cycle - # 0.00 stalled cycles per insn (75.11%) - 2.642397740 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1497) (512y: 0) (512z: 0) + 8,442,051,612 cycles # 2.735 GHz + 15,504,513,991 instructions # 1.84 insn per cycle + 3.087247738 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1268) (512y: 139) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165084E-002 -Relative difference = 1.0277089582483854e-08 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.995819e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.803385e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.803385e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.575599 sec +INFO: No Floating Point Exceptions have been reported + 7,560,717,738 cycles # 2.112 GHz + 14,283,918,013 instructions # 1.89 insn per cycle + 3.581217674 seconds time elapsed 
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1041) (512y: 164) (512z: 874) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index 14881e21bf..e6a48e18ea 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -1,175 +1,218 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand 
HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-05-15_15:54:07 +DATE: 2024-05-16_14:33:12 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=1, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.883918e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.052029e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.769187e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.372027e-02 +- 3.270769e-06 ) GeV^0 -TOTAL : 0.374784 sec +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.602023e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.319974e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] 
(3a) = ( 2.288580e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 +TOTAL : 0.573413 sec INFO: No Floating Point Exceptions have been reported - 960,879,210 cycles:u # 2.421 GHz (72.75%) - 2,175,480 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.37%) - 5,482,826 stalled-cycles-backend:u # 0.57% backend cycles idle (75.75%) - 1,732,976,834 instructions:u # 1.80 insn per cycle - # 0.00 stalled cycles per insn (75.91%) - 0.424663664 seconds time elapsed + 2,248,961,780 cycles # 2.823 GHz + 3,510,545,687 instructions # 1.56 insn per cycle + 0.854969378 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 -Avg ME (F77/GPU) = 1.2828036033170065E-002 -Relative difference = 1.2498553996774023e-06 +Avg ME (F77/GPU) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = 
SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.423076e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.652500e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.652500e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 5.093272 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.082006e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.283429e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.283429e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 6.191643 sec INFO: No Floating Point Exceptions have been reported - 17,225,516,490 cycles:u # 3.378 GHz (74.92%) - 40,407,321 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.93%) - 34,038,801 stalled-cycles-backend:u # 0.20% backend cycles idle (75.00%) - 47,163,828,021 instructions:u # 2.74 insn per cycle - # 0.00 stalled cycles per insn (75.07%) - 5.104691057 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) + 17,740,084,916 cycles # 2.864 GHz + 43,510,870,904 instructions # 2.45 insn per cycle + 6.196630799 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039569285465E-002 -Relative difference = 3.357602059382168e-08 +Avg ME (F77/C++) = 1.2828039854866802E-002 +Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.926509e+06 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 4.144859e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.144859e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.819466 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.223628e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.400190e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.400190e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 3.211334 sec INFO: No Floating Point Exceptions have been reported - 9,257,870,999 cycles:u # 3.275 GHz (74.83%) - 41,348,671 stalled-cycles-frontend:u # 0.45% frontend cycles idle (74.94%) - 979,441,310 stalled-cycles-backend:u # 10.58% backend cycles idle (75.08%) - 22,116,917,006 instructions:u # 2.39 insn per cycle - # 0.04 stalled cycles per insn (75.10%) - 2.833251666 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) + 9,244,093,548 cycles # 2.875 GHz + 21,907,620,538 instructions # 2.37 insn per cycle + 3.216400901 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039385567536E-002 -Relative difference = 4.7897610623017996e-08 +Avg ME (F77/C++) = 1.2828039280066150E-002 +Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 3.427502e+06 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 5.028137e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.028137e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.505394 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.419643e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.716778e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.716778e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.967957 sec INFO: No Floating Point Exceptions have been reported - 8,135,282,485 cycles:u # 3.238 GHz (74.88%) - 42,554,369 stalled-cycles-frontend:u # 0.52% frontend cycles idle (74.70%) - 1,560,275,251 stalled-cycles-backend:u # 19.18% backend cycles idle (74.83%) - 15,520,980,931 instructions:u # 1.91 insn per cycle - # 0.10 stalled cycles per insn (75.14%) - 2.516660422 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) + 8,316,472,651 cycles # 2.798 GHz + 15,592,546,873 instructions # 1.87 insn per cycle + 2.972957823 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053369958070E-002 -Relative difference = 2.627022867500074e-07 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.415944e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.716005e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.716005e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.973842 sec +INFO: No Floating Point Exceptions have been reported + 8,315,082,395 cycles # 2.793 GHz + 15,436,266,122 instructions # 1.86 insn per cycle + 2.978955673 seconds time elapsed 
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.416813e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.677411e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
3.677411e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 2.974949 sec +INFO: No Floating Point Exceptions have been reported + 6,610,937,423 cycles # 2.219 GHz + 12,863,752,208 instructions # 1.95 insn per cycle + 2.980091060 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828052585973637E-002 +Relative difference = 2.0158743040564767e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt index 9830bd7b4e..29c604a610 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt @@ -1,182 +1,231 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building 
in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-05-15_16:24:05 +DATE: 2024-05-16_15:00:26 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.599503e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.306620e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.306620e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371886e-02 +- 3.270260e-06 ) GeV^0 -TOTAL : 5.345325 sec +EvtsPerSec[Rmb+ME] (23) = ( 6.941162e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.420865e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.420865e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 +TOTAL : 1.745821 sec INFO: No Floating Point Exceptions have been reported - 17,776,352,111 cycles:u # 3.308 GHz (74.99%) - 119,250,397 stalled-cycles-frontend:u # 0.67% frontend cycles idle (75.00%) - 6,849,788,457 stalled-cycles-backend:u # 38.53% backend cycles idle (74.99%) - 17,105,198,140 instructions:u # 0.96 insn per cycle - # 0.40 stalled cycles per insn (74.99%) - 5.402378584 seconds time elapsed + 5,608,708,868 cycles # 2.838 GHz + 10,190,752,473 instructions # 1.82 insn per cycle + 2.033892232 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 -Avg ME (F77/GPU) = 1.2828036033170065E-002 -Relative difference = 1.2498553996774023e-06 +Avg ME (F77/GPU) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.412731e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.638373e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.638373e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 5.185885 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.055357e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.251163e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.251163e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 6.442554 sec INFO: No Floating Point Exceptions have been reported - 17,446,020,529 cycles:u # 3.356 GHz (74.97%) - 39,837,031 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.93%) - 78,482,274 stalled-cycles-backend:u # 0.45% backend cycles idle (74.92%) - 47,339,513,994 instructions:u # 2.71 insn per cycle - # 0.00 stalled cycles per insn (74.97%) - 5.202106716 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) + 18,431,262,224 cycles # 2.859 GHz + 43,659,496,470 instructions # 2.37 insn per cycle + 6.448903506 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039569285465E-002 -Relative difference = 3.357602059382168e-08 +Avg ME (F77/C++) = 1.2828039854866802E-002 +Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.868967e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.038264e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.038264e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.950420 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.110619e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.160777e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.160777e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 3.492633 sec INFO: No Floating Point Exceptions have been reported - 9,628,487,073 cycles:u # 3.250 GHz (74.72%) - 42,300,894 stalled-cycles-frontend:u # 0.44% frontend cycles idle (74.77%) - 988,623,281 stalled-cycles-backend:u # 10.27% backend cycles idle (74.96%) - 23,280,735,842 instructions:u # 2.42 insn per cycle - # 0.04 stalled cycles per insn (75.10%) - 2.966852201 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) + 9,993,500,583 cycles # 2.857 GHz + 23,243,476,984 instructions # 2.33 insn per cycle + 3.498991107 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039385567536E-002 -Relative difference = 4.7897610623017996e-08 +Avg ME (F77/C++) = 1.2828039280066150E-002 +Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 3.333903e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.830919e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.830919e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.635881 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.275264e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.392372e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.392372e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 3.272099 sec INFO: No Floating Point Exceptions have been reported - 8,486,150,079 cycles:u # 3.204 GHz (74.99%) - 42,761,185 stalled-cycles-frontend:u # 0.50% frontend cycles idle (74.94%) - 1,569,530,324 stalled-cycles-backend:u # 18.50% backend cycles idle (74.79%) - 16,678,247,062 instructions:u # 1.97 insn per cycle - # 0.09 stalled cycles per insn (74.79%) - 2.652887678 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) + 9,005,707,266 cycles # 2.748 GHz + 16,711,349,389 instructions # 1.86 insn per cycle + 3.278668519 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053369958070E-002 -Relative difference = 2.627022867500074e-07 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +WARNING! 
Instantiate host Bridge (nevt=524288) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.297831e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.443567e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.443567e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 3.244587 sec +INFO: No Floating Point Exceptions have been reported + 8,928,752,660 cycles # 2.747 GHz + 16,549,135,089 instructions # 1.85 insn per cycle + 3.250993607 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +WARNING! 
Instantiate host Bridge (nevt=524288) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.241559e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.306249e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.306249e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 3.324488 sec +INFO: No Floating Point Exceptions have been reported + 7,378,511,382 cycles # 2.216 GHz + 14,071,008,703 instructions # 1.91 insn per cycle + 3.330848983 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828052585973637E-002 +Relative difference = 2.0158743040564767e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt index b6c599a5f7..8016aaf3c8 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt @@ -1,175 +1,218 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
-make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-05-15_16:31:12 +DATE: 2024-05-16_15:11:30 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 --common OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.831561e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.040681e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.755363e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371895e-02 +- 3.272985e-06 ) GeV^0 -TOTAL : 4.535946 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.314779e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.179276e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.254245e+09 ) sec^-1 
+MeanMatrixElemValue = ( 1.371863e-02 +- 3.269951e-06 ) GeV^0 +TOTAL : 1.213916 sec INFO: No Floating Point Exceptions have been reported - 15,029,031,453 cycles:u # 3.296 GHz (74.97%) - 53,464,566 stalled-cycles-frontend:u # 0.36% frontend cycles idle (75.00%) - 6,828,705,816 stalled-cycles-backend:u # 45.44% backend cycles idle (74.98%) - 11,270,019,199 instructions:u # 0.75 insn per cycle - # 0.61 stalled cycles per insn (75.08%) - 4.585299662 seconds time elapsed + 4,089,914,869 cycles # 2.847 GHz + 6,594,462,327 instructions # 1.61 insn per cycle + 1.494122889 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common +==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 -Avg ME (F77/GPU) = 1.2828036033170065E-002 -Relative difference = 1.2498553996774023e-06 +Avg ME (F77/GPU) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads 
/ `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.422756e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.651467e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.651467e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.089350e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.292539e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.292539e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 5.093908 sec +TOTAL : 6.490729 sec INFO: No Floating Point Exceptions have been reported - 17,247,361,054 cycles:u # 3.382 GHz (74.83%) - 40,283,734 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.91%) - 37,286,054 stalled-cycles-backend:u # 0.22% backend cycles idle (75.06%) - 47,177,568,793 instructions:u # 2.74 insn per cycle - # 0.00 stalled cycles per insn (75.06%) - 5.101990445 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) + 18,740,706,935 cycles # 2.886 GHz + 43,689,321,367 instructions # 2.33 insn per cycle + 6.495941000 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039569285465E-002 -Relative difference = 3.357602059382168e-08 +Avg ME (F77/C++) = 1.2828039854866802E-002 +Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.917329e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.131879e+06 ) 
sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.131879e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.213519e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.407340e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.407340e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.823566 sec +TOTAL : 3.565494 sec INFO: No Floating Point Exceptions have been reported - 9,262,737,833 cycles:u # 3.273 GHz (74.85%) - 41,466,075 stalled-cycles-frontend:u # 0.45% frontend cycles idle (74.84%) - 987,134,435 stalled-cycles-backend:u # 10.66% backend cycles idle (74.95%) - 22,124,170,980 instructions:u # 2.39 insn per cycle - # 0.04 stalled cycles per insn (75.10%) - 2.831331995 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) + 10,288,737,724 cycles # 2.883 GHz + 21,988,558,280 instructions # 2.14 insn per cycle + 3.570732391 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039385567536E-002 -Relative difference = 4.7897610623017996e-08 +Avg ME (F77/C++) = 1.2828039280066150E-002 +Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 3.427477e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.023879e+06 ) 
sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.023879e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.392103e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.656361e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.656361e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 3.347045 sec +INFO: No Floating Point Exceptions have been reported + 9,294,224,919 cycles # 2.774 GHz + 15,502,535,760 instructions # 1.67 insn per cycle + 3.352354405 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] 
+OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.411853e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.706271e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.706271e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.500193 sec +TOTAL : 3.334245 sec INFO: No Floating Point Exceptions have been reported - 8,158,770,517 cycles:u # 3.255 GHz (74.80%) - 42,309,026 stalled-cycles-frontend:u # 0.52% frontend cycles idle (74.92%) - 1,542,393,291 stalled-cycles-backend:u # 18.90% backend cycles idle (75.06%) - 15,482,682,988 instructions:u # 1.90 insn per cycle - # 0.10 stalled cycles per insn (75.11%) - 2.508023648 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) + 9,229,877,586 cycles # 2.765 GHz + 15,144,508,612 instructions # 1.64 insn per cycle + 3.339505215 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053369958070E-002 -Relative difference = 2.627022867500074e-07 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.389642e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.623022e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.623022e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 3.358884 sec +INFO: No Floating Point Exceptions have been reported + 7,623,474,420 cycles # 2.266 GHz + 12,573,351,599 instructions # 1.65 insn per cycle + 3.364654068 seconds time 
elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828052585973637E-002 +Relative difference = 2.0158743040564767e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt index 1737a62fd3..3bd2ee01ac 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt @@ -40,175 +40,175 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-05-15_09:26:31 +DATE: 2024-05-16_15:08:45 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 --curhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.318359e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.192916e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.284003e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.323461e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.185145e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.269757e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.876266 sec +TOTAL : 0.877821 sec INFO: No Floating Point Exceptions have been reported - 3,110,258,101 cycles # 2.830 GHz - 6,386,329,379 instructions # 2.05 insn per cycle - 1.155797878 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst + 3,113,911,295 cycles # 2.829 GHz + 6,352,740,713 instructions # 2.04 insn per cycle + 1.157340966 seconds time elapsed +runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 Avg ME (F77/GPU) = 1.2828112125134794E-002 Relative difference = 7.1815552823662555e-06 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +Not found: 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.080174e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.280907e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.280907e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.080933e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.281840e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.281840e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.203279 sec +TOTAL : 6.198018 sec INFO: No Floating Point Exceptions have been reported - 17,751,642,679 cycles # 2.860 GHz - 43,511,106,202 instructions # 2.45 insn per cycle - 6.208474005 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) + 17,744,200,571 cycles # 2.861 GHz + 43,507,633,337 instructions # 2.45 insn per cycle + 6.203254296 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe [ 
PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039854866802E-002 Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.214412e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.383137e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.383137e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.206413e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.379725e+06 ) 
sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.379725e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.225748 sec +TOTAL : 3.236598 sec INFO: No Floating Point Exceptions have been reported - 9,232,921,103 cycles # 2.858 GHz - 21,907,138,182 instructions # 2.37 insn per cycle - 3.231059357 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) + 9,264,626,353 cycles # 2.859 GHz + 21,909,129,569 instructions # 2.36 insn per cycle + 3.241826343 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039280066150E-002 Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.380021e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.627341e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.627341e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.378945e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.622195e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.622195e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.019963 sec +TOTAL : 3.023073 sec INFO: No Floating Point Exceptions have been reported - 8,287,021,575 cycles # 2.740 GHz - 15,592,507,037 instructions # 1.88 insn per cycle - 3.025235012 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) + 8,285,944,372 cycles # 2.737 GHz + 15,591,046,995 instructions # 1.88 insn per cycle + 3.028351636 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.401224e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.676523e+06 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 3.676523e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.406944e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.689990e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.689990e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.998514 sec +TOTAL : 2.991499 sec INFO: No Floating Point Exceptions have been reported - 8,210,791,486 cycles # 2.734 GHz - 15,428,751,418 instructions # 1.88 insn per cycle - 3.003711640 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) + 8,215,974,247 cycles # 2.742 GHz + 15,434,394,808 instructions # 1.88 insn per cycle + 2.996736921 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.356959e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.566697e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.566697e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.365448e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.582442e+06 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.582442e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 3.049340 sec +TOTAL : 3.039216 sec INFO: No Floating Point Exceptions have been reported - 6,623,797,614 cycles # 2.170 GHz - 12,865,182,847 instructions # 1.94 insn per cycle - 3.054557683 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) + 6,609,003,865 cycles # 2.172 GHz + 12,863,939,056 instructions # 1.95 insn per cycle + 3.044504803 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828052585973637E-002 Relative difference = 2.0158743040564767e-07 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt index fe6c7848ed..ef6806658f 100644 --- 
a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt @@ -1,176 +1,220 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-05-15_16:29:01 +DATE: 2024-05-16_15:06:01 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 --rmbhst OMP= +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.340281e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.884809e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.568324e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371886e-02 +- 3.270260e-06 ) GeV^0 -TOTAL : 5.258637 sec +EvtsPerSec[Rmb+ME] (23) = ( 8.717298e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.145539e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.143407e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 +TOTAL : 1.533112 sec INFO: No Floating Point Exceptions have been reported - 17,563,321,942 cycles:u # 3.321 GHz (74.96%) - 119,585,531 stalled-cycles-frontend:u # 0.68% frontend cycles idle (75.04%) - 6,784,796,342 stalled-cycles-backend:u # 38.63% backend cycles idle (75.05%) - 16,735,211,123 instructions:u # 0.95 insn per cycle - # 0.41 stalled cycles per insn (75.05%) - 5.310494029 seconds time elapsed + 4,980,418,158 cycles # 2.839 GHz + 9,119,342,139 instructions # 1.83 insn per cycle + 1.812784805 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 -Avg ME (F77/GPU) = 1.2828036033170065E-002 -Relative difference = 1.2498553996774023e-06 +Avg ME (F77/GPU) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.426108e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.655329e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.655329e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 5.087357 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.069115e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.265540e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.265540e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 6.262605 sec INFO: No Floating Point Exceptions have been reported - 17,204,587,803 cycles:u # 3.377 GHz (74.91%) - 39,355,756 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.98%) - 31,548,116 stalled-cycles-backend:u # 0.18% backend cycles idle (75.03%) - 47,153,071,449 instructions:u # 2.74 insn per cycle - # 0.00 stalled cycles per insn (75.03%) - 5.096757669 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) + 17,921,464,120 cycles # 2.860 GHz + 43,508,155,770 instructions # 2.43 insn per cycle + 6.267871711 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe 
/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039569285465E-002 -Relative difference = 3.357602059382168e-08 +Avg ME (F77/C++) = 1.2828039854866802E-002 +Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] 
[inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.923910e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.140468e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.140468e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.823848 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.205993e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.384022e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.384022e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 3.237394 sec INFO: No Floating Point Exceptions have been reported - 9,247,785,083 cycles:u # 3.267 GHz (74.85%) - 40,833,202 stalled-cycles-frontend:u # 0.44% frontend cycles idle (74.86%) - 986,336,744 stalled-cycles-backend:u # 10.67% backend cycles idle (74.91%) - 22,151,028,754 instructions:u # 2.40 insn per cycle - # 0.04 stalled cycles per insn (75.04%) - 2.832758864 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) + 9,270,351,681 cycles # 2.860 GHz + 21,907,147,046 instructions # 2.36 insn per cycle + 3.242634203 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039385567536E-002 -Relative difference = 4.7897610623017996e-08 +Avg ME (F77/C++) = 1.2828039280066150E-002 +Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 
3.378898e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.959224e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.959224e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.539209 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.379256e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.625966e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.625966e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 3.019248 sec +INFO: No Floating Point Exceptions have been reported + 8,297,873,717 cycles # 2.744 GHz + 15,590,905,283 instructions # 1.88 insn per cycle + 3.024598202 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.398987e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.671948e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.671948e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 3.000824 sec INFO: No Floating Point Exceptions have been reported - 8,186,960,394 cycles:u # 3.224 GHz (74.83%) - 42,758,994 stalled-cycles-frontend:u # 0.52% frontend cycles idle (75.12%) - 1,565,874,225 stalled-cycles-backend:u # 19.13% backend cycles idle (74.99%) - 15,488,647,093 instructions:u # 1.89 insn per cycle - # 0.10 stalled cycles per insn (75.03%) - 2.548349202 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) + 8,223,361,227 cycles # 2.736 GHz + 15,434,590,756 instructions # 1.88 insn per cycle + 3.006023707 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053369958070E-002 -Relative difference = 2.627022867500074e-07 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.360241e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.570739e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.570739e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 3.046199 sec +INFO: No Floating Point Exceptions have been reported + 6,615,724,908 cycles # 2.169 GHz + 12,863,710,849 instructions # 1.94 insn per cycle + 3.051492012 seconds time 
elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828052585973637E-002 +Relative difference = 2.0158743040564767e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt index cba6a5b401..b613786442 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt @@ -1,175 +1,218 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand 
+HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-05-15_15:54:22 +DATE: 2024-05-16_14:33:39 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 12 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=1, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.908413e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.092641e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.820730e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.372027e-02 +- 3.270769e-06 ) GeV^0 -TOTAL : 0.374505 sec +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.604585e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.336536e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] 
(3a) = ( 2.343718e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 +TOTAL : 0.571057 sec INFO: No Floating Point Exceptions have been reported - 979,760,804 cycles:u # 2.467 GHz (74.38%) - 2,343,962 stalled-cycles-frontend:u # 0.24% frontend cycles idle (74.18%) - 4,434,800 stalled-cycles-backend:u # 0.45% backend cycles idle (74.38%) - 1,767,673,609 instructions:u # 1.80 insn per cycle - # 0.00 stalled cycles per insn (75.82%) - 0.424530447 seconds time elapsed + 2,242,744,669 cycles # 2.822 GHz + 3,531,920,926 instructions # 1.57 insn per cycle + 0.851832101 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 -Avg ME (F77/GPU) = 1.2828036033170065E-002 -Relative difference = 1.2498553996774023e-06 +Avg ME (F77/GPU) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = 
SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.547638e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.821232e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.821232e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 4.736828 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.153085e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.384746e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.384746e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 5.830325 sec INFO: No Floating Point Exceptions have been reported - 15,985,012,934 cycles:u # 3.370 GHz (74.91%) - 39,859,736 stalled-cycles-frontend:u # 0.25% frontend cycles idle (74.98%) - 27,486,688 stalled-cycles-backend:u # 0.17% backend cycles idle (75.04%) - 43,996,654,019 instructions:u # 2.75 insn per cycle - # 0.00 stalled cycles per insn (75.04%) - 4.748066178 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 467) (avx2: 0) (512y: 0) (512z: 0) + 16,691,813,815 cycles # 2.861 GHz + 41,266,181,474 instructions # 2.47 insn per cycle + 5.835359179 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 375) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039569285465E-002 -Relative difference = 3.357602059382168e-08 +Avg ME (F77/C++) = 1.2828039854866802E-002 +Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 3.009133e+06 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 4.324751e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.324751e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.759400 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.304367e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.587722e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.587722e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 3.109672 sec INFO: No Floating Point Exceptions have been reported - 9,044,703,094 cycles:u # 3.269 GHz (74.79%) - 41,357,970 stalled-cycles-frontend:u # 0.46% frontend cycles idle (74.86%) - 883,301,243 stalled-cycles-backend:u # 9.77% backend cycles idle (74.95%) - 21,623,711,510 instructions:u # 2.39 insn per cycle - # 0.04 stalled cycles per insn (75.08%) - 2.770793180 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1827) (avx2: 0) (512y: 0) (512z: 0) + 8,995,426,679 cycles # 2.889 GHz + 21,211,089,826 instructions # 2.36 insn per cycle + 3.114839321 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1843) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039385567536E-002 -Relative difference = 4.7897610623017996e-08 +Avg ME (F77/C++) = 1.2828039280066150E-002 +Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 3.475002e+06 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 5.128091e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.128091e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.479050 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.420820e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.716893e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.716893e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.969606 sec INFO: No Floating Point Exceptions have been reported - 8,035,208,858 cycles:u # 3.232 GHz (74.94%) - 42,761,530 stalled-cycles-frontend:u # 0.53% frontend cycles idle (74.91%) - 1,619,377,735 stalled-cycles-backend:u # 20.15% backend cycles idle (74.90%) - 15,353,591,850 instructions:u # 1.91 insn per cycle - # 0.11 stalled cycles per insn (74.91%) - 2.490290485 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2542) (512y: 0) (512z: 0) + 8,272,952,138 cycles # 2.782 GHz + 15,425,102,157 instructions # 1.86 insn per cycle + 2.974640700 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2537) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053369958070E-002 -Relative difference = 2.627022867500074e-07 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.475869e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.854556e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.854556e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.912175 sec +INFO: No Floating Point Exceptions have been reported + 8,117,590,540 cycles # 2.783 GHz + 15,233,342,033 instructions # 1.88 insn per cycle + 2.917189383 seconds time elapsed 
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2423) (512y: 8) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.412788e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.668874e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
3.668874e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 2.982256 sec +INFO: No Floating Point Exceptions have been reported + 6,592,409,084 cycles # 2.208 GHz + 12,843,659,599 instructions # 1.95 insn per cycle + 2.987368722 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1706) (512y: 18) (512z: 1427) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828052564145764E-002 +Relative difference = 1.9988585667912256e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt index 6bf576ec95..e6d46e5965 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt @@ -1,175 +1,218 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in 
BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-05-15_16:12:28 +DATE: 2024-05-16_14:51:09 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 12 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=1, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.882092e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.058393e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.778143e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.372027e-02 +- 3.270769e-06 ) GeV^0 -TOTAL : 0.372693 sec +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.307514e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.195447e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] 
(3a) = ( 2.293637e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 +TOTAL : 0.586214 sec INFO: No Floating Point Exceptions have been reported - 985,461,621 cycles:u # 2.501 GHz (73.78%) - 2,165,098 stalled-cycles-frontend:u # 0.22% frontend cycles idle (73.93%) - 4,268,655 stalled-cycles-backend:u # 0.43% backend cycles idle (76.05%) - 1,822,873,609 instructions:u # 1.85 insn per cycle - # 0.00 stalled cycles per insn (76.40%) - 0.420386075 seconds time elapsed + 2,288,759,129 cycles # 2.822 GHz + 3,563,945,826 instructions # 1.56 insn per cycle + 0.869586754 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 -Avg ME (F77/GPU) = 1.2828036033170065E-002 -Relative difference = 1.2498553996774023e-06 +Avg ME (F77/GPU) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = 
SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.936599e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.385861e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.385861e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 3.919551 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.594420e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.075669e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.075669e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 4.325231 sec INFO: No Floating Point Exceptions have been reported - 13,101,935,021 cycles:u # 3.337 GHz (74.94%) - 40,078,820 stalled-cycles-frontend:u # 0.31% frontend cycles idle (74.94%) - 1,941,296,293 stalled-cycles-backend:u # 14.82% backend cycles idle (74.95%) - 38,012,714,824 instructions:u # 2.90 insn per cycle - # 0.05 stalled cycles per insn (75.03%) - 3.930459501 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 833) (avx2: 0) (512y: 0) (512z: 0) + 12,164,411,956 cycles # 2.810 GHz + 32,427,707,417 instructions # 2.67 insn per cycle + 4.330470336 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 312) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039543819614E-002 -Relative difference = 3.5561191488957804e-08 +Avg ME (F77/C++) = 1.2828039840314887E-002 +Relative difference = 1.244813035273009e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 3.521499e+06 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 5.447162e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.447162e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.454107 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.607791e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.429113e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.429113e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 2.796823 sec INFO: No Floating Point Exceptions have been reported - 7,947,464,831 cycles:u # 3.229 GHz (74.98%) - 41,781,985 stalled-cycles-frontend:u # 0.53% frontend cycles idle (74.97%) - 830,048,875 stalled-cycles-backend:u # 10.44% backend cycles idle (74.97%) - 18,665,388,472 instructions:u # 2.35 insn per cycle - # 0.04 stalled cycles per insn (74.97%) - 2.464960549 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2808) (avx2: 0) (512y: 0) (512z: 0) + 8,009,111,157 cycles # 2.859 GHz + 18,657,618,704 instructions # 2.33 insn per cycle + 2.802139139 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1555) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039385567536E-002 -Relative difference = 4.7897610623017996e-08 +Avg ME (F77/C++) = 1.2828039283704129E-002 +Relative difference = 5.583829420356249e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 3.864776e+06 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 6.022871e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.022871e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.295687 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.719759e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.472647e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.472647e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.690741 sec INFO: No Floating Point Exceptions have been reported - 7,410,311,667 cycles:u # 3.218 GHz (74.99%) - 43,632,900 stalled-cycles-frontend:u # 0.59% frontend cycles idle (74.99%) - 940,477,748 stalled-cycles-backend:u # 12.69% backend cycles idle (74.99%) - 14,171,370,593 instructions:u # 1.91 insn per cycle - # 0.07 stalled cycles per insn (74.99%) - 2.306582436 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2251) (512y: 0) (512z: 0) + 7,416,864,109 cycles # 2.752 GHz + 14,251,974,045 instructions # 1.92 insn per cycle + 2.696083346 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2237) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053337216261E-002 -Relative difference = 2.601499261602198e-07 +Avg ME (F77/C++) = 1.2828053244447801E-002 +Relative difference = 2.5291823782248813e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.774877e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.630440e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.630440e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.645076 sec +INFO: No Floating Point Exceptions have been reported + 7,291,130,406 cycles # 2.752 GHz + 13,948,384,567 instructions # 1.91 insn per cycle + 2.650598467 seconds time elapsed 
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2096) (512y: 3) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828053244447801E-002 +Relative difference = 2.5291823782248813e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.434115e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.741003e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
3.741003e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 2.962177 sec +INFO: No Floating Point Exceptions have been reported + 6,479,327,720 cycles # 2.184 GHz + 13,423,401,797 instructions # 2.07 insn per cycle + 2.967420151 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2071) (512y: 1) (512z: 1198) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828052562326775E-002 +Relative difference = 1.997440588685788e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt index 136ba9f47b..1851f3246c 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt @@ -1,175 +1,218 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in 
BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-05-15_16:12:42 +DATE: 2024-05-16_14:51:33 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 12 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=1, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.920430e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.096494e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.821155e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.372027e-02 +- 3.270769e-06 ) GeV^0 -TOTAL : 0.370870 sec +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.308899e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.200904e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] 
(3a) = ( 2.329787e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 +TOTAL : 0.585570 sec INFO: No Floating Point Exceptions have been reported - 947,505,526 cycles:u # 2.410 GHz (75.24%) - 2,145,387 stalled-cycles-frontend:u # 0.23% frontend cycles idle (75.04%) - 5,170,736 stalled-cycles-backend:u # 0.55% backend cycles idle (75.27%) - 1,747,059,876 instructions:u # 1.84 insn per cycle - # 0.00 stalled cycles per insn (75.70%) - 0.421525177 seconds time elapsed + 2,293,480,451 cycles # 2.820 GHz + 3,552,176,680 instructions # 1.55 insn per cycle + 0.870100804 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 -Avg ME (F77/GPU) = 1.2828036033170065E-002 -Relative difference = 1.2498553996774023e-06 +Avg ME (F77/GPU) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = 
SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.684357e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.629799e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.629799e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 3.015849 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.129885e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.080551e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.080551e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 3.330346 sec INFO: No Floating Point Exceptions have been reported - 9,922,200,438 cycles:u # 3.283 GHz (74.92%) - 39,686,948 stalled-cycles-frontend:u # 0.40% frontend cycles idle (74.86%) - 29,196,305 stalled-cycles-backend:u # 0.29% backend cycles idle (74.87%) - 28,597,933,127 instructions:u # 2.88 insn per cycle - # 0.00 stalled cycles per insn (74.98%) - 3.026797636 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 632) (avx2: 0) (512y: 0) (512z: 0) + 9,425,530,261 cycles # 2.826 GHz + 25,263,309,757 instructions # 2.68 insn per cycle + 3.335509619 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 263) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039569285465E-002 -Relative difference = 3.357602059382168e-08 +Avg ME (F77/C++) = 1.2828039838495897E-002 +Relative difference = 1.2589928273811243e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 3.834973e+06 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 6.307529e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.307529e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.308332 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.953227e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.493970e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.493970e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 2.514743 sec INFO: No Floating Point Exceptions have been reported - 7,466,042,626 cycles:u # 3.224 GHz (74.84%) - 40,665,488 stalled-cycles-frontend:u # 0.54% frontend cycles idle (74.80%) - 30,876,480 stalled-cycles-backend:u # 0.41% backend cycles idle (74.93%) - 16,825,238,557 instructions:u # 2.25 insn per cycle - # 0.00 stalled cycles per insn (75.09%) - 2.319001698 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0) + 7,199,101,915 cycles # 2.858 GHz + 16,870,111,415 instructions # 2.34 insn per cycle + 2.520226033 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1360) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039385567536E-002 -Relative difference = 4.7897610623017996e-08 +Avg ME (F77/C++) = 1.2828039280066150E-002 +Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 4.072999e+06 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 6.542142e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.542142e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.209399 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.869871e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.903620e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.903620e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.568153 sec INFO: No Floating Point Exceptions have been reported - 7,133,699,605 cycles:u # 3.218 GHz (74.80%) - 41,255,792 stalled-cycles-frontend:u # 0.58% frontend cycles idle (74.94%) - 387,219,994 stalled-cycles-backend:u # 5.43% backend cycles idle (75.10%) - 13,573,007,463 instructions:u # 1.90 insn per cycle - # 0.03 stalled cycles per insn (75.10%) - 2.220337867 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2082) (512y: 0) (512z: 0) + 7,089,400,745 cycles # 2.756 GHz + 13,616,924,187 instructions # 1.92 insn per cycle + 2.573571442 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2060) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053331759293E-002 -Relative difference = 2.597245327285885e-07 +Avg ME (F77/C++) = 1.2828053220800939E-002 +Relative difference = 2.5107486628541925e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.911116e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.034757e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.034757e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.534380 sec +INFO: No Floating Point Exceptions have been reported + 7,042,060,221 cycles # 2.774 GHz + 13,426,671,587 instructions # 1.91 insn per cycle + 2.539847169 seconds time elapsed 
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1945) (512y: 4) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828053220800939E-002 +Relative difference = 2.5107486628541925e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.525985e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.967300e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
3.967300e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 2.871748 sec +INFO: No Floating Point Exceptions have been reported + 6,325,625,286 cycles # 2.199 GHz + 13,154,721,049 instructions # 2.08 insn per cycle + 2.877120825 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2029) (512y: 1) (512z: 1083) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828052536860923E-002 +Relative difference = 1.977588895209662e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index c283caeb95..b626a014f8 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,175 +1,218 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in 
BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-05-15_15:54:37 +DATE: 2024-05-16_14:34:05 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 12 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.331470e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.111559e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.340159e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371632e-02 +- 3.269165e-06 ) GeV^0 -TOTAL : 0.494120 sec +EvtsPerSec[Rmb+ME] (23) = ( 7.830025e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.944832e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.160865e+08 ) sec^-1 
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.662041 sec INFO: No Floating Point Exceptions have been reported - 1,299,598,698 cycles:u # 2.495 GHz (75.10%) - 2,154,035 stalled-cycles-frontend:u # 0.17% frontend cycles idle (75.60%) - 5,793,746 stalled-cycles-backend:u # 0.45% backend cycles idle (75.60%) - 2,077,487,131 instructions:u # 1.60 insn per cycle - # 0.00 stalled cycles per insn (74.95%) - 0.549392203 seconds time elapsed + 2,559,219,510 cycles # 2.861 GHz + 3,969,506,530 instructions # 1.55 insn per cycle + 0.952802853 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 -Avg ME (F77/GPU) = 1.2828039901590281E-002 -Relative difference = 7.67145406542181e-09 +Avg ME (F77/GPU) = 1.2828039901590279E-002 +Relative difference = 7.671454200650844e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = 
SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.243362e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.417639e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.417639e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.783303 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.044346e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.221236e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.221236e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 6.439706 sec INFO: No Floating Point Exceptions have been reported - 19,561,889,676 cycles:u # 3.376 GHz (75.01%) - 52,819,414 stalled-cycles-frontend:u # 0.27% frontend cycles idle (75.01%) - 61,840,098 stalled-cycles-backend:u # 0.32% backend cycles idle (75.01%) - 47,022,800,875 instructions:u # 2.40 insn per cycle - # 0.00 stalled cycles per insn (75.01%) - 5.807726502 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 474) (avx2: 0) (512y: 0) (512z: 0) + 18,648,827,254 cycles # 2.894 GHz + 44,218,351,924 instructions # 2.37 insn per cycle + 6.444755062 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 439) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.973198e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.505235e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 
2.505235e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.913375 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.634240e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.158489e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.158489e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.270020 sec INFO: No Floating Point Exceptions have been reported - 12,947,018,589 cycles:u # 3.299 GHz (74.97%) - 48,208,744 stalled-cycles-frontend:u # 0.37% frontend cycles idle (74.93%) - 922,788,709 stalled-cycles-backend:u # 7.13% backend cycles idle (74.93%) - 31,064,474,051 instructions:u # 2.40 insn per cycle - # 0.03 stalled cycles per insn (74.94%) - 3.946281546 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1667) (avx2: 0) (512y: 0) (512z: 0) + 12,337,216,169 cycles # 2.886 GHz + 30,918,100,190 instructions # 2.51 insn per cycle + 4.275170664 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1685) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.590540e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.424811e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 
3.424811e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.152600 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.943703e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.696046e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.696046e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.651391 sec +INFO: No Floating Point Exceptions have been reported + 10,097,284,751 cycles # 2.762 GHz + 19,374,074,587 instructions # 1.92 insn per cycle + 3.656592402 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2130) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039951670679E-002 +Relative difference = 3.767475112924841e-09 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': 
AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.039225e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.880994e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.880994e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.498933 sec INFO: No Floating Point Exceptions have been reported - 10,294,139,229 cycles:u # 3.254 GHz (74.99%) - 49,396,962 stalled-cycles-frontend:u # 0.48% frontend cycles idle (74.96%) - 918,821,242 stalled-cycles-backend:u # 8.93% backend cycles idle (74.96%) - 19,581,766,584 instructions:u # 1.90 insn per cycle - # 0.05 stalled cycles per insn (74.97%) - 3.167170057 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2119) (512y: 0) (512z: 0) + 9,699,890,764 cycles # 2.769 GHz + 18,944,296,026 instructions # 1.95 insn per cycle + 3.504313379 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1860) (512y: 188) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.766168e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.359450e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.359450e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.977432 sec +INFO: No Floating Point Exceptions have been reported + 8,362,626,878 cycles # 2.101 GHz + 15,058,722,791 instructions # 1.80 insn per cycle + 3.982532855 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1024) (512y: 155) (512z: 1316) 
------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039951670679E-002 +Relative difference = 3.767475112924841e-09 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt index 782ac7bf50..f9780717c1 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt @@ -1,175 +1,218 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for 
tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-05-15_15:54:55 +DATE: 2024-05-16_14:34:35 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 12 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.919863e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.585304e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.905943e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371632e-02 +- 3.269165e-06 ) GeV^0 -TOTAL : 0.480237 sec +EvtsPerSec[Rmb+ME] (23) = ( 7.831074e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.944999e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.163112e+08 ) sec^-1 
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.663702 sec INFO: No Floating Point Exceptions have been reported - 1,287,175,475 cycles:u # 2.547 GHz (74.78%) - 2,217,592 stalled-cycles-frontend:u # 0.17% frontend cycles idle (75.01%) - 5,481,156 stalled-cycles-backend:u # 0.43% backend cycles idle (74.96%) - 2,029,305,527 instructions:u # 1.58 insn per cycle - # 0.00 stalled cycles per insn (74.73%) - 0.532727801 seconds time elapsed + 2,550,713,530 cycles # 2.845 GHz + 3,995,712,636 instructions # 1.57 insn per cycle + 0.958037940 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 -Avg ME (F77/GPU) = 1.2828039901590284E-002 -Relative difference = 7.67145379496374e-09 +Avg ME (F77/GPU) = 1.2828039901590279E-002 +Relative difference = 7.671454200650844e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = 
SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.311831e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.509898e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.509898e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.519562 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.088480e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.281697e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.281697e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 6.193535 sec INFO: No Floating Point Exceptions have been reported - 18,598,674,944 cycles:u # 3.364 GHz (74.98%) - 52,764,540 stalled-cycles-frontend:u # 0.28% frontend cycles idle (74.97%) - 51,457,005 stalled-cycles-backend:u # 0.28% backend cycles idle (74.97%) - 44,694,768,308 instructions:u # 2.40 insn per cycle - # 0.00 stalled cycles per insn (74.98%) - 5.533782344 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 498) (avx2: 0) (512y: 0) (512z: 0) + 17,967,058,694 cycles # 2.899 GHz + 42,467,805,223 instructions # 2.36 insn per cycle + 6.198684795 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.025514e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.586424e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 
2.586424e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.827235 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.676284e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.231904e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.231904e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.170684 sec INFO: No Floating Point Exceptions have been reported - 12,669,139,215 cycles:u # 3.301 GHz (74.99%) - 52,486,208 stalled-cycles-frontend:u # 0.41% frontend cycles idle (74.99%) - 835,456,353 stalled-cycles-backend:u # 6.59% backend cycles idle (74.99%) - 30,221,112,857 instructions:u # 2.39 insn per cycle - # 0.03 stalled cycles per insn (74.99%) - 3.841570194 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1650) (avx2: 0) (512y: 0) (512z: 0) + 12,134,694,075 cycles # 2.906 GHz + 30,224,929,059 instructions # 2.49 insn per cycle + 4.175943490 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1692) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.596697e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.455534e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 
3.455534e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.195230 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.950812e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.735198e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.735198e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.639278 sec +INFO: No Floating Point Exceptions have been reported + 10,078,657,444 cycles # 2.766 GHz + 19,257,126,653 instructions # 1.91 insn per cycle + 3.644365244 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2146) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039951670679E-002 +Relative difference = 3.767475112924841e-09 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': 
AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.049769e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.898049e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.898049e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.481211 sec INFO: No Floating Point Exceptions have been reported - 10,281,010,170 cycles:u # 3.228 GHz (74.88%) - 52,319,367 stalled-cycles-frontend:u # 0.51% frontend cycles idle (74.92%) - 245,918,154 stalled-cycles-backend:u # 2.39% backend cycles idle (74.92%) - 18,997,713,463 instructions:u # 1.85 insn per cycle - # 0.01 stalled cycles per insn (75.05%) - 3.226221795 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2072) (512y: 0) (512z: 0) + 9,647,917,970 cycles # 2.768 GHz + 18,746,418,128 instructions # 1.94 insn per cycle + 3.486360008 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1834) (512y: 191) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.796433e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.409552e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.409552e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.919050 sec +INFO: No Floating Point Exceptions have been reported + 8,244,471,456 cycles # 2.102 GHz + 14,980,246,059 instructions # 1.82 insn per cycle + 3.924194596 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1021) (512y: 156) (512z: 1305) 
------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039951670679E-002 +Relative difference = 3.767475112924841e-09 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index bf09e0254f..205a4bf5b6 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,175 +1,218 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for 
tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-15_15:55:13 +DATE: 2024-05-16_14:35:05 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.775469e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.965485e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.019901e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 -TOTAL : 0.369478 sec +EvtsPerSec[Rmb+ME] (23) = ( 5.201162e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.181610e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.277713e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 
+TOTAL : 0.525559 sec INFO: No Floating Point Exceptions have been reported - 879,252,502 cycles:u # 2.222 GHz (74.05%) - 2,104,678 stalled-cycles-frontend:u # 0.24% frontend cycles idle (75.60%) - 5,252,436 stalled-cycles-backend:u # 0.60% backend cycles idle (75.78%) - 1,347,222,847 instructions:u # 1.53 insn per cycle - # 0.00 stalled cycles per insn (75.87%) - 0.424733256 seconds time elapsed + 2,155,305,398 cycles # 2.849 GHz + 3,120,666,963 instructions # 1.45 insn per cycle + 0.814520269 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063388516817 -Relative difference = 3.258803416564443e-07 +Avg ME (F77/GPU) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 
128 -EvtsPerSec[Rmb+ME] (23) = ( 2.543325e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.609757e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.609757e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.302634 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.068773e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.129905e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.129905e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 5.176250 sec INFO: No Floating Point Exceptions have been reported - 14,784,201,122 cycles:u # 3.427 GHz (74.98%) - 8,981,005 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.97%) - 3,177,242,207 stalled-cycles-backend:u # 21.49% backend cycles idle (74.97%) - 38,735,076,382 instructions:u # 2.62 insn per cycle - # 0.08 stalled cycles per insn (74.98%) - 4.317845101 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) + 15,001,077,825 cycles # 2.896 GHz + 38,374,710,401 instructions # 2.56 insn per cycle + 5.181415080 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515649 -Relative difference = 3.258803992249869e-07 +Avg ME (F77/C++) = 2.0288063388515645 +Relative difference = 3.258803994438787e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 4.408116e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.628664e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 4.628664e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.557856 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.492980e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.684039e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.684039e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.109528 sec INFO: No Floating Point Exceptions have been reported - 8,680,841,989 cycles:u # 3.380 GHz (74.95%) - 10,202,330 stalled-cycles-frontend:u # 0.12% frontend cycles idle (75.09%) - 767,677,046 stalled-cycles-backend:u # 8.84% backend cycles idle (75.09%) - 24,310,844,660 instructions:u # 2.80 insn per cycle - # 0.03 stalled cycles per insn (75.09%) - 2.572870820 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) + 9,049,547,879 cycles # 2.906 GHz + 24,578,150,431 instructions # 2.72 insn per cycle + 3.114795475 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 7.696598e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.291719e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.291719e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 
3.402993e-03 ) GeV^0 -TOTAL : 1.541866 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.554648e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.034559e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.034559e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.996517 sec +INFO: No Floating Point Exceptions have been reported + 5,443,502,791 cycles # 2.721 GHz + 11,251,469,346 instructions # 2.07 insn per cycle + 2.001703471 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 
6.119114e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.713742e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.713742e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.821745 sec INFO: No Floating Point Exceptions have been reported - 5,114,502,487 cycles:u # 3.292 GHz (74.88%) - 8,727,979 stalled-cycles-frontend:u # 0.17% frontend cycles idle (74.78%) - 17,014,329 stalled-cycles-backend:u # 0.33% backend cycles idle (74.79%) - 11,540,817,661 instructions:u # 2.26 insn per cycle - # 0.00 stalled cycles per insn (74.82%) - 1.556824864 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) + 4,960,408,882 cycles # 2.716 GHz + 10,558,806,229 instructions # 2.13 insn per cycle + 1.826903839 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.693426e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.898518e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.898518e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.947911 sec +INFO: No Floating Point Exceptions have been reported + 5,367,244,097 cycles # 1.818 GHz + 7,793,958,391 instructions # 1.45 insn per cycle + 2.953294554 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) 
------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt index 8154293ff2..4b2366d44f 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt @@ -1,182 +1,231 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand 
(USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-15_16:24:26 +DATE: 2024-05-16_15:00:55 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.949994e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.775129e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.775129e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.227514 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.373758e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.924060e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.924060e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.825705 sec INFO: No Floating Point Exceptions have been reported - 3,766,128,203 cycles:u # 2.980 GHz (74.39%) - 38,759,428 stalled-cycles-frontend:u # 1.03% frontend cycles idle (75.02%) - 1,153,146,831 stalled-cycles-backend:u # 30.62% backend cycles idle (75.32%) - 3,874,478,439 instructions:u # 1.03 insn per cycle - # 0.30 stalled cycles per insn (75.27%) - 1.286158558 seconds time elapsed + 3,037,157,201 cycles # 2.832 GHz + 4,768,877,833 instructions # 1.57 insn per cycle + 1.128818887 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063388516817 -Relative difference = 3.258803416564443e-07 +Avg ME (F77/GPU) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.536141e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.601805e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.601805e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.393097 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.032947e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.092197e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.092197e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 5.344113 sec INFO: No Floating Point Exceptions have been reported - 14,922,234,897 cycles:u # 3.381 GHz (75.00%) - 10,271,324 stalled-cycles-frontend:u # 0.07% frontend cycles idle (75.00%) - 3,181,853,965 stalled-cycles-backend:u # 21.32% backend cycles idle (74.99%) - 38,759,660,760 instructions:u # 2.60 insn per cycle - # 0.08 stalled cycles per insn (74.99%) - 4.418072942 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) + 15,315,317,736 cycles # 2.863 GHz + 38,433,762,310 instructions # 2.51 insn per cycle + 5.351126978 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515649 -Relative difference = 3.258803992249869e-07 +Avg ME (F77/C++) = 2.0288063388515645 +Relative difference = 3.258803994438787e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 4.389394e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.605750e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.605750e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.656429 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.394451e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.578816e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.578816e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.279010 sec INFO: No Floating Point Exceptions have been reported - 8,847,222,549 cycles:u # 3.306 GHz (74.91%) - 9,795,840 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.91%) - 783,527,311 stalled-cycles-backend:u # 8.86% backend cycles idle (74.91%) - 24,482,823,781 instructions:u # 2.77 insn per cycle - # 0.03 stalled cycles per insn (74.93%) - 2.680334047 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) + 9,390,215,737 cycles # 2.859 GHz + 24,761,602,813 instructions # 2.64 insn per cycle + 3.285914811 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ 
PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 7.603640e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.182504e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.182504e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.645689 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.346272e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.804430e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.804430e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.151067 sec INFO: No Floating Point Exceptions have been reported - 5,281,231,320 cycles:u # 3.171 GHz (75.03%) - 9,584,381 stalled-cycles-frontend:u # 0.18% frontend cycles idle (75.04%) - 40,941,844 stalled-cycles-backend:u # 0.78% backend cycles idle (75.04%) - 11,783,428,384 instructions:u # 2.23 insn per cycle - # 0.00 stalled cycles per insn (75.03%) - 1.669260734 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) + 5,795,064,676 cycles # 2.687 GHz + 11,538,955,643 instructions # 1.99 insn per cycle + 2.157987463 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ 
PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +WARNING! 
Instantiate host Bridge (nevt=524288) +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.949125e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.512113e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.512113e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.953091 sec +INFO: No Floating Point Exceptions have been reported + 5,277,608,562 cycles # 2.695 GHz + 10,845,633,589 instructions # 2.06 insn per cycle + 1.960046746 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +WARNING! 
Instantiate host Bridge (nevt=524288) +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.545325e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.736253e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.736253e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.149111 sec +INFO: No Floating Point Exceptions have been reported + 5,725,568,726 cycles # 1.815 GHz + 8,037,864,149 instructions # 1.40 insn per cycle + 3.156036160 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt index 6bd2b91592..66fdf9efe4 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt @@ -1,175 +1,218 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-15_16:31:32 +DATE: 2024-05-16_15:11:59 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 --common OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.751435e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.956627e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.010731e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.582142e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.158915e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.274993e+08 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.084285 sec +TOTAL : 0.629579 sec INFO: No Floating Point 
Exceptions have been reported - 3,225,658,717 cycles:u # 2.977 GHz (74.93%) - 27,364,800 stalled-cycles-frontend:u # 0.85% frontend cycles idle (74.97%) - 1,146,210,651 stalled-cycles-backend:u # 35.53% backend cycles idle (74.95%) - 2,995,970,052 instructions:u # 0.93 insn per cycle - # 0.38 stalled cycles per insn (74.96%) - 1.131832669 seconds time elapsed + 2,438,671,292 cycles # 2.828 GHz + 3,557,518,240 instructions # 1.46 insn per cycle + 0.918692112 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063388516817 -Relative difference = 3.258803416564443e-07 +Avg ME (F77/GPU) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.538537e+05 ) 
sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.604840e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.604840e+05 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.063642e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.124319e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.124319e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.308650 sec +TOTAL : 5.250994 sec INFO: No Floating Point Exceptions have been reported - 14,804,013,808 cycles:u # 3.428 GHz (74.99%) - 9,582,136 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.99%) - 3,203,914,389 stalled-cycles-backend:u # 21.64% backend cycles idle (74.99%) - 38,731,686,667 instructions:u # 2.62 insn per cycle - # 0.08 stalled cycles per insn (75.00%) - 4.319924242 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) + 15,177,224,624 cycles # 2.888 GHz + 38,389,589,114 instructions # 2.53 insn per cycle + 5.256694767 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515649 -Relative difference = 3.258803992249869e-07 +Avg ME (F77/C++) = 2.0288063388515645 +Relative difference = 3.258803994438787e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 4.415669e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.638907e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.638907e+05 ) sec^-1 +OMP 
threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.469246e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.659787e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.659787e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.560539 sec +TOTAL : 3.192043 sec INFO: No Floating Point Exceptions have been reported - 8,700,534,897 cycles:u # 3.382 GHz (74.89%) - 8,863,726 stalled-cycles-frontend:u # 0.10% frontend cycles idle (75.04%) - 770,913,873 stalled-cycles-backend:u # 8.86% backend cycles idle (75.12%) - 24,348,483,963 instructions:u # 2.80 insn per cycle - # 0.03 stalled cycles per insn (75.12%) - 2.574063419 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) + 9,234,869,625 cycles # 2.889 GHz + 24,577,322,685 instructions # 2.66 insn per cycle + 3.197667860 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 7.694312e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.299185e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.299185e+05 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.520662e+05 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 5.999169e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.999169e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.069942 sec +INFO: No Floating Point Exceptions have been reported + 5,642,462,557 cycles # 2.720 GHz + 11,233,692,701 instructions # 1.99 insn per cycle + 2.075542898 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.151383e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.740134e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.740134e+05 ) 
sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.546948 sec +TOTAL : 1.875924 sec INFO: No Floating Point Exceptions have been reported - 5,119,500,672 cycles:u # 3.283 GHz (74.87%) - 8,490,600 stalled-cycles-frontend:u # 0.17% frontend cycles idle (74.88%) - 15,827,668 stalled-cycles-backend:u # 0.31% backend cycles idle (74.88%) - 11,539,261,865 instructions:u # 2.25 insn per cycle - # 0.00 stalled cycles per insn (74.86%) - 1.572322821 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) + 5,122,190,825 cycles # 2.724 GHz + 10,508,387,782 instructions # 2.05 insn per cycle + 1.881606947 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.617306e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.815381e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.815381e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 3.070946 sec +INFO: No Floating Point Exceptions have been reported + 5,582,158,144 cycles # 1.816 GHz + 7,742,870,902 instructions # 1.39 insn per cycle + 3.076599052 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) 
------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt index 118dde3b8c..82194f6fe3 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt @@ -40,175 +40,175 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-15_09:26:58 +DATE: 2024-05-16_15:09:12 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 --curhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.590472e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.159111e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.276500e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.587611e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.161872e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.276844e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.568515 sec +TOTAL : 0.568860 sec INFO: No Floating Point Exceptions have been reported - 2,258,768,488 cycles # 2.823 GHz - 3,528,957,770 instructions # 1.56 insn per cycle - 0.856852838 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst + 2,269,706,021 cycles # 2.822 GHz + 3,484,022,632 instructions # 1.54 insn per cycle + 0.860923648 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst 
==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 Avg ME (F77/GPU) = 2.0288063388516822 Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.044700e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.105241e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.105241e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.045564e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.105865e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.105865e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.237438 sec +TOTAL : 5.235402 sec INFO: No Floating Point Exceptions have been reported - 15,001,745,020 cycles # 2.862 GHz - 38,376,029,222 instructions # 2.56 insn per cycle - 5.242948759 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) + 14,997,948,844 cycles # 2.862 GHz + 38,373,416,469 instructions # 2.56 insn per cycle + 5.240872993 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515645 Relative difference = 3.258803994438787e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.433268e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.620817e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.620817e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.444491e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.632712e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.632712e+05 ) sec^-1 
MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.163897 sec +TOTAL : 3.153505 sec INFO: No Floating Point Exceptions have been reported - 9,061,806,813 cycles # 2.860 GHz - 24,577,933,300 instructions # 2.71 insn per cycle - 3.169521571 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) + 9,049,779,346 cycles # 2.866 GHz + 24,577,971,625 instructions # 2.72 insn per cycle + 3.158944927 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +========================================================================= 
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.470565e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.942954e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.942954e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.437117e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.904229e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.904229e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.026313 sec +TOTAL : 2.038232 sec INFO: No Floating Point Exceptions have been reported - 5,449,900,362 cycles # 2.684 GHz - 11,251,125,518 instructions # 2.06 insn per cycle - 2.031793334 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) + 5,473,582,641 cycles # 2.680 GHz + 11,251,858,191 instructions # 2.06 insn per cycle + 2.043714380 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.065178e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.648113e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.648113e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.066218e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.650713e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.650713e+05 ) sec^-1 
MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.838036 sec +TOTAL : 1.837947 sec INFO: No Floating Point Exceptions have been reported - 4,937,951,228 cycles # 2.680 GHz - 10,556,745,903 instructions # 2.14 insn per cycle - 1.843600256 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) + 4,942,309,563 cycles # 2.682 GHz + 10,557,200,123 instructions # 2.14 insn per cycle + 1.844865568 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +========================================================================= 
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.582246e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.775292e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.775292e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.598977e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.794496e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.794496e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.036261 sec +TOTAL : 3.023039 sec INFO: No Floating Point Exceptions have been reported - 5,395,290,125 cycles # 1.774 GHz - 7,793,590,643 instructions # 1.44 insn per cycle - 3.041803947 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) + 5,367,715,100 cycles # 1.773 GHz + 7,793,769,749 instructions # 1.45 insn per cycle + 3.028517366 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt index dee74b693c..3db0a99453 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt @@ -1,176 +1,220 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
-make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-15_16:29:22 +DATE: 2024-05-16_15:06:28 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 --rmbhst OMP= +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.803717e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.946630e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.004336e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.170710 sec +EvtsPerSec[Rmb+ME] (23) = ( 5.591450e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.156507e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.275190e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.728834 sec INFO: No Floating Point Exceptions have been reported - 3,658,150,944 cycles:u # 3.038 GHz (74.90%) - 37,981,899 stalled-cycles-frontend:u # 1.04% frontend cycles idle (74.67%) - 1,144,066,478 stalled-cycles-backend:u # 31.27% backend cycles idle (74.61%) - 3,900,221,248 instructions:u # 1.07 insn per cycle - # 0.29 stalled cycles per insn (74.78%) - 1.221748818 seconds time elapsed + 2,711,621,820 cycles # 2.826 GHz + 4,288,575,941 instructions # 1.58 insn per cycle + 1.017933550 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063388516817 -Relative difference = 3.258803416564443e-07 +Avg ME (F77/GPU) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.542224e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.608649e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.608649e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.303981 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.045774e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.105988e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.105988e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 5.234926 sec INFO: No Floating Point Exceptions have been reported - 14,799,335,844 cycles:u # 3.430 GHz (74.97%) - 9,368,179 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.97%) - 3,188,931,778 stalled-cycles-backend:u # 21.55% backend cycles idle (74.97%) - 38,751,756,847 instructions:u # 2.62 insn per cycle - # 0.08 stalled cycles per insn (74.97%) - 4.315796772 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) + 14,996,539,700 cycles # 2.862 GHz + 38,373,492,139 instructions # 2.56 insn per cycle + 5.240540958 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe 
/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515649 -Relative difference = 3.258803992249869e-07 +Avg ME (F77/C++) = 2.0288063388515645 +Relative difference = 3.258803994438787e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 4.419387e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.638360e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.638360e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.551312 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.431303e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.617753e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.617753e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.165641 sec INFO: No Floating Point Exceptions have been reported - 8,648,533,390 cycles:u # 3.376 GHz (75.02%) - 9,892,072 stalled-cycles-frontend:u # 0.11% frontend cycles idle (75.02%) - 768,203,756 stalled-cycles-backend:u # 8.88% backend cycles idle (75.02%) - 24,374,130,896 instructions:u # 2.82 insn per cycle - # 0.03 stalled cycles per insn (75.02%) - 2.562833032 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) + 9,072,261,960 cycles # 2.862 GHz + 24,578,342,604 instructions # 2.71 insn per cycle + 3.171145800 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 7.676900e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.271425e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.271425e+05 ) sec^-1 -MeanMatrixElemValue = ( 
2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.546104 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.460196e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.936686e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.936686e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.030403 sec +INFO: No Floating Point Exceptions have been reported + 5,452,336,471 cycles # 2.679 GHz + 11,251,160,510 instructions # 2.06 insn per cycle + 2.035938093 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 
+EvtsPerSec[Rmb+ME] (23) = ( 6.063893e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.649981e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.649981e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.838982 sec INFO: No Floating Point Exceptions have been reported - 5,132,958,800 cycles:u # 3.297 GHz (74.83%) - 9,334,961 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.84%) - 25,693,530 stalled-cycles-backend:u # 0.50% backend cycles idle (74.84%) - 11,563,000,912 instructions:u # 2.25 insn per cycle - # 0.00 stalled cycles per insn (74.88%) - 1.562529164 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) + 4,938,631,038 cycles # 2.680 GHz + 10,556,930,414 instructions # 2.14 insn per cycle + 1.844618889 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.589787e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.785615e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.785615e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.030446 sec +INFO: No Floating Point Exceptions have been reported + 5,385,276,295 cycles # 1.774 GHz + 7,793,583,016 instructions # 1.45 insn per cycle + 3.036161028 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) 
------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt index c6a49a35fb..0caf1293cf 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt @@ -1,175 +1,218 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building 
in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-15_15:55:26 +DATE: 2024-05-16_14:35:28 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.855896e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.923148e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.976161e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 -TOTAL : 0.369294 sec +EvtsPerSec[Rmb+ME] (23) = ( 5.206695e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.183658e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.279171e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 
+TOTAL : 0.521961 sec INFO: No Floating Point Exceptions have been reported - 874,412,714 cycles:u # 2.211 GHz (75.09%) - 2,032,981 stalled-cycles-frontend:u # 0.23% frontend cycles idle (75.81%) - 5,400,709 stalled-cycles-backend:u # 0.62% backend cycles idle (76.16%) - 1,373,590,793 instructions:u # 1.57 insn per cycle - # 0.00 stalled cycles per insn (74.85%) - 0.421181023 seconds time elapsed + 2,148,802,757 cycles # 2.845 GHz + 3,054,152,486 instructions # 1.42 insn per cycle + 0.812117976 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063388516817 -Relative difference = 3.258803416564443e-07 +Avg ME (F77/GPU) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 
128 -EvtsPerSec[Rmb+ME] (23) = ( 2.454763e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.516817e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.516817e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.450589 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.068168e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.129039e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.129039e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 5.177399 sec INFO: No Floating Point Exceptions have been reported - 15,338,008,426 cycles:u # 3.437 GHz (74.93%) - 9,263,818 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.91%) - 91,044,693 stalled-cycles-backend:u # 0.59% backend cycles idle (74.95%) - 39,502,158,984 instructions:u # 2.58 insn per cycle - # 0.00 stalled cycles per insn (75.04%) - 4.466196489 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) + 15,011,872,798 cycles # 2.897 GHz + 40,100,761,049 instructions # 2.67 insn per cycle + 5.182501125 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 4.420771e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.639726e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.639726e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 
3.402993e-03 ) GeV^0 -TOTAL : 2.551384 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.634343e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.844834e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.844834e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.993727 sec INFO: No Floating Point Exceptions have been reported - 8,653,309,782 cycles:u # 3.377 GHz (75.02%) - 10,341,757 stalled-cycles-frontend:u # 0.12% frontend cycles idle (75.03%) - 1,240,769,587 stalled-cycles-backend:u # 14.34% backend cycles idle (75.03%) - 23,497,288,002 instructions:u # 2.72 insn per cycle - # 0.05 stalled cycles per insn (75.03%) - 2.566849143 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1948) (avx2: 0) (512y: 0) (512z: 0) + 8,671,029,072 cycles # 2.892 GHz + 23,670,969,931 instructions # 2.73 insn per cycle + 2.999072752 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2072) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 6.942598e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.421256e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.421256e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 
3.402993e-03 ) GeV^0 -TOTAL : 1.690103 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.945254e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.323667e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.323667e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.228700 sec +INFO: No Floating Point Exceptions have been reported + 6,081,438,462 cycles # 2.724 GHz + 13,061,002,322 instructions # 2.15 insn per cycle + 2.233958089 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2546) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 
5.205594e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.622405e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.622405e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.121856 sec INFO: No Floating Point Exceptions have been reported - 5,631,807,081 cycles:u # 3.310 GHz (74.95%) - 9,115,540 stalled-cycles-frontend:u # 0.16% frontend cycles idle (75.08%) - 769,964,712 stalled-cycles-backend:u # 13.67% backend cycles idle (75.08%) - 13,135,699,077 instructions:u # 2.33 insn per cycle - # 0.06 stalled cycles per insn (75.08%) - 1.705406499 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2560) (512y: 0) (512z: 0) + 5,798,891,312 cycles # 2.727 GHz + 12,319,969,769 instructions # 2.12 insn per cycle + 2.127030294 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2093) (512y: 294) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.380432e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.550251e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.550251e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.209519 sec +INFO: No Floating Point Exceptions have been reported + 5,821,355,640 cycles # 1.812 GHz + 9,603,981,726 instructions # 1.65 insn per cycle + 3.214724733 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1509) (512y: 209) (512z: 1971) 
------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt index 81d64ef62d..6af05ea7e1 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt @@ -1,175 +1,218 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building 
in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-15_16:12:54 +DATE: 2024-05-16_14:51:55 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.860003e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.960397e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.014556e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 -TOTAL : 0.367937 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.681198e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.166116e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.276872e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 
+TOTAL : 0.529611 sec INFO: No Floating Point Exceptions have been reported - 868,560,474 cycles:u # 2.222 GHz (75.41%) - 2,087,259 stalled-cycles-frontend:u # 0.24% frontend cycles idle (75.63%) - 4,704,932 stalled-cycles-backend:u # 0.54% backend cycles idle (75.48%) - 1,382,708,731 instructions:u # 1.59 insn per cycle - # 0.00 stalled cycles per insn (75.04%) - 0.419779048 seconds time elapsed + 2,190,477,637 cycles # 2.832 GHz + 3,135,955,530 instructions # 1.43 insn per cycle + 0.830299558 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063388516817 -Relative difference = 3.258803416564443e-07 +Avg ME (F77/GPU) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 
128 -EvtsPerSec[Rmb+ME] (23) = ( 2.849959e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.933619e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.933619e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.858026 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.383572e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.466296e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.466296e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 4.507918 sec INFO: No Floating Point Exceptions have been reported - 13,230,268,308 cycles:u # 3.420 GHz (75.00%) - 9,196,867 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.98%) - 1,450,173,701 stalled-cycles-backend:u # 10.96% backend cycles idle (74.98%) - 35,846,754,718 instructions:u # 2.71 insn per cycle - # 0.04 stalled cycles per insn (74.99%) - 3.872756359 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1078) (avx2: 0) (512y: 0) (512z: 0) + 13,013,442,526 cycles # 2.884 GHz + 34,387,029,075 instructions # 2.64 insn per cycle + 4.513459426 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 686) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 4.440638e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.661874e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.661874e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 
3.402993e-03 ) GeV^0 -TOTAL : 2.540476 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.946707e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.083881e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.083881e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.667816 sec INFO: No Floating Point Exceptions have been reported - 8,617,423,314 cycles:u # 3.377 GHz (74.92%) - 9,742,559 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.92%) - 2,419,994,807 stalled-cycles-backend:u # 28.08% backend cycles idle (74.93%) - 21,899,760,348 instructions:u # 2.54 insn per cycle - # 0.11 stalled cycles per insn (74.93%) - 2.554989772 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2334) (avx2: 0) (512y: 0) (512z: 0) + 10,591,846,077 cycles # 2.884 GHz + 24,007,245,790 instructions # 2.27 insn per cycle + 3.673406920 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2582) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 6.697071e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.140586e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.140586e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 
3.402993e-03 ) GeV^0 -TOTAL : 1.745335 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.532632e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.849376e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.849376e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.423240 sec INFO: No Floating Point Exceptions have been reported - 5,811,955,870 cycles:u # 3.309 GHz (74.99%) - 8,545,914 stalled-cycles-frontend:u # 0.15% frontend cycles idle (74.95%) - 1,902,564,136 stalled-cycles-backend:u # 32.74% backend cycles idle (74.95%) - 12,067,881,535 instructions:u # 2.08 insn per cycle - # 0.16 stalled cycles per insn (74.96%) - 1.759875043 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3062) (512y: 0) (512z: 0) + 6,577,855,979 cycles # 2.709 GHz + 12,401,365,684 instructions # 1.89 insn per cycle + 2.428791768 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3154) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 +Avg ME (F77/C++) = 2.0288063388516200 +Relative difference = 3.2588037208240405e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.754457e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.104775e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.104775e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.314834 sec +INFO: No Floating Point Exceptions have been reported + 6,233,998,487 cycles # 2.688 GHz + 11,576,068,199 instructions # 1.86 insn per cycle + 2.320534715 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 
2690) (512y: 239) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516200 +Relative difference = 3.2588037208240405e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.687851e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.893233e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.893233e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.952132 sec +INFO: No Floating Point Exceptions have been reported + 5,323,772,693 cycles # 1.802 GHz + 9,296,912,008 instructions # 1.75 insn per cycle + 2.957828928 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2115) (512y: 282) (512z: 1958) ------------------------------------------------------------------------- 
-/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt index 30bb9dc816..2040ec21eb 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt @@ -1,175 +1,218 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for 
tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-15_16:13:07 +DATE: 2024-05-16_14:52:19 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.855524e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.918252e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.971224e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 -TOTAL : 0.365301 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.680230e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.168644e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.280417e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 
+TOTAL : 0.532171 sec INFO: No Floating Point Exceptions have been reported - 875,261,092 cycles:u # 2.238 GHz (74.74%) - 2,145,696 stalled-cycles-frontend:u # 0.25% frontend cycles idle (75.52%) - 4,783,588 stalled-cycles-backend:u # 0.55% backend cycles idle (75.37%) - 1,404,640,435 instructions:u # 1.60 insn per cycle - # 0.00 stalled cycles per insn (75.93%) - 0.416048016 seconds time elapsed + 2,169,507,018 cycles # 2.828 GHz + 3,115,355,964 instructions # 1.44 insn per cycle + 0.826043020 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063388516817 -Relative difference = 3.258803416564443e-07 +Avg ME (F77/GPU) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 
128 -EvtsPerSec[Rmb+ME] (23) = ( 3.240552e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.349765e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.349765e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.415620 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.524819e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.617052e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.617052e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 4.262483 sec INFO: No Floating Point Exceptions have been reported - 11,671,621,494 cycles:u # 3.406 GHz (75.03%) - 9,251,807 stalled-cycles-frontend:u # 0.08% frontend cycles idle (75.03%) - 68,281,414 stalled-cycles-backend:u # 0.59% backend cycles idle (75.03%) - 35,744,496,973 instructions:u # 3.06 insn per cycle - # 0.00 stalled cycles per insn (75.03%) - 3.431955897 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 469) (avx2: 0) (512y: 0) (512z: 0) + 12,358,560,610 cycles # 2.896 GHz + 35,037,446,637 instructions # 2.84 insn per cycle + 4.268207887 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 457) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 4.823848e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.086401e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.086401e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 
3.402993e-03 ) GeV^0 -TOTAL : 2.352281 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.908483e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.040450e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.040450e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.714757 sec INFO: No Floating Point Exceptions have been reported - 7,949,427,078 cycles:u # 3.364 GHz (74.95%) - 9,527,461 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.95%) - 1,447,684,678 stalled-cycles-backend:u # 18.21% backend cycles idle (74.96%) - 21,256,961,664 instructions:u # 2.67 insn per cycle - # 0.07 stalled cycles per insn (74.97%) - 2.367088619 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2088) (avx2: 0) (512y: 0) (512z: 0) + 10,745,562,014 cycles # 2.889 GHz + 23,084,374,218 instructions # 2.15 insn per cycle + 3.720383315 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2363) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 7.971872e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.610345e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.610345e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 
3.402993e-03 ) GeV^0 -TOTAL : 1.493454 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.878271e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.246530e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.246530e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.258864 sec +INFO: No Floating Point Exceptions have been reported + 6,151,591,588 cycles # 2.717 GHz + 11,956,808,073 instructions # 1.94 insn per cycle + 2.264473200 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2509) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 
4.958079e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.345089e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.345089e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.224234 sec INFO: No Floating Point Exceptions have been reported - 4,944,281,841 cycles:u # 3.286 GHz (75.01%) - 9,685,586 stalled-cycles-frontend:u # 0.20% frontend cycles idle (75.01%) - 190,981,785 stalled-cycles-backend:u # 3.86% backend cycles idle (75.01%) - 11,368,416,061 instructions:u # 2.30 insn per cycle - # 0.02 stalled cycles per insn (75.03%) - 1.507919605 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2370) (512y: 0) (512z: 0) + 6,017,653,055 cycles # 2.700 GHz + 11,128,128,624 instructions # 1.85 insn per cycle + 2.229785356 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2126) (512y: 174) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.739650e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.951827e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.951827e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.913360 sec +INFO: No Floating Point Exceptions have been reported + 5,212,798,448 cycles # 1.786 GHz + 9,020,884,070 instructions # 1.73 insn per cycle + 2.919040069 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1650) (512y: 208) (512z: 1567) 
------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index d02c785b32..93f412dad4 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,175 +1,218 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building 
in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-15_15:55:40 +DATE: 2024-05-16_14:35:52 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.691037e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.922898e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.084230e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.078079e+00 +- 3.394933e-03 ) GeV^0 -TOTAL : 0.316605 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.088595e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.705968e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.969781e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 
+TOTAL : 0.482195 sec INFO: No Floating Point Exceptions have been reported - 791,598,853 cycles:u # 2.324 GHz (74.51%) - 2,217,246 stalled-cycles-frontend:u # 0.28% frontend cycles idle (74.32%) - 4,488,513 stalled-cycles-backend:u # 0.57% backend cycles idle (73.63%) - 1,374,860,279 instructions:u # 1.74 insn per cycle - # 0.00 stalled cycles per insn (73.75%) - 0.365544914 seconds time elapsed + 2,007,920,858 cycles # 2.849 GHz + 2,840,933,430 instructions # 1.41 insn per cycle + 0.763422225 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/GPU) = 2.028815e+00 -Avg ME (F77/GPU) = 2.0288173652952537 -Relative difference = 1.1658506339321586e-06 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 2.028811e+00 +Avg ME (F77/GPU) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) 
-OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.996036e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.086137e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.086137e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 3.645410 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.200574e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.271569e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.271569e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 4.850875 sec INFO: No Floating Point Exceptions have been reported - 12,564,694,817 cycles:u # 3.440 GHz (75.03%) - 7,266,320 stalled-cycles-frontend:u # 0.06% frontend cycles idle (75.03%) - 1,267,318,331 stalled-cycles-backend:u # 10.09% backend cycles idle (75.03%) - 37,032,357,445 instructions:u # 2.95 insn per cycle - # 0.03 stalled cycles per insn (75.03%) - 3.656882243 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) + 14,073,569,281 cycles # 2.899 GHz + 38,343,239,881 instructions # 2.72 insn per cycle + 4.855897587 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198367925361 -Relative difference = 8.044452636897417e-08 +Avg ME (F77/C++) = 2.0288199022179469 +Relative difference = 4.819651478256564e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 6.141600e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.553169e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 6.553169e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 -TOTAL : 1.851793 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.925449e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.332953e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.332953e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 2.217076 sec INFO: No Floating Point Exceptions have been reported - 6,282,909,767 cycles:u # 3.379 GHz (75.06%) - 6,700,989 stalled-cycles-frontend:u # 0.11% frontend cycles idle (75.05%) - 2,244,994,692 stalled-cycles-backend:u # 35.73% backend cycles idle (75.05%) - 15,180,893,512 instructions:u # 2.42 insn per cycle - # 0.15 stalled cycles per insn (75.05%) - 1.863223677 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) + 6,436,588,824 cycles # 2.899 GHz + 15,815,821,412 instructions # 2.46 insn per cycle + 2.222049918 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198773050681 -Relative difference = 6.047600673895608e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288193548331037 +Relative difference = 1.748963824709674e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.211586e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
1.365404e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.365404e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.010286 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.963004e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.029520e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.029520e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.256665 sec INFO: No Floating Point Exceptions have been reported - 3,328,108,462 cycles:u # 3.269 GHz (74.87%) - 9,577,106 stalled-cycles-frontend:u # 0.29% frontend cycles idle (74.86%) - 1,110,990,485 stalled-cycles-backend:u # 33.38% backend cycles idle (74.85%) - 7,706,706,733 instructions:u # 2.32 insn per cycle - # 0.14 stalled cycles per insn (74.89%) - 1.021544489 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) + 3,455,760,948 cycles # 2.740 GHz + 7,593,976,565 instructions # 2.20 insn per cycle + 1.261861875 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288186294492334 -Relative difference = 1.826435805832187e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181684445590 +Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.569986e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.110539e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.110539e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.182427 sec +INFO: No Floating Point Exceptions have been reported + 3,244,770,474 cycles # 2.734 GHz + 7,203,559,407 instructions # 2.22 insn per cycle + 1.187623854 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181684445590 +Relative difference = 8.302595855806234e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.864494e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.605662e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.605662e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 
) GeV^0 +TOTAL : 1.614546 sec +INFO: No Floating Point Exceptions have been reported + 3,050,749,421 cycles # 1.885 GHz + 5,835,755,685 instructions # 1.91 insn per cycle + 1.619564037 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2368) (512y: 24) (512z: 1888) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288183349184692 +Relative difference = 1.6508058850146622e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt index 576ba54bb8..426db838d7 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt @@ -1,182 +1,231 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in 
BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-15_16:24:40 +DATE: 2024-05-16_15:01:19 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.446100e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.088052e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.088052e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.079682e+00 +- 3.408341e-03 ) GeV^0 -TOTAL : 1.145963 sec +EvtsPerSec[Rmb+ME] (23) = ( 6.801236e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.462846e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.462846e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 +TOTAL : 0.684862 sec INFO: No Floating Point Exceptions have been reported - 3,556,692,327 cycles:u # 3.028 GHz (75.04%) - 21,296,336 stalled-cycles-frontend:u # 0.60% frontend cycles idle (74.87%) - 1,141,762,983 stalled-cycles-backend:u # 32.10% backend cycles idle (74.70%) - 3,938,994,900 instructions:u # 1.11 insn per cycle - # 0.29 stalled cycles per insn (74.66%) - 1.199694980 seconds time elapsed + 2,586,573,508 cycles # 2.828 GHz + 4,016,406,941 instructions # 1.55 insn per cycle + 0.971565490 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/GPU) = 2.028815e+00 -Avg ME (F77/GPU) = 2.0288173652952537 -Relative difference = 1.1658506339321586e-06 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 2.028811e+00 +Avg ME (F77/GPU) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.986790e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.076717e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.076717e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 3.695930 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.176436e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.247449e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.247449e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 4.948763 sec INFO: No Floating Point Exceptions have been reported - 12,667,992,373 cycles:u # 3.418 GHz (75.01%) - 8,515,560 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.97%) - 1,269,480,757 stalled-cycles-backend:u # 10.02% backend cycles idle (74.97%) - 37,129,718,188 instructions:u # 2.93 insn per cycle - # 0.03 stalled cycles per insn (74.97%) - 3.710289134 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) + 14,176,104,430 cycles # 2.862 GHz + 38,383,843,895 instructions # 2.71 insn per cycle + 4.955194603 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198367925361 -Relative difference = 8.044452636897417e-08 +Avg ME (F77/C++) = 2.0288199022179469 +Relative difference = 4.819651478256564e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 6.111745e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.510250e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.510250e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 -TOTAL : 1.923322 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.809798e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.200764e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.200764e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 2.315849 sec INFO: No Floating Point Exceptions have been reported - 6,419,461,019 cycles:u # 3.319 GHz (74.79%) - 7,200,229 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.78%) - 2,253,441,771 stalled-cycles-backend:u # 35.10% backend cycles idle (74.99%) - 15,444,957,018 instructions:u # 2.41 insn per cycle - # 0.15 stalled cycles per insn (75.18%) - 1.937561802 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) + 6,633,418,276 cycles # 2.858 GHz + 16,095,968,093 instructions # 2.43 insn per cycle + 2.322298973 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe 
[ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198773050681 -Relative difference = 6.047600673895608e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288193548331037 +Relative difference = 1.748963824709674e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.207790e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.361965e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.361965e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.057264 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.679036e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.925640e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.925640e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.342355 sec INFO: No Floating Point Exceptions have been reported - 3,433,393,334 cycles:u # 3.215 GHz (74.64%) - 7,369,251 stalled-cycles-frontend:u # 0.21% frontend cycles idle (74.57%) - 1,103,961,653 stalled-cycles-backend:u # 32.15% backend cycles idle (74.94%) - 7,843,923,093 instructions:u # 2.28 insn per cycle - # 0.14 stalled cycles per insn (75.29%) - 1.071719104 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) + 3,640,592,514 cycles # 2.701 GHz + 7,831,268,120 instructions # 2.15 insn per cycle + 1.348786146 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ 
PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288186294492334 -Relative difference = 1.826435805832187e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181684445590 +Relative difference = 8.302595855806234e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +WARNING! 
Instantiate host Bridge (nevt=524288) +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.163700e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.056629e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.056629e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.278871 sec +INFO: No Floating Point Exceptions have been reported + 3,437,646,895 cycles # 2.676 GHz + 7,439,842,858 instructions # 2.16 insn per cycle + 1.285386542 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181684445590 +Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +WARNING! 
Instantiate host Bridge (nevt=524288) +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.597215e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.292791e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.292791e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.727252 sec +INFO: No Floating Point Exceptions have been reported + 3,258,697,081 cycles # 1.881 GHz + 6,089,840,836 instructions # 1.87 insn per cycle + 1.733818978 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2368) (512y: 24) (512z: 1888) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288183349184692 +Relative difference = 1.6508058850146622e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt index 062bed307f..884891874e 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt @@ -1,175 +1,218 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
-make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-15_16:31:46 +DATE: 2024-05-16_15:12:23 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 --common OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.572863e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.910914e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.070655e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.080169e+00 +- 3.463853e-03 ) GeV^0 -TOTAL : 1.010993 sec +EvtsPerSec[Rmb+ME] (23) = ( 9.468958e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.648278e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.971571e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.079446e+00 +- 
3.403306e-03 ) GeV^0 +TOTAL : 0.575797 sec INFO: No Floating Point Exceptions have been reported - 3,134,574,677 cycles:u # 3.018 GHz (74.53%) - 10,844,566 stalled-cycles-frontend:u # 0.35% frontend cycles idle (74.60%) - 1,149,701,699 stalled-cycles-backend:u # 36.68% backend cycles idle (74.86%) - 2,833,944,222 instructions:u # 0.90 insn per cycle - # 0.41 stalled cycles per insn (75.33%) - 1.064507635 seconds time elapsed + 2,271,357,910 cycles # 2.845 GHz + 3,342,640,625 instructions # 1.47 insn per cycle + 0.855647595 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common +==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/GPU) = 2.028815e+00 -Avg ME (F77/GPU) = 2.0288173652952537 -Relative difference = 1.1658506339321586e-06 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 2.028811e+00 +Avg ME (F77/GPU) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 
-EvtsPerSec[Rmb+ME] (23) = ( 2.994584e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.084810e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.084810e+05 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.198151e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.269622e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.269622e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 3.651032 sec +TOTAL : 4.914840 sec INFO: No Floating Point Exceptions have been reported - 12,626,581,938 cycles:u # 3.451 GHz (74.87%) - 8,210,222 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.89%) - 1,262,658,310 stalled-cycles-backend:u # 10.00% backend cycles idle (74.99%) - 36,980,093,862 instructions:u # 2.93 insn per cycle - # 0.03 stalled cycles per insn (75.08%) - 3.660540982 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) + 14,211,276,974 cycles # 2.889 GHz + 38,370,210,397 instructions # 2.70 insn per cycle + 4.920108721 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198367925361 -Relative difference = 8.044452636897417e-08 +Avg ME (F77/C++) = 2.0288199022179469 +Relative difference = 4.819651478256564e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 6.361901e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.796195e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.796195e+05 ) sec^-1 
-MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 -TOTAL : 1.796173 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.892733e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.301573e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.301573e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 2.288479 sec INFO: No Floating Point Exceptions have been reported - 6,122,999,769 cycles:u # 3.394 GHz (74.63%) - 6,731,474 stalled-cycles-frontend:u # 0.11% frontend cycles idle (75.05%) - 2,127,642,873 stalled-cycles-backend:u # 34.75% backend cycles idle (75.17%) - 15,142,149,977 instructions:u # 2.47 insn per cycle - # 0.14 stalled cycles per insn (75.17%) - 1.805751630 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) + 6,608,042,838 cycles # 2.882 GHz + 15,829,158,403 instructions # 2.40 insn per cycle + 2.293691008 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198773050681 -Relative difference = 6.047600673895608e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288193548331037 +Relative difference = 1.748963824709674e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.218260e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.373989e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) 
= ( 1.373989e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.009702 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.919042e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.023820e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.023820e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 +TOTAL : 1.319201 sec INFO: No Floating Point Exceptions have been reported - 3,305,189,551 cycles:u # 3.250 GHz (74.85%) - 6,923,299 stalled-cycles-frontend:u # 0.21% frontend cycles idle (74.84%) - 1,100,547,103 stalled-cycles-backend:u # 33.30% backend cycles idle (74.84%) - 7,685,395,377 instructions:u # 2.33 insn per cycle - # 0.14 stalled cycles per insn (74.88%) - 1.019465527 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) + 3,618,631,378 cycles # 2.734 GHz + 7,578,247,859 instructions # 2.09 insn per cycle + 1.324366743 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288186294492334 -Relative difference = 1.826435805832187e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181684445590 +Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.492699e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.100151e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.100151e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 +TOTAL : 1.250231 sec +INFO: No Floating Point Exceptions have been reported + 3,418,366,623 cycles # 2.724 GHz + 7,152,275,486 instructions # 2.09 insn per cycle + 1.255758340 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181684445590 +Relative difference = 8.302595855806234e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.830732e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.562097e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.562097e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079550e+00 +- 
3.404208e-03 ) GeV^0 +TOTAL : 1.681529 sec +INFO: No Floating Point Exceptions have been reported + 3,218,452,038 cycles # 1.909 GHz + 5,786,270,960 instructions # 1.80 insn per cycle + 1.686847993 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2368) (512y: 24) (512z: 1888) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288183349184692 +Relative difference = 1.6508058850146622e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt index 5dd9924337..9b5852a8c1 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt @@ -40,175 +40,175 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-15_09:27:22 +DATE: 2024-05-16_15:09:35 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 --curhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.457342e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.645322e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.969571e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.497286e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.653761e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.976765e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.520144 sec +TOTAL : 0.520499 sec INFO: No Floating Point Exceptions have been reported - 2,098,800,484 cycles # 2.818 GHz - 3,302,360,884 instructions # 1.57 insn per cycle - 0.802331353 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst + 2,122,949,824 cycles # 2.819 GHz + 3,308,605,661 instructions # 1.56 insn per cycle + 0.811337951 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst 
==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028811e+00 Avg ME (F77/GPU) = 2.0288499749731272 Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.183495e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.256672e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.256672e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.187282e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.258952e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.258952e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.891525 sec +TOTAL : 4.881435 sec INFO: No Floating Point Exceptions have been reported - 14,004,521,708 cycles # 2.861 GHz - 38,341,043,198 instructions # 2.74 insn per cycle - 4.896650903 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) + 13,993,887,356 cycles # 2.864 GHz + 38,340,879,445 instructions # 2.74 insn per cycle + 4.886765699 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288199022179469 Relative difference = 4.819651478256564e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.861114e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.259842e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.259842e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.866184e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.266559e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.266559e+05 ) sec^-1 
MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.245870 sec +TOTAL : 2.243695 sec INFO: No Floating Point Exceptions have been reported - 6,439,615,857 cycles # 2.862 GHz - 15,815,454,278 instructions # 2.46 insn per cycle - 2.251122058 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) + 6,437,628,216 cycles # 2.863 GHz + 15,815,570,005 instructions # 2.46 insn per cycle + 2.248941783 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193548331037 Relative difference = 1.748963824709674e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +========================================================================= 
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.808799e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.009394e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.009394e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.699018e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.949673e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.949673e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.278168 sec +TOTAL : 1.293092 sec INFO: No Floating Point Exceptions have been reported - 3,449,152,774 cycles # 2.689 GHz - 7,593,321,212 instructions # 2.20 insn per cycle - 1.283559283 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) + 3,447,035,685 cycles # 2.657 GHz + 7,594,377,345 instructions # 2.20 insn per cycle + 1.298317015 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181684445590 Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.428242e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.091809e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.091809e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.410196e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.089229e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.089229e+06 ) sec^-1 
MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.200122 sec +TOTAL : 1.202332 sec INFO: No Floating Point Exceptions have been reported - 3,241,815,360 cycles # 2.691 GHz - 7,203,446,508 instructions # 2.22 insn per cycle - 1.205490381 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) + 3,248,094,322 cycles # 2.691 GHz + 7,201,883,054 instructions # 2.22 insn per cycle + 1.207739630 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181684445590 Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +========================================================================= 
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.709652e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.425629e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.425629e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.682713e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.392370e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.392370e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.651222 sec +TOTAL : 1.657517 sec INFO: No Floating Point Exceptions have been reported - 3,052,193,399 cycles # 1.843 GHz - 5,835,644,896 instructions # 1.91 insn per cycle - 1.656436466 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2368) (512y: 24) (512z: 1888) + 3,060,341,406 cycles # 1.842 GHz + 5,836,262,166 instructions # 1.91 insn per cycle + 1.662864711 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2368) (512y: 24) (512z: 1888) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288183349184692 Relative difference = 1.6508058850146622e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt index 24ddfc0d54..7e3b1fa48e 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt @@ -1,176 +1,220 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
-make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-15_16:29:36 +DATE: 2024-05-16_15:06:52 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 --rmbhst OMP= +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.140243e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.867331e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.041352e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.079682e+00 +- 3.408341e-03 ) GeV^0 -TOTAL : 1.388830 sec +EvtsPerSec[Rmb+ME] (23) = ( 8.502594e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.623050e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.943883e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 +TOTAL : 0.628666 sec INFO: No Floating Point Exceptions have been reported - 3,543,456,227 cycles:u # 3.058 GHz (75.15%) - 22,706,543 stalled-cycles-frontend:u # 0.64% frontend cycles idle (75.07%) - 1,153,445,105 stalled-cycles-backend:u # 32.55% backend cycles idle (74.96%) - 3,723,010,563 instructions:u # 1.05 insn per cycle - # 0.31 stalled cycles per insn (75.42%) - 1.449467945 seconds time elapsed + 2,403,264,425 cycles # 2.820 GHz + 3,734,811,294 instructions # 1.55 insn per cycle + 0.909767197 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/GPU) = 2.028815e+00 -Avg ME (F77/GPU) = 2.0288173652952537 -Relative difference = 1.1658506339321586e-06 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 2.028811e+00 +Avg ME (F77/GPU) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.990535e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.080344e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.080344e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 3.652431 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.185686e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.257300e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.257300e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 4.884819 sec INFO: No Floating Point Exceptions have been reported - 12,642,355,107 cycles:u # 3.456 GHz (74.87%) - 8,841,825 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.89%) - 1,259,246,631 stalled-cycles-backend:u # 9.96% backend cycles idle (74.99%) - 36,993,375,891 instructions:u # 2.93 insn per cycle - # 0.03 stalled cycles per insn (75.07%) - 3.660280036 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) + 13,995,449,913 cycles # 2.863 GHz + 38,340,978,131 instructions # 2.74 insn per cycle + 4.889991891 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe 
/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198367925361 -Relative difference = 8.044452636897417e-08 +Avg ME (F77/C++) = 2.0288199022179469 +Relative difference = 4.819651478256564e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 6.359128e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.802679e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.802679e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 -TOTAL : 1.793583 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.864053e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.263128e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.263128e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 2.244540 sec INFO: No Floating Point Exceptions have been reported - 6,111,767,563 cycles:u # 3.396 GHz (74.81%) - 6,718,217 stalled-cycles-frontend:u # 0.11% frontend cycles idle (75.02%) - 2,130,349,767 stalled-cycles-backend:u # 34.86% backend cycles idle (75.11%) - 15,164,888,939 instructions:u # 2.48 insn per cycle - # 0.14 stalled cycles per insn (75.11%) - 1.801108158 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) + 6,436,419,349 cycles # 2.862 GHz + 15,815,556,279 instructions # 2.46 insn per cycle + 2.249779623 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198773050681 -Relative difference = 6.047600673895608e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288193548331037 +Relative difference = 1.748963824709674e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.221926e+06 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 1.379276e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.379276e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.004314 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.799961e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.008748e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.008748e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.279634 sec INFO: No Floating Point Exceptions have been reported - 3,308,642,232 cycles:u # 3.272 GHz (74.70%) - 7,145,983 stalled-cycles-frontend:u # 0.22% frontend cycles idle (74.68%) - 1,106,626,180 stalled-cycles-backend:u # 33.45% backend cycles idle (74.71%) - 7,715,445,054 instructions:u # 2.33 insn per cycle - # 0.14 stalled cycles per insn (74.87%) - 1.012317854 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) + 3,447,592,643 cycles # 2.685 GHz + 7,593,708,789 instructions # 2.20 insn per cycle + 1.284877623 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288186294492334 -Relative difference = 1.826435805832187e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181684445590 +Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.434984e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.092289e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.092289e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.198937 sec +INFO: No Floating Point Exceptions have been reported + 3,242,375,801 cycles # 2.694 GHz + 7,202,509,960 instructions # 2.22 insn per cycle + 1.204245270 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181684445590 +Relative difference = 8.302595855806234e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.713311e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.432943e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.432943e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 
3.414231e-03 ) GeV^0 +TOTAL : 1.650810 sec +INFO: No Floating Point Exceptions have been reported + 3,050,285,995 cycles # 1.842 GHz + 5,834,789,164 instructions # 1.91 insn per cycle + 1.656446986 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2368) (512y: 24) (512z: 1888) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288183349184692 +Relative difference = 1.6508058850146622e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt index 6a76e7f32a..3e123e6fd7 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt @@ -1,175 +1,218 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 
for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-15_15:55:51 +DATE: 2024-05-16_14:36:11 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.030559e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.087069e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.266399e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.078079e+00 +- 3.394933e-03 ) GeV^0 -TOTAL : 0.315447 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.096553e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.763289e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.037690e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 
+TOTAL : 0.480283 sec INFO: No Floating Point Exceptions have been reported - 781,515,701 cycles:u # 2.305 GHz (74.21%) - 2,282,387 stalled-cycles-frontend:u # 0.29% frontend cycles idle (74.66%) - 4,790,131 stalled-cycles-backend:u # 0.61% backend cycles idle (75.47%) - 1,375,233,761 instructions:u # 1.76 insn per cycle - # 0.00 stalled cycles per insn (74.46%) - 0.365017431 seconds time elapsed + 2,036,711,218 cycles # 2.852 GHz + 2,918,453,967 instructions # 1.43 insn per cycle + 0.771336406 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/GPU) = 2.028815e+00 -Avg ME (F77/GPU) = 2.0288173652952537 -Relative difference = 1.1658506339321586e-06 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 2.028811e+00 +Avg ME (F77/GPU) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) 
-OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.980055e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.069168e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.069168e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 3.663191 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.166079e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.236793e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.236793e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 4.926463 sec INFO: No Floating Point Exceptions have been reported - 12,652,429,315 cycles:u # 3.447 GHz (74.99%) - 7,340,081 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.94%) - 8,136,328 stalled-cycles-backend:u # 0.06% backend cycles idle (74.94%) - 37,447,429,884 instructions:u # 2.96 insn per cycle - # 0.00 stalled cycles per insn (74.95%) - 3.674903820 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 503) (avx2: 0) (512y: 0) (512z: 0) + 14,320,299,267 cycles # 2.905 GHz + 39,836,243,439 instructions # 2.78 insn per cycle + 4.931482509 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 570) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198367925361 -Relative difference = 8.044452636897417e-08 +Avg ME (F77/C++) = 2.0288199028000236 +Relative difference = 4.790961076489297e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 7.327850e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.919361e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 7.919361e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 -TOTAL : 1.574158 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.723514e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.285593e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.285593e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 1.919156 sec INFO: No Floating Point Exceptions have been reported - 5,334,856,873 cycles:u # 3.373 GHz (74.76%) - 6,903,179 stalled-cycles-frontend:u # 0.13% frontend cycles idle (74.73%) - 1,392,837,612 stalled-cycles-backend:u # 26.11% backend cycles idle (74.91%) - 15,200,021,814 instructions:u # 2.85 insn per cycle - # 0.09 stalled cycles per insn (75.15%) - 1.585770741 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2330) (avx2: 0) (512y: 0) (512z: 0) + 5,582,245,803 cycles # 2.902 GHz + 15,285,424,302 instructions # 2.74 insn per cycle + 1.924109376 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2474) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198773050681 -Relative difference = 6.047600673895608e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288193548331037 +Relative difference = 1.748963824709674e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 8.856882e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
9.651769e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.651769e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.328611 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.349024e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.991002e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.991002e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.737963 sec INFO: No Floating Point Exceptions have been reported - 4,447,476,075 cycles:u # 3.328 GHz (74.87%) - 8,419,190 stalled-cycles-frontend:u # 0.19% frontend cycles idle (74.86%) - 1,668,128,471 stalled-cycles-backend:u # 37.51% backend cycles idle (74.86%) - 9,858,565,170 instructions:u # 2.22 insn per cycle - # 0.17 stalled cycles per insn (74.88%) - 1.340175033 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3750) (512y: 0) (512z: 0) + 4,749,494,972 cycles # 2.726 GHz + 9,735,095,064 instructions # 2.05 insn per cycle + 1.742978161 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3708) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288186428369954 -Relative difference = 1.7604478492421832e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288182108197361 +Relative difference = 1.0391259163456515e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.536931e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.219273e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.219273e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.690263 sec +INFO: No Floating Point Exceptions have been reported + 4,623,322,631 cycles # 2.728 GHz + 9,325,575,279 instructions # 2.02 insn per cycle + 1.695318457 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 3496) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288182108197361 +Relative difference = 1.0391259163456515e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.572579e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.052133e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.052133e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 
) GeV^0 +TOTAL : 1.968034 sec +INFO: No Floating Point Exceptions have been reported + 3,660,831,684 cycles # 1.856 GHz + 7,034,974,988 instructions # 1.92 insn per cycle + 1.973212700 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2610) (512y: 12) (512z: 2220) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288183459779248 +Relative difference = 1.7053177021099307e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt index b333e48464..c7eded0fc2 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt @@ -1,175 +1,218 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for 
tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-15_16:13:19 +DATE: 2024-05-16_14:52:42 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.852829e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.914888e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.075023e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.078079e+00 +- 3.394933e-03 ) GeV^0 -TOTAL : 0.315754 sec +EvtsPerSec[Rmb+ME] (23) = ( 9.456356e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.657836e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.983561e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 
+TOTAL : 0.487671 sec INFO: No Floating Point Exceptions have been reported - 791,616,602 cycles:u # 2.344 GHz (74.09%) - 2,195,721 stalled-cycles-frontend:u # 0.28% frontend cycles idle (74.16%) - 4,345,045 stalled-cycles-backend:u # 0.55% backend cycles idle (73.35%) - 1,344,792,490 instructions:u # 1.70 insn per cycle - # 0.00 stalled cycles per insn (74.12%) - 0.372298758 seconds time elapsed + 2,030,099,363 cycles # 2.844 GHz + 2,856,891,631 instructions # 1.41 insn per cycle + 0.771313393 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/GPU) = 2.028815e+00 -Avg ME (F77/GPU) = 2.0288173652952537 -Relative difference = 1.1658506339321586e-06 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 2.028811e+00 +Avg ME (F77/GPU) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) 
-OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 3.218298e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.322486e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.322486e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 3.403136 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.397227e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.481743e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.481743e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 4.460576 sec INFO: No Floating Point Exceptions have been reported - 11,740,679,609 cycles:u # 3.442 GHz (74.91%) - 7,350,250 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.80%) - 10,967,997 stalled-cycles-backend:u # 0.09% backend cycles idle (74.88%) - 34,167,335,720 instructions:u # 2.91 insn per cycle - # 0.00 stalled cycles per insn (75.11%) - 3.414881992 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 768) (avx2: 0) (512y: 0) (512z: 0) + 12,588,647,411 cycles # 2.819 GHz + 34,372,288,545 instructions # 2.73 insn per cycle + 4.465853868 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 696) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199088536203 -Relative difference = 4.4925808981097166e-08 +Avg ME (F77/C++) = 2.0288199094356969 +Relative difference = 4.463890496342449e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 7.316927e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.900595e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 7.900595e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 -TOTAL : 1.577815 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.225217e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.687950e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.687950e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 2.095644 sec INFO: No Floating Point Exceptions have been reported - 5,328,150,452 cycles:u # 3.361 GHz (74.84%) - 6,566,781 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.77%) - 2,195,911,479 stalled-cycles-backend:u # 41.21% backend cycles idle (74.79%) - 14,603,452,114 instructions:u # 2.74 insn per cycle - # 0.15 stalled cycles per insn (74.91%) - 1.588843890 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2947) (avx2: 0) (512y: 0) (512z: 0) + 6,085,238,066 cycles # 2.897 GHz + 14,860,574,019 instructions # 2.44 insn per cycle + 2.101017455 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3009) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198769558221 -Relative difference = 6.06481491495597e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288193803280592 +Relative difference = 1.8746278463897685e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 9.452010e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
1.036286e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.036286e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.253909 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.969640e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.750011e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.750011e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.592133 sec INFO: No Floating Point Exceptions have been reported - 4,221,700,211 cycles:u # 3.347 GHz (74.54%) - 7,836,775 stalled-cycles-frontend:u # 0.19% frontend cycles idle (74.66%) - 1,638,737,482 stalled-cycles-backend:u # 38.82% backend cycles idle (74.93%) - 9,052,929,189 instructions:u # 2.14 insn per cycle - # 0.18 stalled cycles per insn (75.25%) - 1.265197675 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4501) (512y: 0) (512z: 0) + 4,316,607,801 cycles # 2.703 GHz + 9,028,975,402 instructions # 2.09 insn per cycle + 1.597664902 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4443) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288186752004549 -Relative difference = 1.6009291367898262e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181999931112 +Relative difference = 9.857617164523888e-08 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.187100e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.023996e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.023996e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.545422 sec +INFO: No Floating Point Exceptions have been reported + 4,204,195,380 cycles # 2.712 GHz + 8,663,569,400 instructions # 2.06 insn per cycle + 1.550927334 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 4243) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181999931112 +Relative difference = 9.857617164523888e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.251438e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.680453e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.680453e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 
) GeV^0 +TOTAL : 2.083936 sec +INFO: No Floating Point Exceptions have been reported + 3,833,998,104 cycles # 1.836 GHz + 7,808,361,622 instructions # 2.04 insn per cycle + 2.089489123 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4424) (512y: 0) (512z: 2555) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288183246739209 +Relative difference = 1.6003107281264138e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt index 2b639f2cd8..aad34f68a4 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt @@ -1,175 +1,218 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for 
tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-15_16:13:30 +DATE: 2024-05-16_14:53:02 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.951513e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.082031e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.262905e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.078079e+00 +- 3.394933e-03 ) GeV^0 -TOTAL : 0.315292 sec +EvtsPerSec[Rmb+ME] (23) = ( 9.520611e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.721194e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.056652e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 
+TOTAL : 0.485221 sec INFO: No Floating Point Exceptions have been reported - 799,120,568 cycles:u # 2.363 GHz (73.58%) - 2,236,588 stalled-cycles-frontend:u # 0.28% frontend cycles idle (73.68%) - 4,188,322 stalled-cycles-backend:u # 0.52% backend cycles idle (73.82%) - 1,326,780,634 instructions:u # 1.66 insn per cycle - # 0.00 stalled cycles per insn (75.86%) - 0.364696263 seconds time elapsed + 2,023,639,378 cycles # 2.841 GHz + 2,891,046,466 instructions # 1.43 insn per cycle + 0.769493206 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/GPU) = 2.028815e+00 -Avg ME (F77/GPU) = 2.0288173652952537 -Relative difference = 1.1658506339321586e-06 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 2.028811e+00 +Avg ME (F77/GPU) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) 
-OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 3.488171e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.610948e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.610948e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 3.150894 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.614708e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.719370e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.719370e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 4.097789 sec INFO: No Floating Point Exceptions have been reported - 10,852,739,826 cycles:u # 3.436 GHz (74.93%) - 7,674,451 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.92%) - 14,412,989 stalled-cycles-backend:u # 0.13% backend cycles idle (74.93%) - 35,477,674,159 instructions:u # 3.27 insn per cycle - # 0.00 stalled cycles per insn (74.94%) - 3.162235761 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 469) (avx2: 0) (512y: 0) (512z: 0) + 11,755,034,517 cycles # 2.866 GHz + 35,108,588,793 instructions # 2.99 insn per cycle + 4.103114971 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 470) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199088536203 -Relative difference = 4.4925808981097166e-08 +Avg ME (F77/C++) = 2.0288199094356969 +Relative difference = 4.463890496342449e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 7.908113e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.598784e+05 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 8.598784e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 -TOTAL : 1.469351 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.332294e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.809853e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.809853e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 2.053683 sec INFO: No Floating Point Exceptions have been reported - 4,952,050,706 cycles:u # 3.353 GHz (74.99%) - 7,312,668 stalled-cycles-frontend:u # 0.15% frontend cycles idle (75.08%) - 1,212,184,247 stalled-cycles-backend:u # 24.48% backend cycles idle (75.08%) - 14,017,353,219 instructions:u # 2.83 insn per cycle - # 0.09 stalled cycles per insn (75.08%) - 1.480455473 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2487) (avx2: 0) (512y: 0) (512z: 0) + 5,951,415,517 cycles # 2.891 GHz + 14,470,123,335 instructions # 2.43 insn per cycle + 2.059025817 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2572) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198892958462 -Relative difference = 5.4565783974899003e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288193583255634 +Relative difference = 1.7661780742548925e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 9.907717e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
1.091306e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.091306e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.202600 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.326940e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.191185e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.191185e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.518155 sec INFO: No Floating Point Exceptions have been reported - 4,006,809,715 cycles:u # 3.311 GHz (74.88%) - 7,948,136 stalled-cycles-frontend:u # 0.20% frontend cycles idle (74.88%) - 1,400,349,482 stalled-cycles-backend:u # 34.95% backend cycles idle (74.90%) - 8,634,852,093 instructions:u # 2.16 insn per cycle - # 0.16 stalled cycles per insn (74.90%) - 1.213561260 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3422) (512y: 0) (512z: 0) + 4,152,217,913 cycles # 2.727 GHz + 8,874,854,960 instructions # 2.14 insn per cycle + 1.523530355 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3574) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288186836987734 -Relative difference = 1.559041129563128e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288182107033208 +Relative difference = 1.0385521077446488e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.326335e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.192412e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.192412e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.518142 sec +INFO: No Floating Point Exceptions have been reported + 4,138,145,120 cycles # 2.717 GHz + 8,411,511,000 instructions # 2.03 insn per cycle + 1.523559219 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 3319) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288182107033208 +Relative difference = 1.0385521077446488e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.337364e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.777859e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.777859e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 
) GeV^0 +TOTAL : 2.053123 sec +INFO: No Floating Point Exceptions have been reported + 3,784,038,038 cycles # 1.840 GHz + 7,702,433,783 instructions # 2.04 insn per cycle + 2.058532499 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3440) (512y: 0) (512z: 2107) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288183204829693 +Relative difference = 1.5796536184903122e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index 4ba5b34172..ff88d5da2d 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,175 +1,218 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for 
tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-15_15:56:02 +DATE: 2024-05-16_14:36:32 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.872613e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.017576e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.073456e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 -TOTAL : 0.367222 sec +EvtsPerSec[Rmb+ME] (23) = ( 5.198792e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.180605e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.275668e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 
+TOTAL : 0.521467 sec INFO: No Floating Point Exceptions have been reported - 881,192,883 cycles:u # 2.237 GHz (75.41%) - 2,223,709 stalled-cycles-frontend:u # 0.25% frontend cycles idle (75.69%) - 5,425,251 stalled-cycles-backend:u # 0.62% backend cycles idle (75.82%) - 1,433,173,477 instructions:u # 1.63 insn per cycle - # 0.00 stalled cycles per insn (75.82%) - 0.420951978 seconds time elapsed + 2,143,649,339 cycles # 2.843 GHz + 3,098,162,725 instructions # 1.45 insn per cycle + 0.810608393 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063423243869 -Relative difference = 3.241686434838304e-07 +Avg ME (F77/GPU) = 2.0288063423243874 +Relative difference = 3.241686432649386e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 
-EvtsPerSec[Rmb+ME] (23) = ( 2.471506e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.533081e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.533081e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.422100 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.033714e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.092456e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.092456e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 5.262850 sec INFO: No Floating Point Exceptions have been reported - 15,216,966,577 cycles:u # 3.432 GHz (74.97%) - 8,827,525 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.93%) - 878,631,864 stalled-cycles-backend:u # 5.77% backend cycles idle (74.92%) - 39,297,637,783 instructions:u # 2.58 insn per cycle - # 0.02 stalled cycles per insn (74.92%) - 4.437266712 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 740) (avx2: 0) (512y: 0) (512z: 0) + 15,278,986,093 cycles # 2.901 GHz + 38,575,389,182 instructions # 2.52 insn per cycle + 5.268064562 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 672) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 4.487635e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.709666e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.709666e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 
3.402993e-03 ) GeV^0 -TOTAL : 2.516289 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.527314e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.723139e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.723139e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.080390 sec INFO: No Floating Point Exceptions have been reported - 8,519,765,655 cycles:u # 3.371 GHz (75.03%) - 8,772,954 stalled-cycles-frontend:u # 0.10% frontend cycles idle (75.00%) - 1,785,867,944 stalled-cycles-backend:u # 20.96% backend cycles idle (75.00%) - 24,078,135,749 instructions:u # 2.83 insn per cycle - # 0.07 stalled cycles per insn (75.00%) - 2.531160319 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2102) (avx2: 0) (512y: 0) (512z: 0) + 8,961,614,258 cycles # 2.906 GHz + 24,226,315,758 instructions # 2.70 insn per cycle + 3.085434765 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2188) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 7.837288e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.458636e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.458636e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 
3.402993e-03 ) GeV^0 -TOTAL : 1.518184 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.613394e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.100134e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.100134e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.976346 sec +INFO: No Floating Point Exceptions have been reported + 5,394,338,439 cycles # 2.724 GHz + 11,277,527,499 instructions # 2.09 insn per cycle + 1.981499886 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2480) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 
6.276948e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.897611e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.897611e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.778784 sec INFO: No Floating Point Exceptions have been reported - 5,021,965,868 cycles:u # 3.283 GHz (74.90%) - 9,276,712 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.90%) - 1,434,668,332 stalled-cycles-backend:u # 28.57% backend cycles idle (74.92%) - 11,433,674,990 instructions:u # 2.28 insn per cycle - # 0.13 stalled cycles per insn (74.92%) - 1.533115849 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2467) (512y: 0) (512z: 0) + 4,855,499,941 cycles # 2.723 GHz + 10,526,571,188 instructions # 2.17 insn per cycle + 1.784170390 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2167) (512y: 148) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063930599014 -Relative difference = 2.9916108265801754e-07 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.815864e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.036087e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.036087e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.856419 sec +INFO: No Floating Point Exceptions have been reported + 5,199,981,370 cycles # 1.818 GHz + 7,603,665,117 instructions # 1.46 insn per cycle + 2.861804972 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1633) 
(512y: 126) (512z: 1608) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt index 7f1debf5d0..1d76304278 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt @@ -1,175 +1,218 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand 
(USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-15_15:56:16 +DATE: 2024-05-16_14:36:55 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.630658e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.921972e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.978316e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 -TOTAL : 0.490325 sec +EvtsPerSec[Rmb+ME] (23) = ( 5.208651e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.184994e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.280716e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 
+TOTAL : 0.522551 sec INFO: No Floating Point Exceptions have been reported - 925,231,414 cycles:u # 2.333 GHz (73.12%) - 2,199,272 stalled-cycles-frontend:u # 0.24% frontend cycles idle (73.79%) - 6,955,777 stalled-cycles-backend:u # 0.75% backend cycles idle (75.58%) - 1,431,076,098 instructions:u # 1.55 insn per cycle - # 0.00 stalled cycles per insn (76.45%) - 0.587960415 seconds time elapsed + 2,145,230,616 cycles # 2.840 GHz + 3,093,123,772 instructions # 1.44 insn per cycle + 0.812278354 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063423243869 -Relative difference = 3.241686434838304e-07 +Avg ME (F77/GPU) = 2.0288063423243874 +Relative difference = 3.241686432649386e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 
-EvtsPerSec[Rmb+ME] (23) = ( 2.443865e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.503935e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.503935e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.471550 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.021911e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.079930e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.079930e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 5.294031 sec INFO: No Floating Point Exceptions have been reported - 15,422,902,750 cycles:u # 3.440 GHz (74.93%) - 9,272,160 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.97%) - 14,558,934 stalled-cycles-backend:u # 0.09% backend cycles idle (75.02%) - 40,045,851,231 instructions:u # 2.60 insn per cycle - # 0.00 stalled cycles per insn (75.02%) - 4.516805360 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 630) (avx2: 0) (512y: 0) (512z: 0) + 15,341,153,400 cycles # 2.896 GHz + 40,370,282,827 instructions # 2.63 insn per cycle + 5.299425936 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 4.546865e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.775676e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.775676e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 
3.402993e-03 ) GeV^0 -TOTAL : 2.484891 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.710012e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.926494e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.926494e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.935201 sec INFO: No Floating Point Exceptions have been reported - 8,423,422,341 cycles:u # 3.374 GHz (75.02%) - 9,885,842 stalled-cycles-frontend:u # 0.12% frontend cycles idle (75.01%) - 487,638,502 stalled-cycles-backend:u # 5.79% backend cycles idle (75.01%) - 23,470,310,317 instructions:u # 2.79 insn per cycle - # 0.02 stalled cycles per insn (75.01%) - 2.500153281 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1993) (avx2: 0) (512y: 0) (512z: 0) + 8,515,314,447 cycles # 2.897 GHz + 23,253,613,819 instructions # 2.73 insn per cycle + 2.940392108 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2091) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 6.856680e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.326975e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.326975e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 
3.402993e-03 ) GeV^0 -TOTAL : 1.710010 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.780066e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.132607e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.132607e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.302191 sec +INFO: No Floating Point Exceptions have been reported + 6,262,262,467 cycles # 2.715 GHz + 12,962,490,062 instructions # 2.07 insn per cycle + 2.307689771 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2669) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 
5.109643e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.511847e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.511847e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.160493 sec INFO: No Floating Point Exceptions have been reported - 5,696,119,318 cycles:u # 3.308 GHz (74.96%) - 10,526,837 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.91%) - 754,967,475 stalled-cycles-backend:u # 13.25% backend cycles idle (74.69%) - 13,094,009,159 instructions:u # 2.30 insn per cycle - # 0.06 stalled cycles per insn (74.70%) - 1.725421092 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2711) (512y: 0) (512z: 0) + 5,903,466,716 cycles # 2.727 GHz + 12,238,680,442 instructions # 2.07 insn per cycle + 2.165768560 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2209) (512y: 296) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063930599014 -Relative difference = 2.9916108265801754e-07 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.507940e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.694154e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.694154e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.097205 sec +INFO: No Floating Point Exceptions have been reported + 5,614,268,818 cycles # 1.810 GHz + 8,744,074,840 instructions # 1.56 insn per cycle + 3.102417520 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1490) 
(512y: 183) (512z: 1909) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index 567bc7e65f..1d7490861d 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -1,193 +1,233 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for 
tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-05-15_15:56:30 +DATE: 2024-05-16_14:37:19 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.884650e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.028375e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.038050e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.872208e+03 +- 2.725298e+03 ) GeV^-2 -TOTAL : 0.405630 sec +EvtsPerSec[Rmb+ME] (23) = ( 8.992211e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.047041e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.061161e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 
) GeV^-2 +TOTAL : 0.469322 sec INFO: No Floating Point Exceptions have been reported - 1,063,751,778 cycles:u # 2.576 GHz (74.35%) - 2,189,281 stalled-cycles-frontend:u # 0.21% frontend cycles idle (74.68%) - 6,058,350 stalled-cycles-backend:u # 0.57% backend cycles idle (74.88%) - 1,515,047,204 instructions:u # 1.42 insn per cycle - # 0.00 stalled cycles per insn (75.39%) - 0.452449879 seconds time elapsed + 1,970,950,644 cycles # 2.853 GHz + 2,836,233,202 instructions # 1.44 insn per cycle + 0.747868437 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.620723e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.845137e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.850867e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.805651e+03 +- 1.746055e+03 ) GeV^-2 -TOTAL : 0.627982 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.129686e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = 
( 1.329949e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.341716e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 +TOTAL : 0.607217 sec INFO: No Floating Point Exceptions have been reported - 1,715,002,123 cycles:u # 2.678 GHz (74.90%) - 2,164,287 stalled-cycles-frontend:u # 0.13% frontend cycles idle (75.04%) - 6,185,236 stalled-cycles-backend:u # 0.36% backend cycles idle (75.23%) - 2,009,201,156 instructions:u # 1.17 insn per cycle - # 0.00 stalled cycles per insn (75.18%) - 0.679343751 seconds time elapsed + 2,397,125,482 cycles # 2.825 GHz + 3,658,262,516 instructions # 1.53 insn per cycle + 0.909559944 seconds time elapsed ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.413122e+00 -Avg ME (F77/GPU) = 1.4131213684418642 -Relative difference = 4.4692399933517674e-07 +Avg ME (F77/GPU) = 1.4131213684418649 +Relative difference = 4.469239988637851e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc 
--all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.946588e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.959084e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.959084e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 5.588844 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.379379e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.391311e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.391311e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 6.910347 sec INFO: No Floating Point Exceptions have been reported - 19,579,106,412 cycles:u # 3.501 GHz (74.97%) - 2,543,884 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.97%) - 3,124,241,892 stalled-cycles-backend:u # 15.96% backend cycles idle (74.97%) - 57,877,052,779 instructions:u # 2.96 insn per cycle - # 0.05 stalled cycles per insn (75.00%) - 5.602333036 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1134) (avx2: 0) (512y: 0) (512z: 0) + 19,789,020,586 cycles # 2.863 GHz + 59,609,829,111 instructions # 3.01 insn per cycle + 6.914699001 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1466) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684432431 -Relative difference = 4.4692302355460254e-07 +Avg ME (F77/C++) = 1.4131213684432433 +Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 6.009169e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.060205e+04 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 6.060205e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 2.754225 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.619966e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.665049e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.665049e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 3.569460 sec INFO: No Floating Point Exceptions have been reported - 9,651,465,356 cycles:u # 3.499 GHz (74.96%) - 2,367,394 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.06%) - 2,659,536,123 stalled-cycles-backend:u # 27.56% backend cycles idle (75.06%) - 29,855,463,034 instructions:u # 3.09 insn per cycle - # 0.09 stalled cycles per insn (75.06%) - 2.791405308 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4742) (avx2: 0) (512y: 0) (512z: 0) + 10,374,266,250 cycles # 2.904 GHz + 30,674,256,165 instructions # 2.96 insn per cycle + 3.573646642 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5153) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.234641e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.256326e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.256326e+05 ) sec^-1 -MeanMatrixElemValue = ( 
4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.354300 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.120184e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.293257e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.293257e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.820051 sec +INFO: No Floating Point Exceptions have been reported + 4,901,380,147 cycles # 2.688 GHz + 11,019,047,598 instructions # 2.25 insn per cycle + 1.824311195 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4467) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213684416484 +Relative difference = 4.469241520660492e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 
+EvtsPerSec[Rmb+ME] (23) = ( 1.028182e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.049956e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.049956e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.616748 sec INFO: No Floating Point Exceptions have been reported - 4,735,250,703 cycles:u # 3.487 GHz (74.72%) - 2,168,552 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.67%) - 1,564,370,787 stalled-cycles-backend:u # 33.04% backend cycles idle (74.88%) - 11,163,033,039 instructions:u # 2.36 insn per cycle - # 0.14 stalled cycles per insn (75.18%) - 1.361230827 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4396) (512y: 0) (512z: 0) + 4,378,615,331 cycles # 2.702 GHz + 10,296,117,856 instructions # 2.35 insn per cycle + 1.621129053 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4137) (512y: 91) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.954224e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.056280e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.056280e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 2.381742 sec +INFO: No Floating Point Exceptions have been reported + 4,108,596,097 cycles # 1.723 GHz + 5,842,404,115 instructions # 1.42 insn per cycle + 2.385936782 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1540) (512y: 95) (512z: 3466) 
------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213684416484 +Relative difference = 4.469241520660492e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt index d11df66a82..45a1ef164b 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt @@ -1,204 +1,250 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for 
tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-05-15_16:24:53 +DATE: 2024-05-16_15:01:40 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.482644e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.002478e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.002478e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 0.562760 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.535443e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.780857e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.780857e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 0.503670 sec INFO: No Floating Point Exceptions have been reported - 1,686,186,552 cycles:u # 2.877 GHz (73.70%) - 6,513,601 stalled-cycles-frontend:u # 0.39% frontend cycles idle (74.41%) - 260,016,996 stalled-cycles-backend:u # 15.42% backend cycles idle (75.49%) - 2,042,535,938 instructions:u # 1.21 insn per cycle - # 0.13 stalled cycles per insn (75.44%) - 0.609090374 seconds time elapsed + 2,012,376,201 cycles # 2.812 GHz + 3,006,218,540 instructions # 1.49 insn per cycle + 0.774572160 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.201563e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.677497e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.677497e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.948724e+03 +- 1.840727e+03 ) GeV^-2 -TOTAL : 1.249440 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.606024e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.624765e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.624765e+06 ) sec^-1 +MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 +TOTAL : 0.841754 sec INFO: No Floating Point Exceptions have been reported - 3,843,824,840 cycles:u # 2.991 GHz (75.13%) - 16,951,489 stalled-cycles-frontend:u # 0.44% frontend cycles idle (75.11%) - 848,341,874 stalled-cycles-backend:u # 22.07% backend cycles idle (75.11%) - 3,944,599,588 instructions:u # 1.03 insn per cycle - # 0.22 stalled cycles per insn (75.18%) - 1.305562838 seconds time elapsed + 3,099,668,806 cycles # 2.832 GHz + 4,993,276,525 instructions # 1.61 insn per cycle + 1.155254157 seconds time elapsed ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.413122e+00 -Avg ME (F77/GPU) = 1.4131213684418642 -Relative difference = 4.4692399933517674e-07 +Avg ME (F77/GPU) = 1.4131213684418649 +Relative difference = 4.469239988637851e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.964088e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.976611e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.976611e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 5.559066 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.380068e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.392068e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.392068e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 6.915910 sec INFO: No Floating Point Exceptions have been reported - 19,461,797,230 cycles:u # 3.499 GHz (74.98%) - 2,818,968 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.98%) - 3,127,759,396 stalled-cycles-backend:u # 16.07% backend cycles idle (74.98%) - 57,912,281,008 instructions:u # 2.98 insn per cycle - # 0.05 stalled cycles per insn (74.96%) - 5.566514489 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1134) (avx2: 0) (512y: 0) (512z: 0) + 19,806,579,322 cycles # 2.863 GHz + 59,611,012,266 instructions # 3.01 insn per cycle + 6.920308116 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1466) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest_cpp.exe [ 
PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684432431 -Relative difference = 4.4692302355460254e-07 +Avg ME (F77/C++) = 1.4131213684432433 +Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 5.999739e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.050012e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.050012e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 2.762614 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.550339e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.594733e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.594733e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 3.631431 sec INFO: No Floating Point Exceptions have been reported - 9,669,003,072 cycles:u # 3.495 GHz (74.87%) - 2,418,973 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.84%) - 2,494,759,725 stalled-cycles-backend:u # 25.80% backend cycles idle (74.92%) - 29,924,636,825 instructions:u # 3.09 insn per cycle - # 0.08 stalled cycles per insn (75.06%) - 2.770246519 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4742) (avx2: 0) (512y: 0) (512z: 0) + 10,404,134,292 cycles # 2.862 GHz + 30,722,305,980 instructions # 2.95 insn per cycle + 3.635916319 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5153) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe +runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.232362e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.253478e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.253478e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.361317 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.991824e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.166141e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.166141e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.853778 sec INFO: No Floating Point Exceptions have been reported - 4,748,695,704 cycles:u # 3.479 GHz (74.81%) - 2,140,156 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.80%) - 1,563,158,568 stalled-cycles-backend:u # 32.92% backend cycles idle (74.80%) - 11,241,987,888 instructions:u # 2.37 insn per cycle - # 0.14 stalled cycles per insn (74.97%) - 1.368590983 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4396) (512y: 0) (512z: 0) + 4,943,570,309 cycles # 2.661 GHz + 11,067,752,215 instructions # 2.24 insn per cycle + 1.858370590 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4467) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe +runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +WARNING! 
Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.005140e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.026682e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.026682e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.662867 sec +INFO: No Floating Point Exceptions have been reported + 4,426,260,539 cycles # 2.656 GHz + 10,346,882,831 instructions # 2.34 insn per cycle + 1.667431238 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4137) (512y: 91) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213684416484 +Relative difference = 4.469241520660492e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +WARNING! 
Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.832038e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.932754e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.932754e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 2.431611 sec +INFO: No Floating Point Exceptions have been reported + 4,145,808,516 cycles # 1.702 GHz + 5,880,428,508 instructions # 1.42 insn per cycle + 2.436095886 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1540) (512y: 95) (512z: 3466) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213684416484 +Relative difference = 4.469241520660492e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt index 73d3d969fb..c8d4c1d012 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt @@ -1,193 +1,233 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-05-15_15:56:45 +DATE: 2024-05-16_14:37:45 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.879539e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.012020e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.018976e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.872208e+03 +- 2.725298e+03 ) GeV^-2 -TOTAL : 0.387250 sec +EvtsPerSec[Rmb+ME] (23) = ( 8.984938e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.044546e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.056865e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 
) GeV^-2 +TOTAL : 0.468609 sec INFO: No Floating Point Exceptions have been reported - 1,077,578,787 cycles:u # 2.612 GHz (74.03%) - 2,237,257 stalled-cycles-frontend:u # 0.21% frontend cycles idle (74.68%) - 5,834,748 stalled-cycles-backend:u # 0.54% backend cycles idle (76.42%) - 1,512,857,332 instructions:u # 1.40 insn per cycle - # 0.00 stalled cycles per insn (75.68%) - 0.432689568 seconds time elapsed + 1,981,002,182 cycles # 2.846 GHz + 2,842,945,772 instructions # 1.44 insn per cycle + 0.752497111 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.607257e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.809635e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.814736e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.805651e+03 +- 1.746055e+03 ) GeV^-2 -TOTAL : 0.628290 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.119070e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = 
( 1.315352e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.326681e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 +TOTAL : 0.601408 sec INFO: No Floating Point Exceptions have been reported - 1,711,877,158 cycles:u # 2.685 GHz (75.48%) - 2,204,155 stalled-cycles-frontend:u # 0.13% frontend cycles idle (75.02%) - 5,336,451 stalled-cycles-backend:u # 0.31% backend cycles idle (74.92%) - 2,025,677,427 instructions:u # 1.18 insn per cycle - # 0.00 stalled cycles per insn (75.16%) - 0.680661908 seconds time elapsed + 2,383,936,937 cycles # 2.851 GHz + 3,651,729,049 instructions # 1.53 insn per cycle + 0.896728355 seconds time elapsed ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.413122e+00 -Avg ME (F77/GPU) = 1.4131213684418642 -Relative difference = 4.4692399933517674e-07 +Avg ME (F77/GPU) = 1.4131213684418649 +Relative difference = 4.469239988637851e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc 
--all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.953489e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.965956e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.965956e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 5.575302 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.454763e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.467389e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.467389e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 6.698212 sec INFO: No Floating Point Exceptions have been reported - 19,552,368,610 cycles:u # 3.505 GHz (74.89%) - 2,660,305 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.96%) - 3,658,274,650 stalled-cycles-backend:u # 18.71% backend cycles idle (75.05%) - 57,688,497,038 instructions:u # 2.95 insn per cycle - # 0.06 stalled cycles per insn (75.05%) - 5.582548277 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1087) (avx2: 0) (512y: 0) (512z: 0) + 19,500,935,732 cycles # 2.911 GHz + 58,799,003,967 instructions # 3.02 insn per cycle + 6.702449206 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1313) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684432431 -Relative difference = 4.4692302355460254e-07 +Avg ME (F77/C++) = 1.4131213684432433 +Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 6.007612e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.058203e+04 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 6.058203e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 2.754703 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.669930e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.715854e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.715854e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 3.531511 sec INFO: No Floating Point Exceptions have been reported - 9,650,646,311 cycles:u # 3.499 GHz (74.94%) - 2,305,687 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.06%) - 2,388,323,649 stalled-cycles-backend:u # 24.75% backend cycles idle (75.06%) - 30,244,173,100 instructions:u # 3.13 insn per cycle - # 0.08 stalled cycles per insn (75.06%) - 2.761973696 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4806) (avx2: 0) (512y: 0) (512z: 0) + 10,228,095,464 cycles # 2.894 GHz + 30,347,180,891 instructions # 2.97 insn per cycle + 3.535798492 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4970) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.217896e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.238537e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.238537e+05 ) sec^-1 -MeanMatrixElemValue = ( 
4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.372470 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.789972e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.950829e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.950829e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.887432 sec +INFO: No Floating Point Exceptions have been reported + 5,055,118,079 cycles # 2.674 GHz + 11,484,444,983 instructions # 2.27 insn per cycle + 1.891612421 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4591) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213684416484 +Relative difference = 4.469241520660492e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 
+EvtsPerSec[Rmb+ME] (23) = ( 9.667837e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.860484e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.860484e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.718788 sec INFO: No Floating Point Exceptions have been reported - 4,794,762,813 cycles:u # 3.484 GHz (75.01%) - 2,118,382 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.00%) - 1,520,014,939 stalled-cycles-backend:u # 31.70% backend cycles idle (75.00%) - 11,609,689,557 instructions:u # 2.42 insn per cycle - # 0.13 stalled cycles per insn (75.00%) - 1.379533501 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4489) (512y: 0) (512z: 0) + 4,655,858,880 cycles # 2.704 GHz + 10,842,096,596 instructions # 2.33 insn per cycle + 1.722993406 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4183) (512y: 244) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.981237e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.082937e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.082937e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 2.372571 sec +INFO: No Floating Point Exceptions have been reported + 4,129,142,877 cycles # 1.738 GHz + 6,106,185,085 instructions # 1.48 insn per cycle + 2.376879303 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1457) (512y: 139) (512z: 3568) 
------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213684416484 +Relative difference = 4.469241520660492e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index e1a0e3c9a4..e4bc7cf2cc 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,193 +1,233 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand 
(USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-05-15_15:57:01 +DATE: 2024-05-16_14:38:10 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.289518e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.830945e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.927784e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.415474e+04 +- 1.288238e+04 ) GeV^-2 -TOTAL : 0.322739 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.514552e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.271085e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.366020e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) 
GeV^-2 +TOTAL : 0.450662 sec INFO: No Floating Point Exceptions have been reported - 837,567,563 cycles:u # 2.417 GHz (73.98%) - 2,164,839 stalled-cycles-frontend:u # 0.26% frontend cycles idle (75.28%) - 5,659,093 stalled-cycles-backend:u # 0.68% backend cycles idle (74.69%) - 1,355,394,082 instructions:u # 1.62 insn per cycle - # 0.00 stalled cycles per insn (75.11%) - 0.368551297 seconds time elapsed + 1,888,418,045 cycles # 2.834 GHz + 2,686,004,303 instructions # 1.42 insn per cycle + 0.722549365 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 254 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.368084e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.629248e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.635221e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.619620e+05 +- 1.611328e+05 ) GeV^-2 -TOTAL : 0.435662 sec +EvtsPerSec[Rmb+ME] (23) = ( 3.424662e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
4.459806e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.527254e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 +TOTAL : 0.495261 sec INFO: No Floating Point Exceptions have been reported - 1,147,824,263 cycles:u # 2.495 GHz (75.41%) - 2,016,822 stalled-cycles-frontend:u # 0.18% frontend cycles idle (75.37%) - 5,689,349 stalled-cycles-backend:u # 0.50% backend cycles idle (75.60%) - 1,589,336,679 instructions:u # 1.38 insn per cycle - # 0.00 stalled cycles per insn (75.69%) - 0.483291874 seconds time elapsed + 2,099,817,827 cycles # 2.862 GHz + 2,990,738,948 instructions # 1.42 insn per cycle + 0.790419941 seconds time elapsed ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/GPU) = 1.412404e+00 -Avg ME (F77/GPU) = 1.4131669530965212 -Relative difference = 0.0005401804983001964 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 1.412608e+00 +Avg ME (F77/GPU) = 1.4132214346515752 +Relative difference = 0.00043425681546129636 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': 
~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 3.230883e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.246043e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.246043e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.724764e+02 +- 2.665343e+02 ) GeV^-2 -TOTAL : 5.096868 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.505220e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.518346e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.518346e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 6.562288 sec INFO: No Floating Point Exceptions have been reported - 17,874,399,162 cycles:u # 3.505 GHz (74.91%) - 2,364,613 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.94%) - 3,161,727,640 stalled-cycles-backend:u # 17.69% backend cycles idle (75.02%) - 55,161,219,476 instructions:u # 3.09 insn per cycle - # 0.06 stalled cycles per insn (75.06%) - 5.103969117 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1229) (avx2: 0) (512y: 0) (512z: 0) + 19,080,957,547 cycles # 2.906 GHz + 58,959,648,789 instructions # 3.09 insn per cycle + 6.566573323 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1034) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.412998e+00 -Avg ME (F77/C++) = 1.4129978146120550 -Relative difference = 1.3120184529301602e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.412986e+00 +Avg ME (F77/C++) = 1.4129858051842916 +Relative difference = 1.3787518662898538e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.073966e+05 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 1.090872e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.090872e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.724763e+02 +- 2.665342e+02 ) GeV^-2 -TOTAL : 1.550666 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.204155e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.352745e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.352745e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 2.018056 sec INFO: No Floating Point Exceptions have been reported - 5,423,702,647 cycles:u # 3.490 GHz (74.81%) - 2,003,040 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.78%) - 1,720,289,419 stalled-cycles-backend:u # 31.72% backend cycles idle (74.78%) - 16,115,970,116 instructions:u # 2.97 insn per cycle - # 0.11 stalled cycles per insn (74.98%) - 1.557468987 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 5205) (avx2: 0) (512y: 0) (512z: 0) + 5,861,245,947 cycles # 2.899 GHz + 16,693,370,121 instructions # 2.85 insn per cycle + 2.022246601 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5765) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129857118325333 -Relative difference = 2.039421953066926e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.412987e+00 +Avg ME (F77/C++) = 1.4129865669244737 +Relative difference = 3.06496469061158e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.373466e+05 ) sec^-1 -EvtsPerSec[MatrixElems] 
(3) = ( 2.454420e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.454420e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.743733e+02 +- 2.676611e+02 ) GeV^-2 -TOTAL : 0.715408 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.747206e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.811751e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.811751e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 +TOTAL : 0.959718 sec +INFO: No Floating Point Exceptions have been reported + 2,597,973,759 cycles # 2.697 GHz + 5,979,816,432 instructions # 2.30 insn per cycle + 0.963957244 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4917) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.413316e+00 +Avg ME (F77/C++) = 1.4133161655815059 +Relative difference = 1.1715816267550621e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal 
loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.928786e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.008064e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.008064e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 +TOTAL : 0.871454 sec INFO: No Floating Point Exceptions have been reported - 2,492,743,461 cycles:u # 3.468 GHz (74.42%) - 2,007,007 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.44%) - 808,406,617 stalled-cycles-backend:u # 32.43% backend cycles idle (74.94%) - 6,038,138,625 instructions:u # 2.42 insn per cycle - # 0.13 stalled cycles per insn (75.46%) - 0.722468412 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4878) (512y: 0) (512z: 0) + 2,346,801,151 cycles # 2.682 GHz + 5,601,970,539 instructions # 2.39 insn per cycle + 0.875813732 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4645) (512y: 36) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133162680784324 -Relative difference = 1.896804623606238e-07 +Avg ME (F77/C++) = 1.4133161655815059 +Relative difference = 1.1715816267550621e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.412327e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.455439e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.455439e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 +TOTAL : 1.184240 sec +INFO: No Floating Point Exceptions have been reported + 2,059,493,323 cycles # 1.734 GHz + 3,333,364,881 instructions # 1.62 insn per cycle + 1.188531798 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) 
(avx2: 2144) (512y: 39) (512z: 3675) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.413316e+00 +Avg ME (F77/C++) = 1.4133164033579249 +Relative difference = 2.85398258307829e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt index 30cd57abf1..d735dc5897 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt @@ -1,204 +1,250 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in 
BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-05-15_16:25:09 +DATE: 2024-05-16_15:02:06 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.247681e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.554030e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.554030e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.755516e+02 +- 2.671055e+02 ) GeV^-2 -TOTAL : 0.531278 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.750186e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.085490e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.085490e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.009071e+02 +- 5.002295e+01 ) GeV^-2 +TOTAL : 0.469338 sec INFO: No Floating Point Exceptions have been reported - 1,475,648,505 cycles:u # 2.845 GHz (72.90%) - 7,438,312 stalled-cycles-frontend:u # 0.50% frontend cycles idle (74.29%) - 262,612,701 stalled-cycles-backend:u # 17.80% backend cycles idle (75.30%) - 1,892,172,461 instructions:u # 1.28 insn per cycle - # 0.14 stalled cycles per insn (75.93%) - 0.576464032 seconds time elapsed + 1,918,362,944 cycles # 2.804 GHz + 2,834,169,916 instructions # 1.48 insn per cycle + 0.742178075 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 254 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.105687e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.477890e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.477890e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.855934e+03 +- 1.791981e+03 ) GeV^-2 -TOTAL : 1.064579 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.524122e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.570005e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.570005e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.737500e+02 +- 4.776370e+02 ) GeV^-2 +TOTAL : 0.651816 sec INFO: No Floating Point Exceptions have been reported - 3,242,332,593 cycles:u # 2.967 GHz (75.24%) - 17,743,806 stalled-cycles-frontend:u # 0.55% frontend cycles idle (75.40%) - 857,947,060 stalled-cycles-backend:u # 26.46% backend cycles idle (74.83%) - 3,425,579,084 instructions:u # 1.06 insn per cycle - # 0.25 stalled cycles per insn (74.75%) - 1.113753694 seconds time elapsed + 2,503,160,784 cycles # 2.822 GHz + 3,832,792,162 instructions # 1.53 insn per cycle + 0.943470239 seconds time elapsed ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/GPU) = 1.412404e+00 -Avg ME (F77/GPU) = 1.4131669530965212 -Relative difference = 0.0005401804983001964 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 1.412608e+00 +Avg ME (F77/GPU) = 1.4132214346515752 +Relative difference = 0.00043425681546129636 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 3.226318e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.241649e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.241649e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.724764e+02 +- 2.665343e+02 ) GeV^-2 -TOTAL : 5.106439 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.465694e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.479110e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.479110e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 6.671662 sec INFO: No Floating Point Exceptions have been reported - 17,885,648,370 cycles:u # 3.500 GHz (74.95%) - 2,924,907 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.96%) - 3,189,031,686 stalled-cycles-backend:u # 17.83% backend cycles idle (75.02%) - 55,181,031,468 instructions:u # 3.09 insn per cycle - # 0.06 stalled cycles per insn (75.10%) - 5.384687058 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1229) (avx2: 0) (512y: 0) (512z: 0) + 19,108,337,453 cycles # 2.863 GHz + 58,967,331,894 instructions # 3.09 insn per cycle + 6.675976597 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1034) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest_cpp.exe [ 
PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.412998e+00 -Avg ME (F77/C++) = 1.4129978146120550 -Relative difference = 1.3120184529301602e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.412986e+00 +Avg ME (F77/C++) = 1.4129858051842916 +Relative difference = 1.3787518662898538e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.072067e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.088913e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.088913e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.724763e+02 +- 2.665342e+02 ) GeV^-2 -TOTAL : 1.556450 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.093089e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.238027e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.238027e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 2.051178 sec INFO: No Floating Point Exceptions have been reported - 5,440,000,446 cycles:u # 3.487 GHz (74.89%) - 2,280,613 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.88%) - 1,707,699,385 stalled-cycles-backend:u # 31.39% backend cycles idle (74.88%) - 16,135,473,210 instructions:u # 2.97 insn per cycle - # 0.11 stalled cycles per insn (74.88%) - 1.563502548 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 5205) (avx2: 0) (512y: 0) (512z: 0) + 5,880,119,320 cycles # 2.862 GHz + 16,741,679,626 instructions # 2.85 insn per cycle + 2.055508197 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5765) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe +runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129857118325333 -Relative difference = 2.039421953066926e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.412987e+00 +Avg ME (F77/C++) = 1.4129865669244737 +Relative difference = 3.06496469061158e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.369755e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.450486e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.450486e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.743733e+02 +- 2.676611e+02 ) GeV^-2 -TOTAL : 0.719054 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.718905e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.782305e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.782305e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 +TOTAL : 0.980043 sec INFO: No Floating Point Exceptions have been reported - 2,505,149,993 cycles:u # 3.467 GHz (74.65%) - 1,790,711 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.54%) - 809,007,986 stalled-cycles-backend:u # 32.29% backend cycles idle (74.54%) - 6,116,780,442 instructions:u # 2.44 insn per cycle - # 0.13 stalled cycles per insn (74.86%) - 0.726331866 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4878) (512y: 0) (512z: 0) + 2,616,418,693 cycles # 2.660 GHz + 6,017,096,104 instructions # 2.30 insn per cycle + 0.984343134 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest_cpp.exe 
[ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133162680784324 -Relative difference = 1.896804623606238e-07 +Avg ME (F77/C++) = 1.4133161655815059 +Relative difference = 1.1715816267550621e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +WARNING! 
Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.912882e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.991175e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.991175e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 +TOTAL : 0.883189 sec +INFO: No Floating Point Exceptions have been reported + 2,365,822,002 cycles # 2.667 GHz + 5,638,771,692 instructions # 2.38 insn per cycle + 0.887626463 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4645) (512y: 36) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.413316e+00 +Avg ME (F77/C++) = 1.4133161655815059 +Relative difference = 1.1715816267550621e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +WARNING! 
Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.399129e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.441231e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.441231e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 +TOTAL : 1.200076 sec +INFO: No Floating Point Exceptions have been reported + 2,081,452,605 cycles # 1.729 GHz + 3,374,965,036 instructions # 1.62 insn per cycle + 1.204429196 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2144) (512y: 39) (512z: 3675) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.413316e+00 +Avg ME (F77/C++) = 1.4133164033579249 +Relative difference = 2.85398258307829e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt index 1bd08114fb..3d41e21b12 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt @@ -1,193 +1,233 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
-make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-05-15_15:57:14 +DATE: 2024-05-16_14:38:31 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.210766e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.663281e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.764884e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.415474e+04 +- 1.288238e+04 ) GeV^-2 -TOTAL : 0.322105 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.548366e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.290418e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.382374e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) 
GeV^-2 +TOTAL : 0.453301 sec INFO: No Floating Point Exceptions have been reported - 840,750,027 cycles:u # 2.432 GHz (73.98%) - 2,158,503 stalled-cycles-frontend:u # 0.26% frontend cycles idle (74.63%) - 6,103,258 stalled-cycles-backend:u # 0.73% backend cycles idle (75.04%) - 1,324,693,507 instructions:u # 1.58 insn per cycle - # 0.00 stalled cycles per insn (75.71%) - 0.368242227 seconds time elapsed + 1,884,361,235 cycles # 2.811 GHz + 2,662,129,036 instructions # 1.41 insn per cycle + 0.727401829 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 248 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.424369e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.692214e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.697518e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.619620e+05 +- 1.611328e+05 ) GeV^-2 -TOTAL : 0.433909 sec +EvtsPerSec[Rmb+ME] (23) = ( 3.381856e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
4.386346e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.451907e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 +TOTAL : 0.498921 sec INFO: No Floating Point Exceptions have been reported - 1,150,364,539 cycles:u # 2.502 GHz (75.49%) - 2,023,314 stalled-cycles-frontend:u # 0.18% frontend cycles idle (75.62%) - 4,483,319 stalled-cycles-backend:u # 0.39% backend cycles idle (76.09%) - 1,521,066,999 instructions:u # 1.32 insn per cycle - # 0.00 stalled cycles per insn (75.31%) - 0.483317914 seconds time elapsed + 2,065,776,106 cycles # 2.820 GHz + 3,002,526,593 instructions # 1.45 insn per cycle + 0.789720140 seconds time elapsed ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/GPU) = 1.412404e+00 -Avg ME (F77/GPU) = 1.4131669531526541 -Relative difference = 0.0005401805380429868 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 1.412608e+00 +Avg ME (F77/GPU) = 1.4132214346515752 +Relative difference = 0.00043425681546129636 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': 
~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 3.256543e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.271956e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.271956e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.724764e+02 +- 2.665343e+02 ) GeV^-2 -TOTAL : 5.056864 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.479714e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.492704e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.492704e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 6.641350 sec INFO: No Floating Point Exceptions have been reported - 17,712,188,734 cycles:u # 3.500 GHz (75.02%) - 2,582,868 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.02%) - 2,714,112,309 stalled-cycles-backend:u # 15.32% backend cycles idle (75.02%) - 54,909,074,386 instructions:u # 3.10 insn per cycle - # 0.05 stalled cycles per insn (75.02%) - 5.063913868 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1171) (avx2: 0) (512y: 0) (512z: 0) + 18,978,826,784 cycles # 2.861 GHz + 58,704,221,037 instructions # 3.09 insn per cycle + 6.645410970 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1029) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.412998e+00 -Avg ME (F77/C++) = 1.4129978146120550 -Relative difference = 1.3120184529301602e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.412986e+00 +Avg ME (F77/C++) = 1.4129858051842916 +Relative difference = 1.3787518662898538e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.127879e+05 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 1.146554e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.146554e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.724763e+02 +- 2.665342e+02 ) GeV^-2 -TOTAL : 1.477722 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.494310e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.651898e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.651898e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 1.950028 sec INFO: No Floating Point Exceptions have been reported - 5,183,480,838 cycles:u # 3.500 GHz (74.69%) - 1,952,096 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.95%) - 1,427,331,531 stalled-cycles-backend:u # 27.54% backend cycles idle (75.16%) - 16,162,972,824 instructions:u # 3.12 insn per cycle - # 0.09 stalled cycles per insn (75.16%) - 1.484737821 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 5136) (avx2: 0) (512y: 0) (512z: 0) + 5,589,974,968 cycles # 2.862 GHz + 16,510,304,699 instructions # 2.95 insn per cycle + 1.954264273 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5551) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129857712652836 -Relative difference = 1.618803841657786e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.412987e+00 +Avg ME (F77/C++) = 1.4129865669244737 +Relative difference = 3.06496469061158e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.125930e+05 ) sec^-1 -EvtsPerSec[MatrixElems] 
(3) = ( 2.190715e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.190715e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.743733e+02 +- 2.676611e+02 ) GeV^-2 -TOTAL : 0.795572 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.496639e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.543532e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.543532e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 +TOTAL : 1.116418 sec +INFO: No Floating Point Exceptions have been reported + 2,975,820,242 cycles # 2.657 GHz + 6,633,799,194 instructions # 2.23 insn per cycle + 1.120575232 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5568) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.413316e+00 +Avg ME (F77/C++) = 1.4133161655815059 +Relative difference = 1.1715816267550621e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal 
loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.615016e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.669374e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.669374e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 +TOTAL : 1.036246 sec INFO: No Floating Point Exceptions have been reported - 2,763,774,994 cycles:u # 3.459 GHz (74.97%) - 2,203,313 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.97%) - 785,450,570 stalled-cycles-backend:u # 28.42% backend cycles idle (74.97%) - 6,662,264,762 instructions:u # 2.41 insn per cycle - # 0.12 stalled cycles per insn (74.97%) - 0.802444349 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5430) (512y: 0) (512z: 0) + 2,759,204,529 cycles # 2.654 GHz + 6,255,102,481 instructions # 2.27 insn per cycle + 1.040401186 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5279) (512y: 25) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133162680784324 -Relative difference = 1.896804623606238e-07 +Avg ME (F77/C++) = 1.4133161655815059 +Relative difference = 1.1715816267550621e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.286831e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.322123e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.322123e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 +TOTAL : 1.300128 sec +INFO: No Floating Point Exceptions have been reported + 2,231,395,652 cycles # 1.715 GHz + 3,699,704,768 instructions # 1.66 insn per cycle + 1.304305216 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) 
(avx2: 2391) (512y: 29) (512z: 3970) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.413316e+00 +Avg ME (F77/C++) = 1.4133164033579249 +Relative difference = 2.85398258307829e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index 4627425d79..18990368c8 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -1,193 +1,233 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for 
tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-05-15_15:57:26 +DATE: 2024-05-16_14:38:52 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.887775e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.027262e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.036364e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.872208e+03 +- 2.725298e+03 ) GeV^-2 -TOTAL : 0.387780 sec +EvtsPerSec[Rmb+ME] (23) = ( 8.980776e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.047318e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.059891e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) 
GeV^-2 +TOTAL : 0.469526 sec INFO: No Floating Point Exceptions have been reported - 1,049,027,669 cycles:u # 2.546 GHz (73.95%) - 2,134,474 stalled-cycles-frontend:u # 0.20% frontend cycles idle (75.14%) - 5,844,380 stalled-cycles-backend:u # 0.56% backend cycles idle (74.94%) - 1,528,986,593 instructions:u # 1.46 insn per cycle - # 0.00 stalled cycles per insn (74.83%) - 0.432775843 seconds time elapsed + 1,950,532,568 cycles # 2.815 GHz + 2,802,706,395 instructions # 1.44 insn per cycle + 0.749158155 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.635615e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.843712e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.849442e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.805651e+03 +- 1.746055e+03 ) GeV^-2 -TOTAL : 0.609776 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.120585e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
1.317479e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.329114e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 +TOTAL : 0.606344 sec INFO: No Floating Point Exceptions have been reported - 1,721,119,508 cycles:u # 2.687 GHz (74.86%) - 2,110,336 stalled-cycles-frontend:u # 0.12% frontend cycles idle (75.04%) - 6,031,696 stalled-cycles-backend:u # 0.35% backend cycles idle (75.30%) - 2,015,973,548 instructions:u # 1.17 insn per cycle - # 0.00 stalled cycles per insn (75.20%) - 0.663204305 seconds time elapsed + 2,403,151,636 cycles # 2.824 GHz + 3,669,339,361 instructions # 1.53 insn per cycle + 0.910110717 seconds time elapsed ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.413122e+00 -Avg ME (F77/GPU) = 1.4131213755569483 -Relative difference = 4.4188898885662695e-07 +Avg ME (F77/GPU) = 1.4131213755569487 +Relative difference = 4.418889885423659e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc 
--all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.871835e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.883506e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.883506e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 5.733120 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.348054e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.359694e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.359694e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 7.005029 sec INFO: No Floating Point Exceptions have been reported - 20,114,351,158 cycles:u # 3.506 GHz (74.93%) - 2,517,021 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.99%) - 3,854,050,442 stalled-cycles-backend:u # 19.16% backend cycles idle (75.04%) - 59,072,951,526 instructions:u # 2.94 insn per cycle - # 0.07 stalled cycles per insn (75.04%) - 5.740328179 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1149) (avx2: 0) (512y: 0) (512z: 0) + 20,055,951,018 cycles # 2.863 GHz + 60,536,467,053 instructions # 3.02 insn per cycle + 7.009312607 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1399) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213859069593 Relative difference = 4.345647726386255e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 6.087732e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.139458e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.139458e+04 ) sec^-1 -MeanMatrixElemValue = ( 
4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 2.718917 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.638770e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.684822e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.684822e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 3.556206 sec INFO: No Floating Point Exceptions have been reported - 9,530,401,613 cycles:u # 3.501 GHz (74.97%) - 2,075,703 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.03%) - 2,354,560,620 stalled-cycles-backend:u # 24.71% backend cycles idle (75.03%) - 29,677,385,435 instructions:u # 3.11 insn per cycle - # 0.08 stalled cycles per insn (75.02%) - 2.726176913 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4873) (avx2: 0) (512y: 0) (512z: 0) + 10,186,602,629 cycles # 2.862 GHz + 30,386,009,701 instructions # 2.98 insn per cycle + 3.560429335 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5280) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213792564823 Relative difference = 4.392710025734405e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.251275e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.273000e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.273000e+05 ) sec^-1 -MeanMatrixElemValue = ( 
4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.336639 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.050822e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.223334e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.223334e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.833811 sec +INFO: No Floating Point Exceptions have been reported + 4,877,548,863 cycles # 2.655 GHz + 10,978,535,397 instructions # 2.25 insn per cycle + 1.838126466 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4624) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213600217192 +Relative difference = 4.5288254008796884e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 
+EvtsPerSec[Rmb+ME] (23) = ( 1.034701e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.056812e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.056812e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.607005 sec INFO: No Floating Point Exceptions have been reported - 4,666,727,671 cycles:u # 3.482 GHz (74.93%) - 2,106,240 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.93%) - 1,554,204,116 stalled-cycles-backend:u # 33.30% backend cycles idle (74.93%) - 11,146,255,994 instructions:u # 2.39 insn per cycle - # 0.14 stalled cycles per insn (74.93%) - 1.343598832 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4581) (512y: 0) (512z: 0) + 4,285,859,041 cycles # 2.661 GHz + 10,248,085,853 instructions # 2.39 insn per cycle + 1.611327735 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4280) (512y: 82) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213600217192 Relative difference = 4.5288254008796884e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.675038e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.769490e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.769490e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 2.480681 sec +INFO: No Floating Point Exceptions have been reported + 4,211,204,679 cycles # 1.695 GHz + 6,044,041,090 instructions # 1.44 insn per cycle + 2.485018889 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2066) (512y: 117) (512z: 3540) 
------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213786174055 +Relative difference = 4.3972324717191576e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt index 7a8ec93c4e..bea6b18082 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt @@ -1,193 +1,233 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand 
(USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-05-15_15:57:42 +DATE: 2024-05-16_14:39:18 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.873499e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.008721e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.015293e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.872208e+03 +- 2.725298e+03 ) GeV^-2 -TOTAL : 0.387103 sec +EvtsPerSec[Rmb+ME] (23) = ( 8.940348e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.041869e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.054764e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) 
GeV^-2 +TOTAL : 0.469154 sec INFO: No Floating Point Exceptions have been reported - 1,056,954,597 cycles:u # 2.564 GHz (74.08%) - 2,198,078 stalled-cycles-frontend:u # 0.21% frontend cycles idle (75.17%) - 5,105,476 stalled-cycles-backend:u # 0.48% backend cycles idle (75.00%) - 1,563,651,790 instructions:u # 1.48 insn per cycle - # 0.00 stalled cycles per insn (74.84%) - 0.432479829 seconds time elapsed + 1,946,414,728 cycles # 2.818 GHz + 2,803,423,086 instructions # 1.44 insn per cycle + 0.748059256 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.632889e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.835986e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.841678e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.805651e+03 +- 1.746055e+03 ) GeV^-2 -TOTAL : 0.605876 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.116866e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
1.312173e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.323463e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 +TOTAL : 0.604157 sec INFO: No Floating Point Exceptions have been reported - 1,702,422,059 cycles:u # 2.689 GHz (74.54%) - 2,184,475 stalled-cycles-frontend:u # 0.13% frontend cycles idle (74.76%) - 5,598,750 stalled-cycles-backend:u # 0.33% backend cycles idle (75.03%) - 2,004,802,493 instructions:u # 1.18 insn per cycle - # 0.00 stalled cycles per insn (75.83%) - 0.658456690 seconds time elapsed + 2,374,249,289 cycles # 2.818 GHz + 3,602,148,119 instructions # 1.52 insn per cycle + 0.902621411 seconds time elapsed ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.413122e+00 -Avg ME (F77/GPU) = 1.4131213755569483 -Relative difference = 4.4188898885662695e-07 +Avg ME (F77/GPU) = 1.4131213755569487 +Relative difference = 4.418889885423659e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc 
--all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.906678e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.918594e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.918594e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 5.664488 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.368504e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.380280e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.380280e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 6.941323 sec INFO: No Floating Point Exceptions have been reported - 19,869,141,415 cycles:u # 3.506 GHz (74.95%) - 2,403,537 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.02%) - 3,922,827,934 stalled-cycles-backend:u # 19.74% backend cycles idle (75.02%) - 58,608,353,930 instructions:u # 2.95 insn per cycle - # 0.07 stalled cycles per insn (75.02%) - 5.671993758 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1026) (avx2: 0) (512y: 0) (512z: 0) + 19,878,296,626 cycles # 2.863 GHz + 59,936,362,271 instructions # 3.02 insn per cycle + 6.945573140 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1276) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213859069593 Relative difference = 4.345647726386255e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 6.271939e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.327248e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.327248e+04 ) sec^-1 -MeanMatrixElemValue = ( 
4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 2.639703 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.689994e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.736297e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.736297e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 3.516340 sec INFO: No Floating Point Exceptions have been reported - 9,236,580,182 cycles:u # 3.494 GHz (74.89%) - 2,244,953 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.88%) - 1,810,719,115 stalled-cycles-backend:u # 19.60% backend cycles idle (74.89%) - 30,142,890,449 instructions:u # 3.26 insn per cycle - # 0.06 stalled cycles per insn (75.01%) - 2.647015728 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4944) (avx2: 0) (512y: 0) (512z: 0) + 10,077,314,757 cycles # 2.863 GHz + 30,098,117,657 instructions # 2.99 insn per cycle + 3.520635536 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5082) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213792564823 Relative difference = 4.392710025734405e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.228737e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.249673e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.249673e+05 ) sec^-1 -MeanMatrixElemValue = ( 
4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.360409 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.778247e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.940877e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.940877e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.889938 sec +INFO: No Floating Point Exceptions have been reported + 5,023,754,472 cycles # 2.654 GHz + 11,483,522,538 instructions # 2.29 insn per cycle + 1.894205310 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4723) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213600217192 +Relative difference = 4.5288254008796884e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 
+EvtsPerSec[Rmb+ME] (23) = ( 9.644687e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.842226e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.842226e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.722080 sec INFO: No Floating Point Exceptions have been reported - 4,751,043,404 cycles:u # 3.483 GHz (74.78%) - 2,034,803 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.78%) - 1,525,164,530 stalled-cycles-backend:u # 32.10% backend cycles idle (74.73%) - 11,644,210,563 instructions:u # 2.45 insn per cycle - # 0.13 stalled cycles per insn (75.02%) - 1.367338479 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4685) (512y: 0) (512z: 0) + 4,590,091,342 cycles # 2.660 GHz + 10,809,457,257 instructions # 2.35 insn per cycle + 1.726406566 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4285) (512y: 234) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213600217192 Relative difference = 4.5288254008796884e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.641517e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.735645e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.735645e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 2.492729 sec +INFO: No Floating Point Exceptions have been reported + 4,229,101,372 cycles # 1.695 GHz + 6,273,394,761 instructions # 1.48 insn per cycle + 2.496999493 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1961) (512y: 163) (512z: 3617) 
------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213786174055 +Relative difference = 4.3972324717191576e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index 8dd5bdd130..adf6424639 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -1,193 +1,233 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for 
tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-05-15_15:57:57 +DATE: 2024-05-16_14:39:44 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.352876e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.532511e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.533908e+04 ) sec^-1 -MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 -TOTAL : 0.658527 sec +EvtsPerSec[Rmb+ME] (23) = ( 3.453895e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.477096e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.479397e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 
2.368970e+00 ) GeV^-4 +TOTAL : 0.529744 sec INFO: No Floating Point Exceptions have been reported - 1,888,638,728 cycles:u # 2.889 GHz (75.48%) - 2,109,824 stalled-cycles-frontend:u # 0.11% frontend cycles idle (75.55%) - 6,355,443 stalled-cycles-backend:u # 0.34% backend cycles idle (75.55%) - 2,112,081,809 instructions:u # 1.12 insn per cycle - # 0.00 stalled cycles per insn (75.28%) - 0.704110953 seconds time elapsed + 2,179,317,048 cycles # 2.822 GHz + 3,403,036,461 instructions # 1.56 insn per cycle + 0.830470867 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.241230e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.243695e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.243758e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.694853e+01 +- 6.364791e+01 ) GeV^-4 -TOTAL : 7.697250 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.124157e+05 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 4.151338e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.152519e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.042150 sec INFO: No Floating Point Exceptions have been reported - 26,573,207,605 cycles:u # 3.439 GHz (74.91%) - 3,155,326 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.95%) - 6,300,281 stalled-cycles-backend:u # 0.02% backend cycles idle (75.05%) - 21,102,057,691 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (75.05%) - 7.750974681 seconds time elapsed + 9,405,604,432 cycles # 2.853 GHz + 20,118,562,201 instructions # 2.14 insn per cycle + 3.353608047 seconds time elapsed ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158101E-004 -Relative difference = 2.837296517127185e-07 +Avg ME (F77/GPU) = 6.6266731198158133E-004 +Relative difference = 2.837296512218831e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) 
-OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.204102e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.204965e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.204965e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 7.452461 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.820592e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.821434e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.821434e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 9.018372 sec INFO: No Floating Point Exceptions have been reported - 26,115,820,847 cycles:u # 3.503 GHz (75.00%) - 31,665,677 stalled-cycles-frontend:u # 0.12% frontend cycles idle (75.00%) - 2,969,259,653 stalled-cycles-backend:u # 11.37% backend cycles idle (75.00%) - 81,648,884,549 instructions:u # 3.13 insn per cycle - # 0.04 stalled cycles per insn (75.00%) - 7.459873901 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0) + 25,614,013,948 cycles # 2.839 GHz + 78,938,013,495 instructions # 3.08 insn per cycle + 9.022664733 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 5.040487e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.045133e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.045133e+03 ) sec^-1 
-MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.265295 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.519494e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.522699e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.522699e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.669138 sec INFO: No Floating Point Exceptions have been reported - 11,438,272,299 cycles:u # 3.499 GHz (75.01%) - 1,570,411 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.04%) - 1,338,708,154 stalled-cycles-backend:u # 11.70% backend cycles idle (75.04%) - 39,146,792,626 instructions:u # 3.42 insn per cycle - # 0.03 stalled cycles per insn (75.04%) - 3.272705628 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0) + 12,898,966,245 cycles # 2.761 GHz + 39,280,150,365 instructions # 3.05 insn per cycle + 4.673492352 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.202724e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.205315e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.205315e+04 ) sec^-1 
-MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.375160 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.859599e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.875346e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.875346e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.097013 sec +INFO: No Floating Point Exceptions have been reported + 5,574,685,577 cycles # 2.655 GHz + 13,685,856,406 instructions # 2.46 insn per cycle + 2.101249976 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP 
threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.915800e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.935807e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.935807e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.848754 sec INFO: No Floating Point Exceptions have been reported - 4,819,692,655 cycles:u # 3.496 GHz (74.94%) - 848,155 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.05%) - 532,717,972 stalled-cycles-backend:u # 11.05% backend cycles idle (75.05%) - 13,721,077,924 instructions:u # 2.85 insn per cycle - # 0.04 stalled cycles per insn (75.05%) - 1.383011608 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0) + 4,887,101,603 cycles # 2.639 GHz + 12,341,123,817 instructions # 2.53 insn per cycle + 1.853060894 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.728417e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.739729e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.739729e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.446881 sec +INFO: No Floating Point Exceptions have been reported + 4,107,098,137 cycles # 1.676 GHz + 6,336,202,498 instructions # 1.54 insn per cycle + 2.451096147 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt index 720d8ad564..92636e2555 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt @@ -1,204 +1,250 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand 
HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-05-15_16:25:33 +DATE: 2024-05-16_15:02:53 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.357053e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.499082e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.499082e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.652708 sec +EvtsPerSec[Rmb+ME] (23) = ( 3.094987e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.434034e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.434034e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.523594 sec INFO: No Floating Point Exceptions have been reported - 1,981,267,905 cycles:u # 2.936 GHz (74.61%) - 2,766,739 stalled-cycles-frontend:u # 0.14% frontend cycles idle (74.54%) - 43,257,031 stalled-cycles-backend:u # 2.18% backend cycles idle (74.53%) - 2,220,306,469 instructions:u # 1.12 insn per cycle - # 0.02 stalled cycles per insn (75.07%) - 0.699482547 seconds time elapsed + 2,118,517,608 cycles # 2.813 GHz + 3,348,276,596 instructions # 1.58 insn per cycle + 0.813391390 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.210119e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.244985e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.244985e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 8.530658 sec +EvtsPerSec[Rmb+ME] (23) = ( 3.622834e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.121853e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.121853e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.329713 sec INFO: No Floating Point Exceptions have been reported - 29,318,532,360 cycles:u # 3.422 GHz (74.97%) - 22,948,494 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.97%) - 1,130,187,974 stalled-cycles-backend:u # 3.85% backend cycles idle (74.98%) - 23,588,060,043 instructions:u # 0.80 insn per cycle - # 0.05 stalled cycles per insn (75.00%) - 8.587592843 seconds time elapsed + 10,291,111,145 cycles # 2.854 GHz + 21,714,903,322 instructions # 2.11 insn per cycle + 3.660758937 seconds time elapsed ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158101E-004 -Relative difference = 2.837296517127185e-07 +Avg ME (F77/GPU) = 6.6266731198158133E-004 +Relative difference = 2.837296512218831e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.224704e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.225594e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.225594e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 7.386700 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.836126e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.837051e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.837051e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 8.946025 sec INFO: No Floating Point Exceptions have been reported - 25,885,268,757 cycles:u # 3.503 GHz (75.00%) - 8,436,170 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.00%) - 3,081,829,357 stalled-cycles-backend:u # 11.91% backend cycles idle (75.00%) - 81,646,491,395 instructions:u # 3.15 insn per cycle - # 0.04 stalled cycles per insn (75.00%) - 7.394511398 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0) + 25,625,027,072 cycles # 2.863 GHz + 78,943,584,564 instructions # 3.08 insn per cycle + 8.950491990 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe 
[ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 5.041063e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.045600e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.045600e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.268311 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.512313e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.515690e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.515690e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.682891 sec INFO: No Floating Point Exceptions have been reported - 11,460,129,702 cycles:u # 3.503 GHz (74.93%) - 714,318 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.05%) - 1,383,473,387 stalled-cycles-backend:u # 12.07% backend cycles idle (75.06%) - 39,145,894,854 instructions:u # 3.42 insn per cycle - # 0.04 stalled cycles per insn (75.06%) - 3.275584699 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0) + 12,903,818,271 cycles # 2.754 GHz + 39,293,324,950 instructions # 3.05 insn per cycle + 4.687529036 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.202223e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.204890e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.204890e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.379340 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.867831e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.884189e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.884189e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.098804 sec INFO: No Floating Point Exceptions have been reported - 4,831,323,740 cycles:u # 3.493 GHz (74.85%) - 408,845 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.12%) - 531,131,302 stalled-cycles-backend:u # 10.99% backend cycles idle (75.13%) - 13,717,644,390 instructions:u # 2.84 insn per cycle - # 0.04 stalled cycles per insn (75.13%) - 1.386770607 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0) + 5,587,651,201 cycles # 2.658 GHz + 13,696,262,775 instructions # 2.45 insn per cycle + 2.103410758 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +WARNING! 
Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.952196e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.973818e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.973818e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.845912 sec +INFO: No Floating Point Exceptions have been reported + 4,903,860,646 cycles # 2.651 GHz + 12,352,108,328 instructions # 2.52 insn per cycle + 1.850421022 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +WARNING! 
Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.711524e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.723541e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.723541e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.457485 sec +INFO: No Floating Point Exceptions have been reported + 4,130,677,154 cycles # 1.678 GHz + 6,346,127,118 instructions # 1.54 insn per cycle + 2.462055019 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt index 966cc59f5c..07bc3b6c73 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt @@ -1,193 +1,233 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
-make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-05-15_16:31:58 +DATE: 2024-05-16_15:12:42 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --common OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.349266e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.531209e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.532381e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.490501e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.518177e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.520849e+05 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.654130 sec +TOTAL : 0.515008 sec INFO: 
No Floating Point Exceptions have been reported - 1,991,618,871 cycles:u # 2.935 GHz (74.57%) - 2,424,041 stalled-cycles-frontend:u # 0.12% frontend cycles idle (75.41%) - 34,189,430 stalled-cycles-backend:u # 1.72% backend cycles idle (75.26%) - 2,163,613,736 instructions:u # 1.09 insn per cycle - # 0.02 stalled cycles per insn (75.35%) - 0.699136584 seconds time elapsed + 2,117,861,647 cycles # 2.847 GHz + 3,355,581,223 instructions # 1.58 insn per cycle + 0.805282012 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --common +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.238353e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.241268e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.241327e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.120060e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.152876e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.154244e+05 ) sec^-1 
MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 8.406295 sec +TOTAL : 3.146209 sec INFO: No Floating Point Exceptions have been reported - 28,943,119,889 cycles:u # 3.428 GHz (75.00%) - 11,675,494 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.99%) - 1,122,815,591 stalled-cycles-backend:u # 3.88% backend cycles idle (74.99%) - 22,685,902,737 instructions:u # 0.78 insn per cycle - # 0.05 stalled cycles per insn (74.98%) - 8.462672542 seconds time elapsed + 9,794,350,225 cycles # 2.878 GHz + 20,567,996,876 instructions # 2.10 insn per cycle + 3.458179285 seconds time elapsed ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158101E-004 -Relative difference = 2.837296517127185e-07 +Avg ME (F77/GPU) = 6.6266731198158133E-004 +Relative difference = 2.837296512218831e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.227440e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.228327e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.228327e+03 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.854249e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.855163e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.855163e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 7.374640 sec +TOTAL : 8.856220 sec INFO: No Floating Point Exceptions have been reported - 25,848,687,597 cycles:u # 3.504 GHz (74.95%) - 3,873,133 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.95%) - 3,016,465,510 stalled-cycles-backend:u # 11.67% backend cycles idle (74.98%) - 81,681,062,341 instructions:u # 3.16 insn per cycle - # 0.04 stalled cycles per insn (75.03%) - 7.378912231 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0) + 25,606,958,110 cycles # 2.890 GHz + 78,936,876,492 instructions # 
3.08 insn per cycle + 8.860490718 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP 
[gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 5.047807e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.052416e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.052416e+03 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.547585e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.550823e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.550823e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.260685 sec +TOTAL : 4.633866 sec INFO: No Floating Point Exceptions have been reported - 11,417,527,634 cycles:u # 3.499 GHz (75.00%) - 1,067,583 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%) - 1,390,452,689 stalled-cycles-backend:u # 12.18% backend cycles idle (75.00%) - 39,151,712,810 instructions:u # 3.43 insn per cycle - # 0.04 stalled cycles per insn (75.00%) - 3.264755542 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0) + 12,886,616,952 cycles # 2.779 GHz + 39,279,548,039 instructions # 3.05 insn per cycle + 4.638052623 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.196928e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.199480e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.199480e+04 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = 
( 7.950793e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.966539e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.966539e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 2.073967 sec +INFO: No Floating Point Exceptions have been reported + 5,577,712,569 cycles # 2.685 GHz + 13,684,498,611 instructions # 2.45 insn per cycle + 2.078154877 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.068596e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.089664e+03 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 9.089664e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.381758 sec +TOTAL : 1.820316 sec INFO: No Floating Point Exceptions have been reported - 4,845,646,776 cycles:u # 3.500 GHz (74.76%) - 1,894,319 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.05%) - 551,051,710 stalled-cycles-backend:u # 11.37% backend cycles idle (75.16%) - 13,721,268,897 instructions:u # 2.83 insn per cycle - # 0.04 stalled cycles per insn (75.16%) - 1.385838273 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0) + 4,894,997,970 cycles # 2.684 GHz + 12,339,079,686 instructions # 2.52 insn per cycle + 1.824557454 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.817590e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.829323e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.829323e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 2.417091 sec +INFO: No Floating Point Exceptions have been reported + 4,131,104,953 cycles # 1.707 GHz + 6,332,486,091 instructions # 1.53 insn per cycle + 2.421265188 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt index f82b55abd2..0a65f9fefe 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt @@ -40,190 +40,190 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-05-15_09:27:41 +DATE: 2024-05-16_15:09:55 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.458587e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.486337e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.488763e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.458490e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.485387e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.487802e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.514727 sec +TOTAL : 0.511372 sec INFO: No Floating Point Exceptions have been reported - 2,093,578,737 cycles # 2.818 GHz - 3,271,199,664 instructions # 1.56 insn per cycle - 0.804252228 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst + 2,125,154,213 cycles # 2.818 GHz + 3,305,948,128 instructions # 1.56 insn per cycle + 0.811831996 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 
256 1 --curhst ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.162956e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.196994e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.198404e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.112497e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.145168e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.146541e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.074292 sec +TOTAL : 3.091906 sec INFO: No Floating Point Exceptions have been reported - 9,539,416,967 cycles # 2.854 GHz - 21,898,452,850 instructions # 2.30 insn per cycle - 3.398658909 seconds time elapsed + 9,555,297,501 cycles # 2.852 GHz + 20,467,928,496 instructions # 2.14 insn per cycle + 3.408325542 seconds time elapsed ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 Avg ME (F77/GPU) = 6.6266731198158133E-004 Relative difference = 2.837296512218831e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP 
threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.836687e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.837552e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.837552e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.835837e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.836698e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.836698e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.938893 sec +TOTAL : 8.943298 sec INFO: No Floating Point Exceptions have been reported - 25,599,669,460 cycles # 2.863 GHz - 78,937,757,667 instructions # 3.08 insn per cycle - 8.943169699 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) + 25,616,203,937 cycles # 2.864 GHz + 78,941,981,933 instructions # 3.08 insn per cycle + 8.947377666 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.528823e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.532118e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.532118e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.509219e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.512397e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
3.512397e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.656663 sec +TOTAL : 4.682859 sec INFO: No Floating Point Exceptions have been reported - 12,879,833,669 cycles # 2.764 GHz - 39,279,836,544 instructions # 3.05 insn per cycle - 4.660830472 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) + 12,889,261,061 cycles # 2.751 GHz + 39,280,374,746 instructions # 3.05 insn per cycle + 4.687212544 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= 
+========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.856358e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.871880e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.871880e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.873155e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.889044e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.889044e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.097129 sec +TOTAL : 2.092539 sec INFO: No Floating Point Exceptions have been reported - 5,576,613,834 cycles # 2.655 GHz - 13,686,586,203 instructions # 2.45 insn per cycle - 2.101309122 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) + 5,573,290,015 cycles # 2.659 GHz + 13,685,575,452 instructions # 2.46 insn per cycle + 2.096738730 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.966580e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.986724e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.986724e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.869803e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.890096e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
8.890096e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.838635 sec +TOTAL : 1.858817 sec INFO: No Floating Point Exceptions have been reported - 4,888,562,247 cycles # 2.654 GHz - 12,341,706,398 instructions # 2.52 insn per cycle - 1.842925870 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) + 4,890,594,740 cycles # 2.626 GHz + 12,341,872,390 instructions # 2.52 insn per cycle + 1.863321950 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= 
+========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.714955e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.726732e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.726732e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.721792e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.733746e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.733746e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.451970 sec +TOTAL : 2.449544 sec INFO: No Floating Point Exceptions have been reported - 4,108,647,143 cycles # 1.673 GHz - 6,335,478,400 instructions # 1.54 insn per cycle - 2.456208702 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) + 4,111,968,902 cycles # 1.676 GHz + 6,335,563,564 instructions # 1.54 insn per cycle + 2.453951471 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt index 64d0049a09..b300efd9c0 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt @@ -1,195 +1,236 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
-make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-05-15_16:29:49 +DATE: 2024-05-16_15:07:12 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP= -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst OMP= +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.440619e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.571586e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.572951e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.667982 sec +EvtsPerSec[Rmb+ME] (23) = ( 3.175456e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.487401e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.489887e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.516778 sec INFO: No Floating Point Exceptions have been reported - 1,967,980,387 cycles:u # 2.929 GHz (74.36%) - 2,862,408 stalled-cycles-frontend:u # 0.15% frontend cycles idle (74.36%) - 41,016,977 stalled-cycles-backend:u # 2.08% backend cycles idle (75.01%) - 2,153,418,255 instructions:u # 1.09 insn per cycle - # 0.02 stalled cycles per insn (75.12%) - 0.711595964 seconds time elapsed + 2,106,838,284 cycles # 2.817 GHz + 3,334,047,065 instructions # 1.58 insn per cycle + 0.806903831 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
-runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP= -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst OMP= +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.210461e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.242411e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.242483e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 8.509495 sec +EvtsPerSec[Rmb+ME] (23) = ( 3.725415e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.181222e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.182613e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.208931 sec INFO: No Floating Point Exceptions have been reported - 29,333,761,340 cycles:u # 3.434 GHz (75.00%) - 23,002,873 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.99%) - 1,133,831,790 stalled-cycles-backend:u # 3.87% backend cycles idle (74.99%) - 23,642,738,991 instructions:u # 0.81 insn per cycle - # 0.05 stalled cycles per insn (75.00%) - 8.562224782 seconds time elapsed + 9,884,616,856 cycles # 2.852 GHz + 22,569,706,597 instructions # 2.28 insn per cycle + 3.521271497 seconds time elapsed 
------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158101E-004 -Relative difference = 2.837296517127185e-07 +Avg ME (F77/GPU) = 6.6266731198158133E-004 +Relative difference = 2.837296512218831e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = 
SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.223091e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.223976e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.223976e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 7.388974 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.838805e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.839662e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.839662e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 8.928794 sec INFO: No Floating Point Exceptions have been reported - 25,907,464,121 cycles:u # 3.505 GHz (75.00%) - 8,486,511 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.00%) - 2,965,472,064 stalled-cycles-backend:u # 11.45% backend cycles idle (75.00%) - 81,629,784,811 instructions:u # 3.15 insn per cycle - # 0.04 stalled cycles per insn (75.00%) - 7.670126649 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0) + 25,578,535,475 cycles # 2.864 GHz + 78,941,438,017 instructions # 3.09 insn per cycle + 8.932959256 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 5.044247e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.048783e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.048783e+03 ) 
sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.262983 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.490750e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.493870e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.493870e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.708141 sec INFO: No Floating Point Exceptions have been reported - 11,431,656,379 cycles:u # 3.500 GHz (75.00%) - 574,795 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.02%) - 1,405,469,497 stalled-cycles-backend:u # 12.29% backend cycles idle (75.02%) - 39,149,355,532 instructions:u # 3.42 insn per cycle - # 0.04 stalled cycles per insn (75.02%) - 3.267353644 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0) + 12,873,433,154 cycles # 2.733 GHz + 39,280,620,994 instructions # 3.05 insn per cycle + 4.712353785 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.204809e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.207400e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.207400e+04 ) 
sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.372933 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.853620e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.869024e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.869024e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.097850 sec +INFO: No Floating Point Exceptions have been reported + 5,573,477,429 cycles # 2.652 GHz + 13,685,909,410 instructions # 2.46 insn per cycle + 2.102047066 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) 
[cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.970623e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.991129e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.991129e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.837793 sec INFO: No Floating Point Exceptions have been reported - 4,797,791,497 cycles:u # 3.488 GHz (75.00%) - 341,385 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%) - 532,304,869 stalled-cycles-backend:u # 11.09% backend cycles idle (75.00%) - 13,721,099,517 instructions:u # 2.86 insn per cycle - # 0.04 stalled cycles per insn (75.00%) - 1.377048855 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0) + 4,885,535,539 cycles # 2.653 GHz + 12,340,762,979 instructions # 2.53 insn per cycle + 1.841998870 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.715803e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.727367e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.727367e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.451983 sec +INFO: No Floating Point Exceptions have been reported + 4,110,713,398 cycles # 1.674 GHz + 6,334,867,690 instructions # 1.54 insn per cycle + 2.456147392 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt index a84f43f38d..254c65fd8c 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt @@ -1,193 +1,233 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building 
in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-05-15_15:58:24 +DATE: 2024-05-16_14:40:17 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.382317e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.439244e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.439750e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 -TOTAL : 0.513473 sec +EvtsPerSec[Rmb+ME] (23) = ( 3.472040e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.495257e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.497568e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 
2.368970e+00 ) GeV^-4 +TOTAL : 0.530193 sec INFO: No Floating Point Exceptions have been reported - 1,502,112,076 cycles:u # 2.810 GHz (74.52%) - 2,253,278 stalled-cycles-frontend:u # 0.15% frontend cycles idle (74.12%) - 7,775,573 stalled-cycles-backend:u # 0.52% backend cycles idle (74.58%) - 1,836,916,559 instructions:u # 1.22 insn per cycle - # 0.00 stalled cycles per insn (74.65%) - 0.561281368 seconds time elapsed + 2,179,825,483 cycles # 2.820 GHz + 3,416,926,116 instructions # 1.57 insn per cycle + 0.832303660 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.738429e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.743464e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.743577e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.694853e+01 +- 6.364791e+01 ) GeV^-4 -TOTAL : 6.335273 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.149957e+05 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 4.177471e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.178689e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.025570 sec INFO: No Floating Point Exceptions have been reported - 21,796,344,827 cycles:u # 3.425 GHz (74.90%) - 2,923,143 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.98%) - 6,274,063 stalled-cycles-backend:u # 0.03% backend cycles idle (74.99%) - 17,459,408,904 instructions:u # 0.80 insn per cycle - # 0.00 stalled cycles per insn (75.02%) - 6.389762832 seconds time elapsed + 9,343,829,120 cycles # 2.851 GHz + 20,017,847,921 instructions # 2.14 insn per cycle + 3.337093329 seconds time elapsed ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158101E-004 -Relative difference = 2.837296517127185e-07 +Avg ME (F77/GPU) = 6.6266731198158133E-004 +Relative difference = 2.837296512218831e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) 
-OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.214028e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.214909e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.214909e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 7.418926 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.844549e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.845438e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.845438e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 8.900941 sec INFO: No Floating Point Exceptions have been reported - 26,004,260,668 cycles:u # 3.504 GHz (75.00%) - 18,242,709 stalled-cycles-frontend:u # 0.07% frontend cycles idle (75.00%) - 2,944,961,444 stalled-cycles-backend:u # 11.32% backend cycles idle (75.00%) - 81,674,747,927 instructions:u # 3.14 insn per cycle - # 0.04 stalled cycles per insn (75.00%) - 7.426102928 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 6589) (avx2: 0) (512y: 0) (512z: 0) + 25,492,945,375 cycles # 2.863 GHz + 78,715,017,784 instructions # 3.09 insn per cycle + 8.905151100 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4264) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 5.061721e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.066293e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.066293e+03 ) sec^-1 
-MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.251420 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.432714e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.435728e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.435728e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.786568 sec INFO: No Floating Point Exceptions have been reported - 11,395,658,100 cycles:u # 3.501 GHz (74.98%) - 1,307,293 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.94%) - 1,479,855,615 stalled-cycles-backend:u # 12.99% backend cycles idle (74.93%) - 39,187,358,585 instructions:u # 3.44 insn per cycle - # 0.04 stalled cycles per insn (74.93%) - 3.258428830 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:12771) (avx2: 0) (512y: 0) (512z: 0) + 12,968,671,480 cycles # 2.709 GHz + 39,227,279,421 instructions # 3.02 insn per cycle + 4.790848376 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:12951) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.194480e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.197028e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.197028e+04 ) sec^-1 
-MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.384380 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.791500e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.806568e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.806568e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.114108 sec +INFO: No Floating Point Exceptions have been reported + 5,617,875,214 cycles # 2.653 GHz + 13,801,216,605 instructions # 2.46 insn per cycle + 2.118326582 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11422) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP 
threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.808696e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.827867e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.827867e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.871093 sec INFO: No Floating Point Exceptions have been reported - 4,850,262,020 cycles:u # 3.495 GHz (74.69%) - 444,362 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.68%) - 571,850,754 stalled-cycles-backend:u # 11.79% backend cycles idle (74.97%) - 13,733,484,308 instructions:u # 2.83 insn per cycle - # 0.04 stalled cycles per insn (75.22%) - 1.391537224 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11048) (512y: 0) (512z: 0) + 4,977,184,975 cycles # 2.656 GHz + 12,467,160,434 instructions # 2.50 insn per cycle + 1.875328468 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10258) (512y: 240) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.708154e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.719459e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.719459e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.454314 sec +INFO: No Floating Point Exceptions have been reported + 4,118,637,907 cycles # 1.676 GHz + 6,458,862,875 instructions # 1.57 insn per cycle + 2.458530246 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1647) (512y: 192) (512z: 9375) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt index d21e013e19..452f4e853d 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt @@ -1,193 +1,233 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building 
in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-05-15_16:13:41 +DATE: 2024-05-16_14:53:22 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.394078e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.555997e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.557360e+04 ) sec^-1 -MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 -TOTAL : 0.635308 sec +EvtsPerSec[Rmb+ME] (23) = ( 3.253411e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.278108e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.280152e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 
2.368970e+00 ) GeV^-4 +TOTAL : 0.538797 sec INFO: No Floating Point Exceptions have been reported - 1,928,274,978 cycles:u # 2.952 GHz (74.32%) - 2,152,412 stalled-cycles-frontend:u # 0.11% frontend cycles idle (75.45%) - 5,643,942 stalled-cycles-backend:u # 0.29% backend cycles idle (75.53%) - 2,100,599,060 instructions:u # 1.09 insn per cycle - # 0.00 stalled cycles per insn (75.52%) - 0.680152805 seconds time elapsed + 2,198,780,840 cycles # 2.857 GHz + 3,392,092,682 instructions # 1.54 insn per cycle + 0.826434194 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.241824e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.244364e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.244427e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.694853e+01 +- 6.364791e+01 ) GeV^-4 -TOTAL : 7.691029 sec +EvtsPerSec[Rmb+ME] (23) = ( 3.756018e+05 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 3.782691e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.783822e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.316768 sec INFO: No Floating Point Exceptions have been reported - 26,521,594,349 cycles:u # 3.435 GHz (75.04%) - 3,163,261 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.03%) - 6,509,269 stalled-cycles-backend:u # 0.02% backend cycles idle (75.04%) - 21,161,721,500 instructions:u # 0.80 insn per cycle - # 0.00 stalled cycles per insn (75.03%) - 7.743039364 seconds time elapsed + 10,315,360,608 cycles # 2.881 GHz + 23,624,745,879 instructions # 2.29 insn per cycle + 3.638219909 seconds time elapsed ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158101E-004 -Relative difference = 2.837296517127185e-07 +Avg ME (F77/GPU) = 6.6266731198158122E-004 +Relative difference = 2.837296513854949e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) 
-OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 4.587858e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.588234e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.588234e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 35.758530 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.179521e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.179964e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.179964e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 39.248920 sec INFO: No Floating Point Exceptions have been reported - 125,378,748,124 cycles:u # 3.506 GHz (74.97%) - 9,493,427 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.98%) - 17,315,752,460 stalled-cycles-backend:u # 13.81% backend cycles idle (75.00%) - 140,999,688,570 instructions:u # 1.12 insn per cycle - # 0.12 stalled cycles per insn (75.01%) - 35.768742055 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:21092) (avx2: 0) (512y: 0) (512z: 0) + 113,511,319,041 cycles # 2.892 GHz + 144,820,446,927 instructions # 1.28 insn per cycle + 39.253177511 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:21353) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198140461E-004 -Relative difference = 2.8372991790910424e-07 +Avg ME (F77/C++) = 6.6266731198140450E-004 +Relative difference = 2.83729918072716e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 3.663594e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
3.665950e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.665950e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 4.488523 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.047626e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.050057e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.050057e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 5.390362 sec INFO: No Floating Point Exceptions have been reported - 15,741,680,138 cycles:u # 3.504 GHz (74.92%) - 1,724,770 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%) - 7,822,970,699 stalled-cycles-backend:u # 49.70% backend cycles idle (75.07%) - 37,448,442,237 instructions:u # 2.38 insn per cycle - # 0.21 stalled cycles per insn (75.07%) - 4.496025758 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:68052) (avx2: 0) (512y: 0) (512z: 0) + 14,740,564,650 cycles # 2.733 GHz + 37,575,494,329 instructions # 2.55 insn per cycle + 5.394647902 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:68119) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198141220E-004 -Relative difference = 2.837299064562788e-07 +Avg ME (F77/C++) = 6.6266731198141209E-004 +Relative difference = 2.8372990661989057e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 7.589511e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
7.600071e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.600071e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.172740 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.230737e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.243892e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.243892e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.278344 sec +INFO: No Floating Point Exceptions have been reported + 6,134,003,628 cycles # 2.689 GHz + 13,061,930,844 instructions # 2.13 insn per cycle + 2.282738143 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:46960) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198156789E-004 +Relative difference = 2.837296715097453e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) 
+Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.779670e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.799133e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.799133e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.877722 sec INFO: No Floating Point Exceptions have been reported - 7,597,061,684 cycles:u # 3.491 GHz (75.01%) - 537,764 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%) - 4,403,802,024 stalled-cycles-backend:u # 57.97% backend cycles idle (75.01%) - 12,901,611,122 instructions:u # 1.70 insn per cycle - # 0.34 stalled cycles per insn (75.01%) - 2.180024620 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:46593) (512y: 0) (512z: 0) + 5,068,047,565 cycles # 2.694 GHz + 11,440,450,267 instructions # 2.26 insn per cycle + 1.882139324 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:40434) (512y: 285) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156778E-004 -Relative difference = 2.837296716733571e-07 +Avg ME (F77/C++) = 6.6266731198156789E-004 +Relative difference = 2.837296715097453e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.093705e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.106755e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.106755e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.321771 sec +INFO: No Floating Point Exceptions have been reported + 3,974,444,581 cycles # 1.709 GHz + 5,942,873,144 instructions # 1.50 insn per cycle + 2.326156002 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 2455) (512y: 337) (512z:39411) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198156789E-004 +Relative difference = 2.837296715097453e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt index 5dce3f4a7c..00ea23e18d 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt @@ -1,193 +1,233 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building 
in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-05-15_16:14:41 +DATE: 2024-05-16_14:54:29 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.370095e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.428040e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.428561e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 -TOTAL : 0.516294 sec +EvtsPerSec[Rmb+ME] (23) = ( 3.259147e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.284136e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.286360e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 
2.368970e+00 ) GeV^-4 +TOTAL : 0.536782 sec INFO: No Floating Point Exceptions have been reported - 1,508,920,262 cycles:u # 2.817 GHz (74.13%) - 2,154,249 stalled-cycles-frontend:u # 0.14% frontend cycles idle (74.80%) - 5,821,644 stalled-cycles-backend:u # 0.39% backend cycles idle (74.68%) - 1,849,104,202 instructions:u # 1.23 insn per cycle - # 0.00 stalled cycles per insn (74.64%) - 0.560262555 seconds time elapsed + 2,193,506,190 cycles # 2.857 GHz + 3,337,314,407 instructions # 1.52 insn per cycle + 0.824492176 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.739330e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.744226e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.744350e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.694853e+01 +- 6.364791e+01 ) GeV^-4 -TOTAL : 6.429607 sec +EvtsPerSec[Rmb+ME] (23) = ( 3.761556e+05 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 3.788263e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.789425e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.301197 sec INFO: No Floating Point Exceptions have been reported - 21,778,736,572 cycles:u # 3.424 GHz (74.89%) - 2,923,981 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.99%) - 6,184,764 stalled-cycles-backend:u # 0.03% backend cycles idle (74.98%) - 17,426,204,182 instructions:u # 0.80 insn per cycle - # 0.00 stalled cycles per insn (75.01%) - 6.484529429 seconds time elapsed + 10,264,886,616 cycles # 2.886 GHz + 23,377,018,059 instructions # 2.28 insn per cycle + 3.615104997 seconds time elapsed ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158101E-004 -Relative difference = 2.837296517127185e-07 +Avg ME (F77/GPU) = 6.6266731198158122E-004 +Relative difference = 2.837296513854949e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) 
-OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 4.523620e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.523987e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.523987e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 36.266140 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.170908e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.171353e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.171353e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 39.330349 sec INFO: No Floating Point Exceptions have been reported - 127,158,371,335 cycles:u # 3.506 GHz (74.98%) - 75,046,009 stalled-cycles-frontend:u # 0.06% frontend cycles idle (75.00%) - 18,252,299,466 stalled-cycles-backend:u # 14.35% backend cycles idle (75.01%) - 141,512,071,410 instructions:u # 1.11 insn per cycle - # 0.13 stalled cycles per insn (75.01%) - 36.273412907 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:21725) (avx2: 0) (512y: 0) (512z: 0) + 113,688,017,774 cycles # 2.891 GHz + 144,788,018,158 instructions # 1.27 insn per cycle + 39.334720458 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:20719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198140461E-004 Relative difference = 2.8372991790910424e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 3.673222e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.675673e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.675673e+03 ) sec^-1 
-MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 4.476264 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.974783e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.977013e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.977013e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 5.522774 sec INFO: No Floating Point Exceptions have been reported - 15,690,793,260 cycles:u # 3.503 GHz (75.00%) - 1,996,278 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%) - 7,687,242,311 stalled-cycles-backend:u # 48.99% backend cycles idle (75.00%) - 37,504,259,650 instructions:u # 2.39 insn per cycle - # 0.20 stalled cycles per insn (75.00%) - 4.483480776 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:68056) (avx2: 0) (512y: 0) (512z: 0) + 15,220,566,650 cycles # 2.755 GHz + 37,763,046,074 instructions # 2.48 insn per cycle + 5.527045303 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:68447) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198141220E-004 -Relative difference = 2.837299064562788e-07 +Avg ME (F77/C++) = 6.6266731198141209E-004 +Relative difference = 2.8372990661989057e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 7.744723e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
7.755449e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.755449e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.129140 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.412795e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.426610e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.426610e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.222010 sec +INFO: No Floating Point Exceptions have been reported + 6,000,419,836 cycles # 2.696 GHz + 12,896,174,142 instructions # 2.15 insn per cycle + 2.226315650 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:45929) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198156789E-004 +Relative difference = 2.837296715097453e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) 
+Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.743711e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.762861e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.762861e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.885038 sec INFO: No Floating Point Exceptions have been reported - 7,449,523,167 cycles:u # 3.493 GHz (74.87%) - 392,094 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.87%) - 4,290,565,052 stalled-cycles-backend:u # 57.60% backend cycles idle (74.87%) - 12,820,412,529 instructions:u # 1.72 insn per cycle - # 0.33 stalled cycles per insn (74.96%) - 2.136397182 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:45663) (512y: 0) (512z: 0) + 5,086,798,971 cycles # 2.694 GHz + 11,447,968,989 instructions # 2.25 insn per cycle + 1.889284279 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:40123) (512y: 219) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156778E-004 -Relative difference = 2.837296716733571e-07 +Avg ME (F77/C++) = 6.6266731198156789E-004 +Relative difference = 2.837296715097453e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.141072e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.153903e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.153903e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.306249 sec +INFO: No Floating Point Exceptions have been reported + 3,947,559,408 cycles # 1.709 GHz + 5,896,754,674 instructions # 1.49 insn per cycle + 2.310527958 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1971) (512y: 259) (512z:38937) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198156789E-004 +Relative difference = 2.837296715097453e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index e345be8e70..15bbe59069 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -1,193 +1,233 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building 
in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-05-15_15:58:49 +DATE: 2024-05-16_14:40:50 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.538301e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.734010e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.735595e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.535870e-02 +- 4.279978e-02 ) GeV^-4 -TOTAL : 0.422305 sec +EvtsPerSec[Rmb+ME] (23) = ( 6.326887e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.370559e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.376371e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 
2.368053e+00 ) GeV^-4 +TOTAL : 0.487782 sec INFO: No Floating Point Exceptions have been reported - 1,168,629,497 cycles:u # 2.647 GHz (75.35%) - 2,066,166 stalled-cycles-frontend:u # 0.18% frontend cycles idle (75.13%) - 5,548,523 stalled-cycles-backend:u # 0.47% backend cycles idle (74.66%) - 1,562,808,277 instructions:u # 1.34 insn per cycle - # 0.00 stalled cycles per insn (75.00%) - 0.468337743 seconds time elapsed + 1,984,813,926 cycles # 2.807 GHz + 2,933,686,219 instructions # 1.48 insn per cycle + 0.764328783 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.698210e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.724197e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.724638e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.763201e+01 +- 6.205778e+01 ) GeV^-4 -TOTAL : 2.609834 sec +EvtsPerSec[Rmb+ME] (23) = ( 8.584549e+05 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 8.644337e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.647136e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 +TOTAL : 1.720884 sec INFO: No Floating Point Exceptions have been reported - 8,802,571,270 cycles:u # 3.338 GHz (74.86%) - 2,415,067 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.10%) - 4,820,027 stalled-cycles-backend:u # 0.05% backend cycles idle (75.11%) - 7,381,991,552 instructions:u # 0.84 insn per cycle - # 0.00 stalled cycles per insn (75.04%) - 2.659203909 seconds time elapsed + 5,560,374,951 cycles # 2.849 GHz + 11,900,809,748 instructions # 2.14 insn per cycle + 2.008088048 seconds time elapsed ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/GPU) = 6.626791e-04 -Avg ME (F77/GPU) = 6.6270899361878938E-004 -Relative difference = 4.511024836808726e-05 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626454e-04 +Avg ME (F77/GPU) = 6.6262659968156085E-004 +Relative difference = 2.8371612387547027e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = 
SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.467529e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.468586e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.468586e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 6.656468 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.909633e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.910547e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.910547e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 +TOTAL : 8.596251 sec INFO: No Floating Point Exceptions have been reported - 23,349,191,409 cycles:u # 3.506 GHz (74.98%) - 964,182 stalled-cycles-frontend:u # 0.00% frontend cycles idle (75.01%) - 3,020,129,414 stalled-cycles-backend:u # 12.93% backend cycles idle (75.01%) - 75,781,433,147 instructions:u # 3.25 insn per cycle - # 0.04 stalled cycles per insn (75.01%) - 6.664330081 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) + 24,624,004,022 cycles # 2.864 GHz + 78,129,381,217 instructions # 3.17 insn per cycle + 8.600293639 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274866115424713E-004 -Relative difference = 5.861309557415831e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274863266294753E-004 +Relative difference = 4.92840687132121e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 9.932698e+03 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 9.950490e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.950490e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 1.661008 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.891953e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.904635e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.904635e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.387612 sec INFO: No Floating Point Exceptions have been reported - 5,800,808,519 cycles:u # 3.486 GHz (75.01%) - 616,133 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.01%) - 801,256,663 stalled-cycles-backend:u # 13.81% backend cycles idle (75.01%) - 20,042,368,585 instructions:u # 3.46 insn per cycle - # 0.04 stalled cycles per insn (75.01%) - 1.667945050 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) + 6,469,659,104 cycles # 2.706 GHz + 20,120,611,338 instructions # 3.11 insn per cycle + 2.391816623 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274845946848876E-004 -Relative difference = 6.115670001294808e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274861460025036E-004 +Relative difference = 2.2029847170826283e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.364461e+04 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 2.374777e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.374777e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.703461 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.562010e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.568248e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.568248e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 1.059004 sec INFO: No Floating Point Exceptions have been reported - 2,450,180,140 cycles:u # 3.467 GHz (75.12%) - 319,319 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.10%) - 252,566,161 stalled-cycles-backend:u # 10.31% backend cycles idle (75.10%) - 6,979,289,115 instructions:u # 2.85 insn per cycle - # 0.04 stalled cycles per insn (75.10%) - 0.710302678 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0) + 2,818,181,262 cycles # 2.654 GHz + 6,988,460,270 instructions # 2.48 insn per cycle + 1.063195979 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271947045332125E-004 -Relative difference = 4.4583988847766445e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174574524E-004 +Relative difference = 2.7544470208782633e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.763183e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.771185e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.771185e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.938662 sec +INFO: No Floating Point Exceptions have been reported + 2,488,393,509 cycles # 2.641 GHz + 6,295,244,635 instructions # 2.53 insn per cycle + 0.942828770 seconds 
time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174574524E-004 +Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.363218e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.368048e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.368048e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.211358 sec +INFO: No Floating Point Exceptions have been reported + 2,044,658,355 cycles # 1.683 GHz + 3,265,998,063 instructions # 1.60 insn per cycle + 1.215542758 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2427) (512y: 46) (512z: 
9578) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271952779373838E-004 +Relative difference = 4.193891735414155e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt index 1a53d769b8..e281ad389f 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt @@ -1,204 +1,250 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in 
BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-05-15_16:26:00 +DATE: 2024-05-16_15:03:26 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.529555e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.696847e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.696847e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.202335e-01 +- 3.251521e-01 ) GeV^-4 -TOTAL : 0.438870 sec +EvtsPerSec[Rmb+ME] (23) = ( 5.615502e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.322427e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.322427e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 +TOTAL : 0.477115 sec INFO: No Floating Point Exceptions have been reported - 1,286,892,816 cycles:u # 2.789 GHz (73.86%) - 2,836,732 stalled-cycles-frontend:u # 0.22% frontend cycles idle (74.06%) - 33,819,914 stalled-cycles-backend:u # 2.63% backend cycles idle (74.60%) - 1,635,906,811 instructions:u # 1.27 insn per cycle - # 0.02 stalled cycles per insn (75.68%) - 0.483134647 seconds time elapsed + 1,936,349,619 cycles # 2.809 GHz + 2,877,179,431 instructions # 1.49 insn per cycle + 0.747561501 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.260775e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.701838e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.701838e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.213799e+02 +- 1.195366e+02 ) GeV^-4 -TOTAL : 3.442850 sec +EvtsPerSec[Rmb+ME] (23) = ( 7.243623e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.556013e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.556013e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4 +TOTAL : 1.907816 sec INFO: No Floating Point Exceptions have been reported - 11,602,142,051 cycles:u # 3.342 GHz (74.91%) - 22,272,146 stalled-cycles-frontend:u # 0.19% frontend cycles idle (74.97%) - 1,166,093,542 stalled-cycles-backend:u # 10.05% backend cycles idle (75.11%) - 9,908,851,401 instructions:u # 0.85 insn per cycle - # 0.12 stalled cycles per insn (75.10%) - 3.493619221 seconds time elapsed + 6,131,638,198 cycles # 2.845 GHz + 12,981,768,605 instructions # 2.12 insn per cycle + 2.213144159 seconds time elapsed ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/GPU) = 6.626791e-04 -Avg ME (F77/GPU) = 6.6270899361878938E-004 -Relative difference = 4.511024836808726e-05 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626454e-04 +Avg ME (F77/GPU) = 6.6262659968156085E-004 +Relative difference = 2.8371612387547027e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.465573e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.466633e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.466633e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 6.663414 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.909165e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.910120e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.910120e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 +TOTAL : 8.600727 sec INFO: No Floating Point Exceptions have been reported - 23,375,582,467 cycles:u # 3.506 GHz (74.92%) - 1,998,077 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.97%) - 3,094,020,929 stalled-cycles-backend:u # 13.24% backend cycles idle (75.03%) - 75,778,856,835 instructions:u # 3.24 insn per cycle - # 0.04 stalled cycles per insn (75.04%) - 6.670436994 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) + 24,637,778,479 cycles # 2.864 GHz + 78,132,610,249 instructions # 3.17 insn per cycle + 8.604942209 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe 
[ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274866115424713E-004 -Relative difference = 5.861309557415831e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274863266294753E-004 +Relative difference = 4.92840687132121e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 9.820071e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.838035e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.838035e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 1.681875 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.457452e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.468775e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.468775e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.550311 sec INFO: No Floating Point Exceptions have been reported - 5,818,447,218 cycles:u # 3.461 GHz (74.87%) - 631,971 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.84%) - 818,517,046 stalled-cycles-backend:u # 14.07% backend cycles idle (74.88%) - 20,110,770,654 instructions:u # 3.46 insn per cycle - # 0.04 stalled cycles per insn (74.90%) - 1.688879505 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) + 6,935,275,139 cycles # 2.716 GHz + 20,130,100,658 instructions # 2.90 insn per cycle + 2.554710358 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274845946848876E-004 -Relative difference = 6.115670001294808e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274861460025036E-004 +Relative difference = 2.2029847170826283e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.375608e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.385909e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.385909e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.702501 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.550561e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.557160e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.557160e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 1.068476 sec INFO: No Floating Point Exceptions have been reported - 2,454,393,783 cycles:u # 3.476 GHz (74.97%) - 239,024 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.08%) - 246,569,377 stalled-cycles-backend:u # 10.05% backend cycles idle (75.07%) - 6,976,891,637 instructions:u # 2.84 insn per cycle - # 0.04 stalled cycles per insn (75.07%) - 0.709898530 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0) + 2,830,711,742 cycles # 2.640 GHz + 6,997,830,070 instructions # 2.47 insn per cycle + 1.072903816 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271947045332125E-004 -Relative difference = 4.4583988847766445e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174574524E-004 +Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +WARNING! 
Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.772339e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.780808e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.780808e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.936387 sec +INFO: No Floating Point Exceptions have been reported + 2,497,824,247 cycles # 2.658 GHz + 6,305,168,616 instructions # 2.52 insn per cycle + 0.940674173 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174574524E-004 +Relative difference = 2.7544470208782633e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +WARNING! 
Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.362852e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.367803e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.367803e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.214531 sec +INFO: No Floating Point Exceptions have been reported + 2,054,265,568 cycles # 1.686 GHz + 3,276,400,100 instructions # 1.59 insn per cycle + 1.218830996 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2427) (512y: 46) (512z: 9578) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271952779373838E-004 +Relative difference = 4.193891735414155e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt index 7fcde53999..1c3846a692 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt @@ -1,193 +1,233 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
-make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-05-15_16:32:25 +DATE: 2024-05-16_15:13:15 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --common OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.525311e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.726404e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.728003e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.202247e-01 +- 3.251485e-01 ) GeV^-4 -TOTAL : 0.439368 sec +EvtsPerSec[Rmb+ME] (23) = ( 6.362325e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.415082e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.420724e+05 ) sec^-1 +MeanMatrixElemValue = ( 
4.159397e-01 +- 3.238804e-01 ) GeV^-4 +TOTAL : 0.471632 sec INFO: No Floating Point Exceptions have been reported - 1,270,344,241 cycles:u # 2.734 GHz (73.63%) - 2,448,524 stalled-cycles-frontend:u # 0.19% frontend cycles idle (73.94%) - 34,050,399 stalled-cycles-backend:u # 2.68% backend cycles idle (74.86%) - 1,609,817,107 instructions:u # 1.27 insn per cycle - # 0.02 stalled cycles per insn (75.69%) - 0.486682472 seconds time elapsed + 1,968,514,245 cycles # 2.841 GHz + 2,902,581,432 instructions # 1.47 insn per cycle + 0.750206216 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --common +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.686205e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.717014e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.717447e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.213664e+02 +- 1.195366e+02 ) GeV^-4 -TOTAL : 3.305003 sec 
+EvtsPerSec[Rmb+ME] (23) = ( 8.620229e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.693332e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.696842e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.094367e+02 +- 1.071509e+02 ) GeV^-4 +TOTAL : 1.810763 sec INFO: No Floating Point Exceptions have been reported - 11,105,900,651 cycles:u # 3.329 GHz (75.02%) - 11,040,635 stalled-cycles-frontend:u # 0.10% frontend cycles idle (75.07%) - 1,144,196,478 stalled-cycles-backend:u # 10.30% backend cycles idle (75.07%) - 9,026,212,245 instructions:u # 0.81 insn per cycle - # 0.13 stalled cycles per insn (75.02%) - 3.358176383 seconds time elapsed + 5,856,364,996 cycles # 2.870 GHz + 12,360,478,892 instructions # 2.11 insn per cycle + 2.100167053 seconds time elapsed ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/GPU) = 6.626791e-04 -Avg ME (F77/GPU) = 6.6270899361878938E-004 -Relative difference = 4.511024836808726e-05 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626454e-04 +Avg ME (F77/GPU) = 6.6262659968156085E-004 +Relative difference = 2.8371612387547027e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads 
/ `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.469495e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.470544e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.470544e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 6.651185 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.921942e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.922884e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.922884e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 8.542686 sec INFO: No Floating Point Exceptions have been reported - 23,314,047,464 cycles:u # 3.504 GHz (74.99%) - 1,023,835 stalled-cycles-frontend:u # 0.00% frontend cycles idle (74.99%) - 2,989,172,128 stalled-cycles-backend:u # 12.82% backend cycles idle (74.99%) - 75,802,850,533 instructions:u # 3.25 insn per cycle - # 0.04 stalled cycles per insn (74.99%) - 6.655153632 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) + 24,622,493,732 cycles # 2.881 GHz + 78,127,963,456 instructions # 3.17 insn per cycle + 8.546707601 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274866115424713E-004 -Relative difference = 5.861309557415831e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274863266294753E-004 +Relative difference = 4.92840687132121e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 9.890088e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.907950e+03 ) 
sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.907950e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 1.668323 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.925135e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.937766e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.937766e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208457e-01 +- 3.253445e-01 ) GeV^-4 +TOTAL : 2.378023 sec INFO: No Floating Point Exceptions have been reported - 5,857,650,027 cycles:u # 3.506 GHz (74.75%) - 2,083,109 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.98%) - 822,777,492 stalled-cycles-backend:u # 14.05% backend cycles idle (75.10%) - 20,043,885,758 instructions:u # 3.42 insn per cycle - # 0.04 stalled cycles per insn (75.10%) - 1.672316378 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) + 6,481,719,151 cycles # 2.722 GHz + 20,120,720,773 instructions # 3.10 insn per cycle + 2.382079719 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274845946848876E-004 -Relative difference = 6.115670001294808e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274861460025036E-004 +Relative difference = 2.2029847170826283e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.373497e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.383649e+04 ) 
sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.383649e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.700820 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.581338e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.587855e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.587855e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 +TOTAL : 1.046370 sec INFO: No Floating Point Exceptions have been reported - 2,440,546,902 cycles:u # 3.470 GHz (74.98%) - 205,681 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.98%) - 257,963,467 stalled-cycles-backend:u # 10.57% backend cycles idle (74.98%) - 6,981,870,931 instructions:u # 2.86 insn per cycle - # 0.04 stalled cycles per insn (74.98%) - 0.704600207 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0) + 2,822,358,408 cycles # 2.688 GHz + 6,985,542,199 instructions # 2.48 insn per cycle + 1.050425346 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271947045332125E-004 -Relative difference = 4.4583988847766445e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174574524E-004 +Relative difference = 2.7544470208782633e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.806198e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.814674e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.814674e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 +TOTAL : 0.917862 sec +INFO: No Floating Point Exceptions have been reported + 2,496,110,223 cycles # 2.709 GHz + 6,293,657,033 instructions # 2.52 insn per cycle + 0.921934399 
seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174574524E-004 +Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.393764e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.398765e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.398765e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4 +TOTAL : 1.186576 sec +INFO: No Floating Point Exceptions have been reported + 2,050,577,153 cycles # 1.723 GHz + 3,264,219,053 instructions # 1.59 insn per cycle + 1.190613213 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2427) 
(512y: 46) (512z: 9578) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271952779373838E-004 +Relative difference = 4.193891735414155e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt index 76da60c048..97148e3ba7 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt @@ -40,190 +40,190 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-05-15_09:28:14 +DATE: 2024-05-16_15:10:28 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.328799e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.380015e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.385833e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.326390e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.378340e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.384052e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.466913 sec +TOTAL : 0.468432 sec INFO: No Floating Point Exceptions have been reported - 1,916,716,842 cycles # 2.819 GHz - 2,909,024,684 instructions # 1.52 insn per cycle - 0.736711953 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst + 1,956,153,885 cycles # 2.820 GHz + 2,925,124,547 instructions # 1.50 insn per cycle + 0.750741002 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 
1 --curhst ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.577095e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.650934e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.654290e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.616852e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.690868e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.694290e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.755954 sec +TOTAL : 1.758835 sec INFO: No Floating Point Exceptions have been reported - 5,660,148,395 cycles # 2.847 GHz - 12,178,684,287 instructions # 2.15 insn per cycle - 2.045588387 seconds time elapsed + 5,694,632,258 cycles # 2.846 GHz + 12,170,382,669 instructions # 2.14 insn per cycle + 2.057387110 seconds time elapsed ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626454e-04 Avg ME (F77/GPU) = 6.6262659968156085E-004 Relative difference = 2.8371612387547027e-05 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP 
threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.908931e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.909878e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.909878e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.910170e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.911121e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.911121e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.599612 sec +TOTAL : 8.594641 sec INFO: No Floating Point Exceptions have been reported - 24,627,865,782 cycles # 2.863 GHz - 78,128,090,050 instructions # 3.17 insn per cycle - 8.603669298 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) + 24,610,525,016 cycles # 2.863 GHz + 78,132,278,540 instructions # 3.17 insn per cycle + 8.598723021 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274863266294753E-004 Relative difference = 4.92840687132121e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.850796e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.863212e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.863212e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.888566e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.900956e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
6.900956e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.402591 sec +TOTAL : 2.388862 sec INFO: No Floating Point Exceptions have been reported - 6,479,231,608 cycles # 2.695 GHz - 20,122,059,228 instructions # 3.11 insn per cycle - 2.406715396 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) + 6,476,954,136 cycles # 2.708 GHz + 20,121,920,046 instructions # 3.11 insn per cycle + 2.393015096 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274861460025036E-004 Relative difference = 2.2029847170826283e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= 
+========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.562566e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.568871e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.568871e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.562187e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.568435e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.568435e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.057934 sec +TOTAL : 1.057977 sec INFO: No Floating Point Exceptions have been reported - 2,818,265,341 cycles # 2.655 GHz - 6,988,053,760 instructions # 2.48 insn per cycle - 1.062033801 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) + 2,818,730,747 cycles # 2.656 GHz + 6,988,428,853 instructions # 2.48 insn per cycle + 1.062013241 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174574524E-004 Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.768732e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.776855e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.776855e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.767100e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.775059e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
1.775059e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.935438 sec +TOTAL : 0.936311 sec INFO: No Floating Point Exceptions have been reported - 2,490,297,733 cycles # 2.652 GHz - 6,295,843,677 instructions # 2.53 insn per cycle - 0.939633973 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) + 2,487,367,369 cycles # 2.647 GHz + 6,295,352,067 instructions # 2.53 insn per cycle + 0.940344403 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174574524E-004 Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= 
+========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.362182e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.367071e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.367071e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.354758e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.359575e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.359575e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.212324 sec +TOTAL : 1.218960 sec INFO: No Floating Point Exceptions have been reported - 2,046,105,310 cycles # 1.683 GHz - 3,266,428,243 instructions # 1.60 insn per cycle - 1.216361843 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2427) (512y: 46) (512z: 9578) + 2,057,435,423 cycles # 1.683 GHz + 3,266,628,935 instructions # 1.59 insn per cycle + 1.223151915 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2427) (512y: 46) (512z: 9578) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 Avg ME (F77/C++) = 6.6271952779373838E-004 Relative difference = 4.193891735414155e-08 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt index 2f6c028d0e..dc12ca7aae 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt @@ -1,195 +1,236 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
-make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-05-15_16:30:18 +DATE: 2024-05-16_15:07:45 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP= -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst OMP= +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.562827e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.726981e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.727629e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.202335e-01 +- 3.251521e-01 ) GeV^-4 -TOTAL : 0.432096 sec +EvtsPerSec[Rmb+ME] (23) = ( 5.747793e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.405382e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.411341e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 +TOTAL : 0.473580 sec INFO: No Floating Point Exceptions have been reported - 1,239,375,288 cycles:u # 2.720 GHz (75.36%) - 2,457,751 stalled-cycles-frontend:u # 0.20% frontend cycles idle (75.44%) - 51,259,680 stalled-cycles-backend:u # 4.14% backend cycles idle (75.44%) - 1,635,658,699 instructions:u # 1.32 insn per cycle - # 0.03 stalled cycles per insn (75.48%) - 0.471477263 seconds time elapsed + 1,929,031,590 cycles # 2.811 GHz + 2,902,080,173 instructions # 1.50 insn per cycle + 0.744461149 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
-runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP= -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst OMP= +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.287375e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.712743e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.713274e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.213799e+02 +- 1.195366e+02 ) GeV^-4 -TOTAL : 3.424943 sec +EvtsPerSec[Rmb+ME] (23) = ( 7.464876e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.690964e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.694375e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4 +TOTAL : 1.841417 sec INFO: No Floating Point Exceptions have been reported - 11,507,485,887 cycles:u # 3.351 GHz (74.89%) - 21,524,913 stalled-cycles-frontend:u # 0.19% frontend cycles idle (75.07%) - 1,134,644,407 stalled-cycles-backend:u # 9.86% backend cycles idle (75.09%) - 9,786,678,640 instructions:u # 0.85 insn per cycle - # 0.12 stalled cycles per insn (75.08%) - 3.469627658 seconds time elapsed + 5,892,322,421 cycles # 2.846 GHz + 12,206,550,799 instructions # 2.07 insn per cycle + 2.128532659 seconds time elapsed 
------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/GPU) = 6.626791e-04 -Avg ME (F77/GPU) = 6.6270899361878938E-004 -Relative difference = 4.511024836808726e-05 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626454e-04 +Avg ME (F77/GPU) = 6.6262659968156085E-004 +Relative difference = 2.8371612387547027e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, 
FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.461528e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.462582e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.462582e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 6.672502 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.911509e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.912427e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.912427e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 +TOTAL : 8.588000 sec INFO: No Floating Point Exceptions have been reported - 23,348,014,194 cycles:u # 3.499 GHz (74.96%) - 1,432,309 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.95%) - 3,047,985,824 stalled-cycles-backend:u # 13.05% backend cycles idle (74.96%) - 75,854,593,993 instructions:u # 3.25 insn per cycle - # 0.04 stalled cycles per insn (74.97%) - 6.676447910 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) + 24,603,486,303 cycles # 2.864 GHz + 78,128,844,221 instructions # 3.18 insn per cycle + 8.592028071 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 
tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274866115424713E-004 -Relative difference = 5.861309557415831e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274863266294753E-004 +Relative difference = 4.92840687132121e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = 
( 9.878591e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.896526e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.896526e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 1.670139 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.897521e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.909886e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.909886e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.385730 sec INFO: No Floating Point Exceptions have been reported - 5,859,735,192 cycles:u # 3.503 GHz (74.68%) - 357,232 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.90%) - 817,209,696 stalled-cycles-backend:u # 13.95% backend cycles idle (75.13%) - 20,050,584,379 instructions:u # 3.42 insn per cycle - # 0.04 stalled cycles per insn (75.13%) - 1.674097325 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) + 6,477,077,766 cycles # 2.711 GHz + 20,121,628,941 instructions # 3.11 insn per cycle + 2.389950461 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274845946848876E-004 -Relative difference = 6.115670001294808e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274861460025036E-004 +Relative difference = 2.2029847170826283e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 
2.374346e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.384755e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.384755e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.700499 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.564279e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.570570e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.570570e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 1.056604 sec INFO: No Floating Point Exceptions have been reported - 2,440,629,579 cycles:u # 3.471 GHz (74.97%) - 229,723 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.97%) - 247,928,603 stalled-cycles-backend:u # 10.16% backend cycles idle (74.97%) - 6,980,386,979 instructions:u # 2.86 insn per cycle - # 0.04 stalled cycles per insn (74.97%) - 0.704321246 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0) + 2,817,814,854 cycles # 2.658 GHz + 6,988,003,654 instructions # 2.48 insn per cycle + 1.060745031 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271947045332125E-004 -Relative difference = 4.4583988847766445e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174574524E-004 +Relative difference = 2.7544470208782633e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.769248e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.777272e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.777272e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.935150 sec +INFO: No Floating Point Exceptions have been reported + 2,489,664,656 cycles # 2.652 GHz + 6,295,373,565 instructions # 2.53 insn per cycle + 0.939255376 
seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174574524E-004 +Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.359069e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.363945e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.363945e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.214973 sec +INFO: No Floating Point Exceptions have been reported + 2,049,104,437 cycles # 1.682 GHz + 3,266,431,248 instructions # 1.59 insn per cycle + 1.219018056 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2427) 
(512y: 46) (512z: 9578) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271952779373838E-004 +Relative difference = 4.193891735414155e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt index 3efaa8b447..fb9b3d5f50 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt @@ -1,193 +1,233 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for 
tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-05-15_15:59:07 +DATE: 2024-05-16_14:41:16 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.534350e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.718064e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.719919e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.535870e-02 +- 4.279978e-02 ) GeV^-4 -TOTAL : 0.422348 sec +EvtsPerSec[Rmb+ME] (23) = ( 6.355693e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.401036e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.406994e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 
2.368053e+00 ) GeV^-4 +TOTAL : 0.484635 sec INFO: No Floating Point Exceptions have been reported - 1,182,384,413 cycles:u # 2.672 GHz (74.19%) - 2,177,985 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.55%) - 5,597,910 stalled-cycles-backend:u # 0.47% backend cycles idle (74.72%) - 1,592,050,914 instructions:u # 1.35 insn per cycle - # 0.00 stalled cycles per insn (75.07%) - 0.468920316 seconds time elapsed + 1,972,569,534 cycles # 2.816 GHz + 2,939,499,932 instructions # 1.49 insn per cycle + 0.757402101 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.708072e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.741034e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.741481e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.763201e+01 +- 6.205778e+01 ) GeV^-4 -TOTAL : 2.631260 sec +EvtsPerSec[Rmb+ME] (23) = ( 8.619113e+05 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 8.679355e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.682149e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 +TOTAL : 1.713739 sec INFO: No Floating Point Exceptions have been reported - 8,744,418,917 cycles:u # 3.322 GHz (75.08%) - 2,411,124 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.11%) - 4,627,952 stalled-cycles-backend:u # 0.05% backend cycles idle (75.12%) - 7,473,196,726 instructions:u # 0.85 insn per cycle - # 0.00 stalled cycles per insn (75.09%) - 2.679032858 seconds time elapsed + 5,540,767,327 cycles # 2.848 GHz + 11,699,037,597 instructions # 2.11 insn per cycle + 2.001424634 seconds time elapsed ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/GPU) = 6.626791e-04 -Avg ME (F77/GPU) = 6.6270899361878938E-004 -Relative difference = 4.511024836808726e-05 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626454e-04 +Avg ME (F77/GPU) = 6.6262659968156085E-004 +Relative difference = 2.8371612387547027e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = 
SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.471911e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.472964e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.472964e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 6.644527 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.914726e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.915647e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.915647e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 +TOTAL : 8.573445 sec INFO: No Floating Point Exceptions have been reported - 23,292,200,546 cycles:u # 3.504 GHz (74.97%) - 941,401 stalled-cycles-frontend:u # 0.00% frontend cycles idle (74.97%) - 2,793,996,180 stalled-cycles-backend:u # 12.00% backend cycles idle (74.97%) - 75,764,862,603 instructions:u # 3.25 insn per cycle - # 0.04 stalled cycles per insn (74.97%) - 6.651612901 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3848) (avx2: 0) (512y: 0) (512z: 0) + 24,558,709,341 cycles # 2.863 GHz + 77,854,833,330 instructions # 3.17 insn per cycle + 8.577561930 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3114) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274866108667618E-004 -Relative difference = 5.871505118544242e-08 +Avg ME (F77/C++) = 6.6274866268634797E-004 +Relative difference = 5.630135835748959e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 9.941464e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
9.959132e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.959132e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 1.659366 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.975982e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.989277e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.989277e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.359034 sec INFO: No Floating Point Exceptions have been reported - 5,807,726,656 cycles:u # 3.493 GHz (75.03%) - 691,045 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.98%) - 927,786,568 stalled-cycles-backend:u # 15.98% backend cycles idle (74.98%) - 20,036,844,991 instructions:u # 3.45 insn per cycle - # 0.05 stalled cycles per insn (74.98%) - 1.666336859 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13231) (avx2: 0) (512y: 0) (512z: 0) + 6,427,044,365 cycles # 2.721 GHz + 20,086,102,386 instructions # 3.13 insn per cycle + 2.363343503 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13452) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274845946848876E-004 -Relative difference = 6.115670001294808e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274861465384638E-004 +Relative difference = 2.211071647257023e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.385038e+04 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 2.395436e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.395436e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.697217 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.504468e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.510257e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.510257e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 1.097904 sec INFO: No Floating Point Exceptions have been reported - 2,432,905,799 cycles:u # 3.473 GHz (74.99%) - 224,108 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.88%) - 295,138,287 stalled-cycles-backend:u # 12.13% backend cycles idle (74.88%) - 6,977,300,556 instructions:u # 2.87 insn per cycle - # 0.04 stalled cycles per insn (74.88%) - 0.704182951 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11587) (512y: 0) (512z: 0) + 2,922,790,348 cycles # 2.654 GHz + 7,129,934,034 instructions # 2.44 insn per cycle + 1.101954791 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12261) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271947045332125E-004 -Relative difference = 4.4583988847766445e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271939668077068E-004 +Relative difference = 5.008498817890231e-09 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.699271e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.706623e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.706623e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.973114 sec +INFO: No Floating Point Exceptions have been reported + 2,595,556,237 cycles # 2.658 GHz + 6,438,662,691 instructions # 2.48 insn per cycle + 0.977341866 seconds 
time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11276) (512y: 27) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271939668077068E-004 +Relative difference = 5.008498817890231e-09 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.316379e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.321013e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.321013e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.254012 sec +INFO: No Floating Point Exceptions have been reported + 2,116,081,195 cycles # 1.683 GHz + 3,427,806,501 instructions # 1.62 insn per cycle + 1.258282002 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2924) (512y: 22) (512z: 9654) 
------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271952032322112E-004 +Relative difference = 3.066639970473621e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt index 71b4455a14..3f8f67a608 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt @@ -1,193 +1,233 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for 
tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-05-15_16:15:40 +DATE: 2024-05-16_14:55:36 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.563101e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.729671e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.731133e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.535870e-02 +- 4.279978e-02 ) GeV^-4 -TOTAL : 0.421043 sec +EvtsPerSec[Rmb+ME] (23) = ( 5.546626e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.586411e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.590653e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 
2.368053e+00 ) GeV^-4 +TOTAL : 0.493388 sec INFO: No Floating Point Exceptions have been reported - 1,178,996,820 cycles:u # 2.673 GHz (74.34%) - 2,271,227 stalled-cycles-frontend:u # 0.19% frontend cycles idle (74.50%) - 5,966,417 stalled-cycles-backend:u # 0.51% backend cycles idle (74.63%) - 1,576,303,170 instructions:u # 1.34 insn per cycle - # 0.00 stalled cycles per insn (74.92%) - 0.467983265 seconds time elapsed + 2,052,942,224 cycles # 2.851 GHz + 3,071,897,705 instructions # 1.50 insn per cycle + 0.778091403 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.698669e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.723715e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.724146e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.763201e+01 +- 6.205778e+01 ) GeV^-4 -TOTAL : 2.608747 sec +EvtsPerSec[Rmb+ME] (23) = ( 7.711232e+05 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 7.769137e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.771784e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 +TOTAL : 1.862053 sec INFO: No Floating Point Exceptions have been reported - 8,761,412,499 cycles:u # 3.326 GHz (75.10%) - 2,390,369 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.08%) - 4,812,855 stalled-cycles-backend:u # 0.05% backend cycles idle (75.02%) - 7,478,919,047 instructions:u # 0.85 insn per cycle - # 0.00 stalled cycles per insn (75.00%) - 2.658405479 seconds time elapsed + 6,039,204,312 cycles # 2.872 GHz + 11,937,016,347 instructions # 1.98 insn per cycle + 2.158356809 seconds time elapsed ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/GPU) = 6.626791e-04 -Avg ME (F77/GPU) = 6.6270899361878938E-004 -Relative difference = 4.511024836808726e-05 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626454e-04 +Avg ME (F77/GPU) = 6.6262660579844562E-004 +Relative difference = 2.836238137986709e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = 
SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 6.224731e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.225405e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.225405e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.204931e-01 +- 3.252405e-01 ) GeV^-4 -TOTAL : 26.356894 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.454006e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.454774e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.454774e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4 +TOTAL : 30.077717 sec INFO: No Floating Point Exceptions have been reported - 92,378,281,750 cycles:u # 3.505 GHz (74.99%) - 527,283,671 stalled-cycles-frontend:u # 0.57% frontend cycles idle (74.99%) - 6,727,526,078 stalled-cycles-backend:u # 7.28% backend cycles idle (74.99%) - 133,727,657,284 instructions:u # 1.45 insn per cycle - # 0.05 stalled cycles per insn (74.99%) - 26.364122211 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:16437) (avx2: 0) (512y: 0) (512z: 0) + 86,228,096,895 cycles # 2.867 GHz + 135,581,749,205 instructions # 1.57 insn per cycle + 30.081848617 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:15593) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627534e-04 -Avg ME (F77/C++) = 6.6275342244977858E-004 -Relative difference = 3.387350194093721e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627535e-04 +Avg ME (F77/C++) = 6.6275351196781740E-004 +Relative difference = 1.805772034719401e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 8.428594e+03 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 8.441253e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.441253e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.211992e-01 +- 3.254573e-01 ) GeV^-4 -TOTAL : 1.955855 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.767198e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.779100e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.779100e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4 +TOTAL : 2.432356 sec INFO: No Floating Point Exceptions have been reported - 6,866,879,302 cycles:u # 3.505 GHz (74.79%) - 366,236 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.85%) - 3,373,512,128 stalled-cycles-backend:u # 49.13% backend cycles idle (75.04%) - 19,109,724,267 instructions:u # 2.78 insn per cycle - # 0.18 stalled cycles per insn (75.09%) - 1.962929375 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:68898) (avx2: 0) (512y: 0) (512z: 0) + 6,776,462,064 cycles # 2.783 GHz + 19,386,992,522 instructions # 2.86 insn per cycle + 2.436630257 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:69681) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274857053714997E-004 -Relative difference = 4.445554471174176e-08 +Avg ME (F77/C++) = 6.6274862707273868E-004 +Relative difference = 4.0849182767952624e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.480723e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
1.484775e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.484775e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.211846e-01 +- 3.254638e-01 ) GeV^-4 -TOTAL : 1.117590 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.415254e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.420302e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.420302e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 +TOTAL : 1.167177 sec +INFO: No Floating Point Exceptions have been reported + 3,174,327,264 cycles # 2.711 GHz + 6,807,988,001 instructions # 2.14 insn per cycle + 1.171487938 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:49077) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627273e-04 +Avg ME (F77/C++) = 6.6272731558747466E-004 +Relative difference = 2.3520194007978538e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal 
loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.702865e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.710950e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.710950e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 +TOTAL : 0.971352 sec INFO: No Floating Point Exceptions have been reported - 3,895,917,613 cycles:u # 3.476 GHz (75.02%) - 406,967 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.02%) - 2,245,604,879 stalled-cycles-backend:u # 57.64% backend cycles idle (75.02%) - 6,710,083,492 instructions:u # 1.72 insn per cycle - # 0.33 stalled cycles per insn (75.02%) - 1.124488383 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:48625) (512y: 0) (512z: 0) + 2,641,359,018 cycles # 2.709 GHz + 5,985,956,533 instructions # 2.27 insn per cycle + 0.975633569 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:42677) (512y: 11) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627274e-04 -Avg ME (F77/C++) = 6.6272735722101156E-004 -Relative difference = 6.454990161554483e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627273e-04 +Avg ME (F77/C++) = 6.6272731558747466E-004 +Relative difference = 2.3520194007978538e-08 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.382138e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.387143e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.387143e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060904e+00 +- 2.367377e+00 ) GeV^-4 +TOTAL : 1.195051 sec +INFO: No Floating Point Exceptions have been reported + 2,079,765,601 cycles # 1.735 GHz + 3,501,460,071 instructions # 1.68 insn per cycle + 1.199295448 seconds 
time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5210) (512y: 3) (512z:44829) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627275e-04 +Avg ME (F77/C++) = 6.6272750363879224E-004 +Relative difference = 5.490631193034436e-09 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt index e4136e56b7..f651d28060 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt @@ -1,193 +1,233 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand 
HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-05-15_16:16:21 +DATE: 2024-05-16_14:56:25 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.555444e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.735249e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.735948e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.535870e-02 +- 4.279978e-02 ) GeV^-4 -TOTAL : 0.438925 sec +EvtsPerSec[Rmb+ME] (23) = ( 5.511551e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.549792e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.554590e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 
2.368053e+00 ) GeV^-4 +TOTAL : 0.495451 sec INFO: No Floating Point Exceptions have been reported - 1,192,597,442 cycles:u # 2.710 GHz (74.12%) - 2,199,726 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.58%) - 5,840,015 stalled-cycles-backend:u # 0.49% backend cycles idle (75.09%) - 1,542,600,689 instructions:u # 1.29 insn per cycle - # 0.00 stalled cycles per insn (76.18%) - 0.483953210 seconds time elapsed + 2,081,587,427 cycles # 2.833 GHz + 3,058,350,902 instructions # 1.47 insn per cycle + 0.791962629 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.708918e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.735812e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.736260e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.763201e+01 +- 6.205778e+01 ) GeV^-4 -TOTAL : 2.675956 sec +EvtsPerSec[Rmb+ME] (23) = ( 7.609535e+05 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 7.664835e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.667390e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 +TOTAL : 1.882393 sec INFO: No Floating Point Exceptions have been reported - 8,744,995,176 cycles:u # 3.323 GHz (75.04%) - 2,329,939 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.07%) - 4,549,200 stalled-cycles-backend:u # 0.05% backend cycles idle (75.03%) - 7,448,478,363 instructions:u # 0.85 insn per cycle - # 0.00 stalled cycles per insn (75.03%) - 2.723248557 seconds time elapsed + 6,108,387,546 cycles # 2.880 GHz + 12,340,826,531 instructions # 2.02 insn per cycle + 2.177538628 seconds time elapsed ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/GPU) = 6.626791e-04 -Avg ME (F77/GPU) = 6.6270899361878938E-004 -Relative difference = 4.511024836808726e-05 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626454e-04 +Avg ME (F77/GPU) = 6.6262660579844562E-004 +Relative difference = 2.836238137986709e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = 
SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 6.356777e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.357473e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.357473e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.204931e-01 +- 3.252404e-01 ) GeV^-4 -TOTAL : 25.809750 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.501753e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.502516e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.502516e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4 +TOTAL : 29.816879 sec INFO: No Floating Point Exceptions have been reported - 90,470,344,269 cycles:u # 3.505 GHz (74.99%) - 173,834,865 stalled-cycles-frontend:u # 0.19% frontend cycles idle (74.99%) - 6,717,529,314 stalled-cycles-backend:u # 7.43% backend cycles idle (74.99%) - 134,197,137,791 instructions:u # 1.48 insn per cycle - # 0.05 stalled cycles per insn (74.99%) - 25.816836967 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:16752) (avx2: 0) (512y: 0) (512z: 0) + 86,332,810,324 cycles # 2.895 GHz + 136,005,056,328 instructions # 1.58 insn per cycle + 29.820915946 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:15571) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627534e-04 -Avg ME (F77/C++) = 6.6275342811702997E-004 -Relative difference = 4.242457295829522e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627535e-04 +Avg ME (F77/C++) = 6.6275348988418387E-004 +Relative difference = 1.5263316105958472e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 8.233502e+03 ) 
sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.245586e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.245586e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.211992e-01 +- 3.254573e-01 ) GeV^-4 -TOTAL : 2.001883 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.665500e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.677672e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.677672e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4 +TOTAL : 2.468426 sec INFO: No Floating Point Exceptions have been reported - 7,007,440,529 cycles:u # 3.495 GHz (74.98%) - 1,787,441 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.88%) - 3,067,895,945 stalled-cycles-backend:u # 43.78% backend cycles idle (74.87%) - 19,195,171,512 instructions:u # 2.74 insn per cycle - # 0.16 stalled cycles per insn (74.87%) - 2.008917088 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:68882) (avx2: 0) (512y: 0) (512z: 0) + 6,845,942,231 cycles # 2.769 GHz + 19,438,050,467 instructions # 2.84 insn per cycle + 2.472841886 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:69723) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274857044990032E-004 -Relative difference = 4.4587192899226015e-08 +Avg ME (F77/C++) = 6.6274862764021530E-004 +Relative difference = 4.170542995014107e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.499144e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
1.503166e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.503166e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.211846e-01 +- 3.254638e-01 ) GeV^-4 -TOTAL : 1.103765 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.417376e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.422501e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.422501e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 +TOTAL : 1.165327 sec +INFO: No Floating Point Exceptions have been reported + 3,124,352,057 cycles # 2.673 GHz + 6,718,803,660 instructions # 2.15 insn per cycle + 1.169556736 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:47667) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627273e-04 +Avg ME (F77/C++) = 6.6272731651051409E-004 +Relative difference = 2.4912983202981302e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal 
loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.691458e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.698947e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.698947e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 +TOTAL : 0.977716 sec INFO: No Floating Point Exceptions have been reported - 3,857,870,137 cycles:u # 3.485 GHz (74.76%) - 570,661 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.71%) - 2,192,082,521 stalled-cycles-backend:u # 56.82% backend cycles idle (74.71%) - 6,681,566,942 instructions:u # 1.73 insn per cycle - # 0.33 stalled cycles per insn (74.90%) - 1.110730332 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:47416) (512y: 0) (512z: 0) + 2,637,169,918 cycles # 2.688 GHz + 5,969,286,098 instructions # 2.26 insn per cycle + 0.981942660 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:41842) (512y: 13) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627274e-04 -Avg ME (F77/C++) = 6.6272735755491807E-004 -Relative difference = 6.404606472340801e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627273e-04 +Avg ME (F77/C++) = 6.6272731651051409E-004 +Relative difference = 2.4912983202981302e-08 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.359514e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.364207e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.364207e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060904e+00 +- 2.367377e+00 ) GeV^-4 +TOTAL : 1.214640 sec +INFO: No Floating Point Exceptions have been reported + 2,077,190,375 cycles # 1.705 GHz + 3,494,266,618 instructions # 1.68 insn per cycle + 1.219012886 seconds 
time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4174) (512y: 4) (512z:44472) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627275e-04 +Avg ME (F77/C++) = 6.6272750384530066E-004 +Relative difference = 5.80223501432476e-09 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index efb7e8f517..19b36f52e3 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -1,193 +1,233 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand 
HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-05-15_15:59:24 +DATE: 2024-05-16_14:41:41 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.406054e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.555668e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.557044e+04 ) sec^-1 -MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 -TOTAL : 0.634300 sec +EvtsPerSec[Rmb+ME] (23) = ( 3.461685e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.485212e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.487571e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 
2.368970e+00 ) GeV^-4 +TOTAL : 0.529512 sec INFO: No Floating Point Exceptions have been reported - 1,913,340,364 cycles:u # 2.926 GHz (74.47%) - 2,124,225 stalled-cycles-frontend:u # 0.11% frontend cycles idle (75.44%) - 6,045,559 stalled-cycles-backend:u # 0.32% backend cycles idle (75.55%) - 2,113,671,614 instructions:u # 1.10 insn per cycle - # 0.00 stalled cycles per insn (74.98%) - 0.680569111 seconds time elapsed + 2,178,837,447 cycles # 2.822 GHz + 3,364,663,947 instructions # 1.54 insn per cycle + 0.830788537 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.243150e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.245621e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.245684e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.694853e+01 +- 6.364791e+01 ) GeV^-4 -TOTAL : 7.684762 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.126264e+05 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 4.153475e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.154649e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.029127 sec INFO: No Floating Point Exceptions have been reported - 26,524,644,220 cycles:u # 3.438 GHz (75.00%) - 2,990,750 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.99%) - 5,314,402 stalled-cycles-backend:u # 0.02% backend cycles idle (75.01%) - 21,105,750,957 instructions:u # 0.80 insn per cycle - # 0.00 stalled cycles per insn (75.02%) - 7.739734597 seconds time elapsed + 9,370,132,482 cycles # 2.854 GHz + 19,961,685,193 instructions # 2.13 insn per cycle + 3.339329204 seconds time elapsed ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 Avg ME (F77/GPU) = 6.6266732376103494E-004 Relative difference = 2.659538381540814e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.175512e+03 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 2.176365e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.176365e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 7.550207 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.814801e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.815638e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.815638e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 9.046315 sec INFO: No Floating Point Exceptions have been reported - 26,485,092,444 cycles:u # 3.506 GHz (74.98%) - 25,690,494 stalled-cycles-frontend:u # 0.10% frontend cycles idle (75.01%) - 3,653,274,032 stalled-cycles-backend:u # 13.79% backend cycles idle (75.01%) - 82,369,927,258 instructions:u # 3.11 insn per cycle - # 0.04 stalled cycles per insn (75.01%) - 7.557542229 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 6623) (avx2: 0) (512y: 0) (512z: 0) + 25,898,843,827 cycles # 2.862 GHz + 79,438,691,532 instructions # 3.07 insn per cycle + 9.050548799 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4858) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731406016235E-004 Relative difference = 2.8059296349552523e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 5.093761e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.098498e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.098498e+03 ) sec^-1 
-MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.231168 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.427576e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.430583e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.430583e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.794109 sec INFO: No Floating Point Exceptions have been reported - 11,330,244,281 cycles:u # 3.503 GHz (74.96%) - 3,524,069 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.02%) - 1,430,983,502 stalled-cycles-backend:u # 12.63% backend cycles idle (75.02%) - 38,463,632,777 instructions:u # 3.39 insn per cycle - # 0.04 stalled cycles per insn (75.03%) - 3.238452285 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:12755) (avx2: 0) (512y: 0) (512z: 0) + 12,707,110,349 cycles # 2.649 GHz + 38,549,995,901 instructions # 3.03 insn per cycle + 4.798432518 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13163) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730246908442E-004 Relative difference = 2.98084507782618e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.222556e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.225223e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.225223e+04 ) sec^-1 
-MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.352936 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.947160e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.962973e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.962973e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.073357 sec +INFO: No Floating Point Exceptions have been reported + 5,517,673,615 cycles # 2.658 GHz + 13,479,814,632 instructions # 2.44 insn per cycle + 2.077628129 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11242) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP 
threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.092853e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.113453e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.113453e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.813039 sec INFO: No Floating Point Exceptions have been reported - 4,737,672,929 cycles:u # 3.493 GHz (74.70%) - 588,360 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.66%) - 463,826,293 stalled-cycles-backend:u # 9.79% backend cycles idle (74.92%) - 13,523,757,879 instructions:u # 2.85 insn per cycle - # 0.03 stalled cycles per insn (75.20%) - 1.360047826 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10944) (512y: 0) (512z: 0) + 4,828,852,439 cycles # 2.658 GHz + 12,135,084,334 instructions # 2.51 insn per cycle + 1.817332368 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10154) (512y: 79) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276836E-004 -Relative difference = 2.9563428359824236e-07 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.671146e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.682190e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.682190e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.468744 sec +INFO: No Floating Point Exceptions have been reported + 4,141,507,976 cycles # 1.676 GHz + 6,337,241,929 instructions # 1.53 insn per cycle + 2.472886901 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1803) (512y: 93) (512z: 9358) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt index c13c1962d8..10c707e81e 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt @@ -1,193 +1,233 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building 
in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-05-15_15:59:51 +DATE: 2024-05-16_14:42:15 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.391539e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.447684e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.448332e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 -TOTAL : 0.514066 sec +EvtsPerSec[Rmb+ME] (23) = ( 3.482619e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.506518e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.508744e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 
2.368970e+00 ) GeV^-4 +TOTAL : 0.530792 sec INFO: No Floating Point Exceptions have been reported - 1,514,719,274 cycles:u # 2.832 GHz (73.48%) - 2,184,743 stalled-cycles-frontend:u # 0.14% frontend cycles idle (74.85%) - 6,269,032 stalled-cycles-backend:u # 0.41% backend cycles idle (74.87%) - 1,874,596,872 instructions:u # 1.24 insn per cycle - # 0.00 stalled cycles per insn (74.60%) - 0.560203306 seconds time elapsed + 2,175,803,522 cycles # 2.817 GHz + 3,378,965,043 instructions # 1.55 insn per cycle + 0.832396723 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.736537e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.741332e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.741454e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.694853e+01 +- 6.364791e+01 ) GeV^-4 -TOTAL : 6.340456 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.148343e+05 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 4.175835e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.177034e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.021680 sec INFO: No Floating Point Exceptions have been reported - 21,777,848,510 cycles:u # 3.418 GHz (75.05%) - 2,846,897 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.06%) - 5,785,834 stalled-cycles-backend:u # 0.03% backend cycles idle (75.02%) - 17,480,234,133 instructions:u # 0.80 insn per cycle - # 0.00 stalled cycles per insn (74.99%) - 6.395018977 seconds time elapsed + 9,373,127,330 cycles # 2.855 GHz + 21,008,547,067 instructions # 2.24 insn per cycle + 3.339100414 seconds time elapsed ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 Avg ME (F77/GPU) = 6.6266732376103494E-004 Relative difference = 2.659538381540814e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.206402e+03 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 2.207273e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.207273e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 7.444427 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.816314e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.817146e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.817146e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 9.038908 sec INFO: No Floating Point Exceptions have been reported - 26,080,803,671 cycles:u # 3.502 GHz (74.97%) - 8,879,835 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.97%) - 3,448,291,666 stalled-cycles-backend:u # 13.22% backend cycles idle (74.97%) - 82,322,663,207 instructions:u # 3.16 insn per cycle - # 0.04 stalled cycles per insn (74.99%) - 7.452187120 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 6491) (avx2: 0) (512y: 0) (512z: 0) + 25,883,565,851 cycles # 2.863 GHz + 79,454,182,113 instructions # 3.07 insn per cycle + 9.043041112 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4505) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731406016235E-004 Relative difference = 2.8059296349552523e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 5.116013e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.120719e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.120719e+03 ) sec^-1 
-MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.217094 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.444359e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.447378e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.447378e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.770841 sec INFO: No Floating Point Exceptions have been reported - 11,270,729,218 cycles:u # 3.500 GHz (74.93%) - 3,423,653 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.91%) - 1,343,947,151 stalled-cycles-backend:u # 11.92% backend cycles idle (74.91%) - 38,545,738,505 instructions:u # 3.42 insn per cycle - # 0.03 stalled cycles per insn (74.93%) - 3.224245404 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:12729) (avx2: 0) (512y: 0) (512z: 0) + 12,673,930,469 cycles # 2.656 GHz + 38,521,208,960 instructions # 3.04 insn per cycle + 4.775036357 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:12930) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730246908442E-004 Relative difference = 2.98084507782618e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.221203e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.223851e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.223851e+04 ) sec^-1 
-MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.354247 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.869925e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.885399e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.885399e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.093382 sec +INFO: No Floating Point Exceptions have been reported + 5,571,627,209 cycles # 2.657 GHz + 13,607,217,607 instructions # 2.44 insn per cycle + 2.097652206 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11327) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP 
threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.920636e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.941759e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.941759e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.847928 sec INFO: No Floating Point Exceptions have been reported - 4,741,335,297 cycles:u # 3.492 GHz (74.68%) - 405,247 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.73%) - 523,135,036 stalled-cycles-backend:u # 11.03% backend cycles idle (75.02%) - 13,528,084,025 instructions:u # 2.85 insn per cycle - # 0.04 stalled cycles per insn (75.26%) - 1.361943677 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10926) (512y: 0) (512z: 0) + 4,911,801,030 cycles # 2.653 GHz + 12,271,296,407 instructions # 2.50 insn per cycle + 1.852091714 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10143) (512y: 239) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276836E-004 -Relative difference = 2.9563428359824236e-07 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.658898e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.670376e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.670376e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.472418 sec +INFO: No Floating Point Exceptions have been reported + 4,148,038,447 cycles # 1.675 GHz + 6,442,551,576 instructions # 1.55 insn per cycle + 2.476725391 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1628) (512y: 191) (512z: 9356) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index c9f0b40010..5b5bd116a3 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -1,193 +1,233 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand 
HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-05-15_16:00:37 +DATE: 2024-05-16_14:44:06 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.980820e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.986296e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.986398e+01 ) sec^-1 -MeanMatrixElemValue = ( 3.297256e-04 +- 2.011325e-04 ) GeV^-6 -TOTAL : 9.456571 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.065678e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.066059e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.066269e+02 ) sec^-1 +MeanMatrixElemValue = ( 
1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 2.433623 sec INFO: No Floating Point Exceptions have been reported - 32,758,675,987 cycles:u # 3.456 GHz (75.00%) - 3,432,440 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.97%) - 7,654,967 stalled-cycles-backend:u # 0.02% backend cycles idle (75.00%) - 25,905,703,431 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (75.01%) - 9.506575067 seconds time elapsed + 7,848,341,490 cycles # 2.847 GHz + 17,462,165,188 instructions # 2.22 insn per cycle + 2.813787845 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.514663e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.517911e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.517935e+03 ) sec^-1 -MeanMatrixElemValue = ( 7.973353e-04 +- 5.853892e-04 ) GeV^-6 -TOTAL : 9.036392 sec +EvtsPerSec[Rmb+ME] (23) = ( 
9.279047e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.280919e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.281197e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 +TOTAL : 3.985795 sec INFO: No Floating Point Exceptions have been reported - 31,311,548,433 cycles:u # 3.457 GHz (74.97%) - 3,533,955 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.04%) - 7,913,429 stalled-cycles-backend:u # 0.03% backend cycles idle (75.03%) - 24,752,693,903 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (75.00%) - 9.082002133 seconds time elapsed + 12,317,841,072 cycles # 2.855 GHz + 29,065,647,551 instructions # 2.36 insn per cycle + 4.369250222 seconds time elapsed ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 9.872263e-03 -Avg ME (F77/GPU) = 9.8722595284406710E-003 -Relative difference = 3.516477760164775e-07 +Avg ME (F77/GPU) = 9.8722595284406640E-003 +Relative difference = 3.5164777671934515e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': 
~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.027435e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.027462e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.027462e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 5.144605 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.769533e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.769744e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.769744e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 6.808252 sec INFO: No Floating Point Exceptions have been reported - 18,028,402,409 cycles:u # 3.502 GHz (74.95%) - 27,611,368 stalled-cycles-frontend:u # 0.15% frontend cycles idle (74.98%) - 2,245,790,882 stalled-cycles-backend:u # 12.46% backend cycles idle (74.98%) - 55,127,058,667 instructions:u # 3.06 insn per cycle - # 0.04 stalled cycles per insn (74.98%) - 5.151610353 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:44874) (avx2: 0) (512y: 0) (512z: 0) + 18,798,528,369 cycles # 2.760 GHz + 53,916,630,138 instructions # 2.87 insn per cycle + 6.812355714 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:32448) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.223400e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.223527e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.223527e+02 ) sec^-1 
-MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 2.380373 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.539980e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.540062e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.540062e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 3.433152 sec INFO: No Floating Point Exceptions have been reported - 8,333,152,705 cycles:u # 3.496 GHz (74.86%) - 2,559,647 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.83%) - 808,877,867 stalled-cycles-backend:u # 9.71% backend cycles idle (74.84%) - 27,042,969,545 instructions:u # 3.25 insn per cycle - # 0.03 stalled cycles per insn (75.01%) - 2.387686992 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:97234) (avx2: 0) (512y: 0) (512z: 0) + 9,799,231,624 cycles # 2.852 GHz + 27,092,581,938 instructions # 2.76 insn per cycle + 3.437235180 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:96443) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 5.140048e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.140715e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.140715e+02 ) sec^-1 
-MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 1.032474 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.326889e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.327284e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.327284e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.592072 sec +INFO: No Floating Point Exceptions have been reported + 4,220,179,984 cycles # 2.645 GHz + 9,560,887,701 instructions # 2.27 insn per cycle + 1.596045022 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84390) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722595285411531E-003 +Relative difference = 3.516375977906115e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) 
[cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.770010e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.770613e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.770613e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.405649 sec INFO: No Floating Point Exceptions have been reported - 3,618,339,129 cycles:u # 3.494 GHz (74.63%) - 1,807,361 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.02%) - 325,325,784 stalled-cycles-backend:u # 8.99% backend cycles idle (75.29%) - 9,521,952,296 instructions:u # 2.63 insn per cycle - # 0.03 stalled cycles per insn (75.29%) - 1.039807225 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84279) (512y: 0) (512z: 0) + 3,726,923,548 cycles # 2.645 GHz + 8,484,897,516 instructions # 2.28 insn per cycle + 1.409716339 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:79991) (512y: 91) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.281739e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.282258e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.282258e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.614138 sec +INFO: No Floating Point Exceptions have been reported + 2,690,163,143 cycles # 1.663 GHz + 4,272,866,756 instructions # 1.59 insn per cycle + 1.618172762 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2284) (512y: 105) (512z:79105) 
------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722595285411531E-003 +Relative difference = 3.516375977906115e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt index 93fc2cd34c..1be1cfeedf 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt @@ -1,204 +1,250 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in 
BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-05-15_16:26:19 +DATE: 2024-05-16_15:03:52 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 2 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) WARNING! 
Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.930121e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.930842e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.930842e+01 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 9.502413 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.063403e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.064386e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.064386e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 2.395260 sec INFO: No Floating Point Exceptions have been reported - 32,946,992,049 cycles:u # 3.459 GHz (74.96%) - 3,785,031 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.99%) - 9,038,690 stalled-cycles-backend:u # 0.03% backend cycles idle (74.99%) - 26,035,342,716 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (75.01%) - 9.549240792 seconds time elapsed + 7,755,019,436 cycles # 2.852 GHz + 17,230,726,903 instructions # 2.22 insn per cycle + 2.775067655 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 --bridge +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) +WARNING! 
Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.507144e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.510882e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.510882e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.221264e+00 +- 1.219329e+00 ) GeV^-6 -TOTAL : 9.080724 sec +EvtsPerSec[Rmb+ME] (23) = ( 9.263412e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.297943e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.297943e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 +TOTAL : 3.971807 sec INFO: No Floating Point Exceptions have been reported - 31,494,745,424 cycles:u # 3.459 GHz (74.97%) - 4,919,660 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.01%) - 49,642,487 stalled-cycles-backend:u # 0.16% backend cycles idle (75.03%) - 24,874,270,522 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (75.03%) - 9.126106525 seconds time elapsed + 12,283,103,403 cycles # 2.855 GHz + 27,758,308,143 instructions # 2.26 insn per cycle + 4.357937638 seconds time elapsed ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 9.872263e-03 -Avg ME (F77/GPU) = 9.8722595284406710E-003 -Relative difference = 3.516477760164775e-07 +Avg ME (F77/GPU) = 9.8722595284406640E-003 +Relative difference = 3.5164777671934515e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.023132e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.023159e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.023159e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 5.166347 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.361041e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.361236e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.361236e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 7.177191 sec INFO: No Floating Point Exceptions have been reported - 18,095,619,775 cycles:u # 3.501 GHz (74.93%) - 27,836,214 stalled-cycles-frontend:u # 0.15% frontend cycles idle (74.93%) - 1,988,371,558 stalled-cycles-backend:u # 10.99% backend cycles idle (74.95%) - 55,156,193,382 instructions:u # 3.05 insn per cycle - # 0.04 stalled cycles per insn (75.03%) - 5.172999600 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:44874) (avx2: 0) (512y: 0) (512z: 0) + 18,941,402,140 cycles # 2.638 GHz + 53,918,413,850 instructions # 2.85 insn per cycle + 7.181202320 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:32448) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe +runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.235258e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.235385e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.235385e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 2.368254 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.538220e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.538308e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.538308e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 3.437404 sec INFO: No Floating Point Exceptions have been reported - 8,294,074,370 cycles:u # 3.498 GHz (74.99%) - 1,791,455 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.04%) - 769,410,399 stalled-cycles-backend:u # 9.28% backend cycles idle (75.04%) - 26,992,585,217 instructions:u # 3.25 insn per cycle - # 0.03 stalled cycles per insn (75.04%) - 2.374877299 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:97234) (avx2: 0) (512y: 0) (512z: 0) + 9,826,263,323 cycles # 2.856 GHz + 27,093,421,705 instructions # 2.76 insn per cycle + 3.441586183 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:96443) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe +runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 5.198364e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.199069e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.199069e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 1.021970 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.320877e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.321321e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.321321e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.595267 sec INFO: No Floating Point Exceptions have been reported - 3,561,730,996 cycles:u # 3.474 GHz (75.04%) - 1,144,548 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.04%) - 252,785,073 stalled-cycles-backend:u # 7.10% backend cycles idle (75.04%) - 9,519,651,231 instructions:u # 2.67 insn per cycle - # 0.03 stalled cycles per insn (75.04%) - 1.028867905 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84279) (512y: 0) (512z: 0) + 4,226,325,559 cycles # 2.644 GHz + 9,562,000,988 instructions # 2.26 insn per cycle + 1.599357751 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84390) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe +runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +WARNING! 
Instantiate host Bridge (nevt=256) +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.768698e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.769266e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.769266e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.406299 sec +INFO: No Floating Point Exceptions have been reported + 3,728,202,948 cycles # 2.645 GHz + 8,485,828,873 instructions # 2.28 insn per cycle + 1.410433353 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:79991) (512y: 91) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722595285411531E-003 +Relative difference = 3.516375977906115e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +WARNING! 
Instantiate host Bridge (nevt=256) +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.279021e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.279511e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.279511e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.615850 sec +INFO: No Floating Point Exceptions have been reported + 2,693,497,833 cycles # 1.663 GHz + 4,273,840,765 instructions # 1.59 insn per cycle + 1.620067219 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2284) (512y: 105) (512z:79105) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722595285411531E-003 +Relative difference = 3.516375977906115e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt index 454b0b49ce..3f519fda03 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt @@ -1,193 +1,233 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
-make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-05-15_16:01:52 +DATE: 2024-05-16_14:45:01 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.913726e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.919161e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.919238e+01 ) sec^-1 -MeanMatrixElemValue = ( 3.297256e-04 +- 2.011325e-04 ) GeV^-6 -TOTAL : 9.545415 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.065431e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.065819e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.066015e+02 ) sec^-1 +MeanMatrixElemValue = ( 
1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 2.431749 sec INFO: No Floating Point Exceptions have been reported - 33,072,066,946 cycles:u # 3.456 GHz (75.00%) - 3,412,866 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.01%) - 6,304,861 stalled-cycles-backend:u # 0.02% backend cycles idle (74.93%) - 26,167,059,493 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (74.93%) - 9.594535476 seconds time elapsed + 7,858,443,167 cycles # 2.852 GHz + 17,797,449,482 instructions # 2.26 insn per cycle + 2.810886675 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.524370e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.527476e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.527501e+03 ) sec^-1 -MeanMatrixElemValue = ( 7.973353e-04 +- 5.853892e-04 ) GeV^-6 -TOTAL : 9.021026 sec +EvtsPerSec[Rmb+ME] (23) = ( 
9.189050e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.190883e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.191132e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 +TOTAL : 4.007219 sec INFO: No Floating Point Exceptions have been reported - 31,286,281,881 cycles:u # 3.460 GHz (74.94%) - 3,399,984 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.94%) - 7,513,883 stalled-cycles-backend:u # 0.02% backend cycles idle (75.04%) - 24,680,671,024 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (75.05%) - 9.067324066 seconds time elapsed + 12,388,147,716 cycles # 2.856 GHz + 29,572,084,158 instructions # 2.39 insn per cycle + 4.393669645 seconds time elapsed ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 9.872263e-03 -Avg ME (F77/GPU) = 9.8722595284406710E-003 -Relative difference = 3.516477760164775e-07 +Avg ME (F77/GPU) = 9.8722595284406640E-003 +Relative difference = 3.5164777671934515e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': 
~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.025495e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.025521e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.025521e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 5.154224 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.555801e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.556015e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.556015e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 6.995892 sec INFO: No Floating Point Exceptions have been reported - 18,065,149,337 cycles:u # 3.503 GHz (74.96%) - 29,887,532 stalled-cycles-frontend:u # 0.17% frontend cycles idle (75.03%) - 2,278,927,664 stalled-cycles-backend:u # 12.62% backend cycles idle (75.03%) - 55,089,059,533 instructions:u # 3.05 insn per cycle - # 0.04 stalled cycles per insn (75.03%) - 5.161109340 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:44747) (avx2: 0) (512y: 0) (512z: 0) + 18,868,341,179 cycles # 2.696 GHz + 53,930,114,085 instructions # 2.86 insn per cycle + 6.999840535 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:32063) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.249808e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.249946e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.249946e+02 ) sec^-1 
-MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 2.352174 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.547899e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.547983e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.547983e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 3.415669 sec INFO: No Floating Point Exceptions have been reported - 8,233,739,349 cycles:u # 3.496 GHz (74.90%) - 1,197,145 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.87%) - 796,084,035 stalled-cycles-backend:u # 9.67% backend cycles idle (74.87%) - 27,074,923,325 instructions:u # 3.29 insn per cycle - # 0.03 stalled cycles per insn (74.92%) - 2.359279011 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:97230) (avx2: 0) (512y: 0) (512z: 0) + 9,762,163,827 cycles # 2.856 GHz + 27,089,755,364 instructions # 2.77 insn per cycle + 3.419663266 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:96286) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 5.161793e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.162463e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.162463e+02 ) sec^-1 
-MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 1.027989 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.328520e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.328923e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.328923e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.591769 sec +INFO: No Floating Point Exceptions have been reported + 4,217,350,816 cycles # 2.647 GHz + 9,560,856,496 instructions # 2.27 insn per cycle + 1.595749154 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84478) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722595285411531E-003 +Relative difference = 3.516375977906115e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) 
[cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.765235e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.765802e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.765802e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.407302 sec INFO: No Floating Point Exceptions have been reported - 3,602,949,118 cycles:u # 3.494 GHz (74.77%) - 1,967,387 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.18%) - 315,836,274 stalled-cycles-backend:u # 8.77% backend cycles idle (75.18%) - 9,526,844,066 instructions:u # 2.64 insn per cycle - # 0.03 stalled cycles per insn (75.18%) - 1.034547423 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84249) (512y: 0) (512z: 0) + 3,737,969,275 cycles # 2.650 GHz + 8,484,674,655 instructions # 2.27 insn per cycle + 1.411406372 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80014) (512y: 241) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.273845e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.274322e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.274322e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.617639 sec +INFO: No Floating Point Exceptions have been reported + 2,695,774,477 cycles # 1.663 GHz + 4,276,120,388 instructions # 1.59 insn per cycle + 1.621698890 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2169) (512y: 187) (512z:79110) 
------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722595285411531E-003 +Relative difference = 3.516375977906115e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index ac9055ac64..8097294660 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -1,193 +1,233 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for 
tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-05-15_16:03:07 +DATE: 2024-05-16_14:45:55 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.824649e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.828019e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.828055e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.256794e-06 +- 4.775721e-07 ) GeV^-6 -TOTAL : 4.477029 sec +EvtsPerSec[Rmb+ME] (23) = ( 6.560287e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.561087e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.561509e+02 ) sec^-1 +MeanMatrixElemValue = ( 
1.186984e-05 +- 9.824899e-06 ) GeV^-6 +TOTAL : 1.688313 sec INFO: No Floating Point Exceptions have been reported - 15,361,498,399 cycles:u # 3.414 GHz (74.93%) - 2,842,505 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.89%) - 6,326,674 stalled-cycles-backend:u # 0.04% backend cycles idle (74.93%) - 12,512,265,453 instructions:u # 0.81 insn per cycle - # 0.00 stalled cycles per insn (74.94%) - 4.525778900 seconds time elapsed + 5,610,919,333 cycles # 2.843 GHz + 12,076,970,192 instructions # 2.15 insn per cycle + 2.032164963 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.198176e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.211673e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.211860e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.930014e-02 +- 1.363569e-02 ) GeV^-6 -TOTAL : 4.713265 sec +EvtsPerSec[Rmb+ME] (23) = ( 
2.335524e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.336187e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.336332e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 +TOTAL : 1.920912 sec INFO: No Floating Point Exceptions have been reported - 16,170,678,935 cycles:u # 3.415 GHz (75.00%) - 2,706,349 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.00%) - 6,618,303 stalled-cycles-backend:u # 0.04% backend cycles idle (74.97%) - 13,155,189,747 instructions:u # 0.81 insn per cycle - # 0.00 stalled cycles per insn (74.97%) - 4.757825820 seconds time elapsed + 6,262,064,127 cycles # 2.846 GHz + 13,866,454,713 instructions # 2.21 insn per cycle + 2.256561773 seconds time elapsed ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/GPU) = 9.855155e-03 -Avg ME (F77/GPU) = 9.8696023209835834E-003 -Relative difference = 0.0014659658811639687 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 9.849636e-03 +Avg ME (F77/GPU) = 9.8712405367667715E-003 +Relative difference = 0.0021934350433631634 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops 
fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.099668e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.099698e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.099698e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.924324e-03 +- 4.918778e-03 ) GeV^-6 -TOTAL : 4.806907 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.473644e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.473896e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.473896e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 +TOTAL : 6.236471 sec INFO: No Floating Point Exceptions have been reported - 16,850,062,367 cycles:u # 3.503 GHz (74.92%) - 16,747,767 stalled-cycles-frontend:u # 0.10% frontend cycles idle (74.90%) - 1,849,826,643 stalled-cycles-backend:u # 10.98% backend cycles idle (74.98%) - 51,735,478,346 instructions:u # 3.07 insn per cycle - # 0.04 stalled cycles per insn (75.05%) - 4.813514314 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:27812) (avx2: 0) (512y: 0) (512z: 0) + 17,834,532,335 cycles # 2.858 GHz + 53,589,179,622 instructions # 3.00 insn per cycle + 6.240522901 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:20208) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087414119E-003 -Relative difference = 2.1196409216982896e-08 +Avg ME (F77/C++) = 9.8479612087541066E-003 +Relative difference = 2.1197698286506752e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 4.616004e+02 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 4.616551e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.616551e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.924322e-03 +- 4.918776e-03 ) GeV^-6 -TOTAL : 1.149551 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.311314e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.311704e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.311704e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 +TOTAL : 1.599839 sec INFO: No Floating Point Exceptions have been reported - 4,011,716,086 cycles:u # 3.480 GHz (75.02%) - 355,931 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.02%) - 387,377,622 stalled-cycles-backend:u # 9.66% backend cycles idle (75.02%) - 13,707,607,644 instructions:u # 3.42 insn per cycle - # 0.03 stalled cycles per insn (75.02%) - 1.156443653 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:97762) (avx2: 0) (512y: 0) (512z: 0) + 4,578,829,094 cycles # 2.856 GHz + 13,761,810,246 instructions # 3.01 insn per cycle + 1.603811766 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:96986) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.847957e-03 -Avg ME (F77/C++) = 9.8479574833965355E-003 -Relative difference = 4.9085971470122835e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.847955e-03 +Avg ME (F77/C++) = 9.8479546896527003E-003 +Relative difference = 3.151388282563952e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 
1.043159e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.043429e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.043429e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6 -TOTAL : 0.511560 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.636666e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.638274e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.638274e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 +TOTAL : 0.800488 sec INFO: No Floating Point Exceptions have been reported - 1,789,386,733 cycles:u # 3.476 GHz (74.86%) - 417,214 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.14%) - 147,150,451 stalled-cycles-backend:u # 8.22% backend cycles idle (75.14%) - 4,786,787,148 instructions:u # 2.68 insn per cycle - # 0.03 stalled cycles per insn (75.14%) - 0.518956269 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84831) (512y: 0) (512z: 0) + 2,129,570,848 cycles # 2.649 GHz + 4,816,093,977 instructions # 2.26 insn per cycle + 0.804523713 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84904) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161012351E-003 -Relative difference = 1.8588827066662492e-08 +Avg ME (F77/C++) = 9.8929728161070551E-003 +Relative difference = 1.858823877057982e-08 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.627354e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.629498e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.629498e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 +TOTAL : 0.697299 sec +INFO: No Floating Point Exceptions have been reported + 1,857,131,979 cycles # 2.651 GHz + 4,273,320,598 instructions # 2.30 insn per cycle + 0.701213399 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2:80610) (512y: 46) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.892973e-03 +Avg ME (F77/C++) = 9.8929728161070551E-003 +Relative difference = 1.858823877057982e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.540089e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.542023e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.542023e+02 ) sec^-1 
+MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 +TOTAL : 0.812936 sec +INFO: No Floating Point Exceptions have been reported + 1,360,618,833 cycles # 1.668 GHz + 2,159,125,772 instructions # 1.59 insn per cycle + 0.816997353 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2890) (512y: 49) (512z:79305) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.892981e-03 +Avg ME (F77/C++) = 9.8929811982958280E-003 +Relative difference = 2.0044092642523172e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt index b3354e371e..6d352d97ac 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt @@ -1,204 +1,250 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in 
BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-05-15_16:27:34 +DATE: 2024-05-16_15:04:47 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 2 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) WARNING! 
Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.780211e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.780590e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.780590e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.935145e-03 +- 4.929588e-03 ) GeV^-6 -TOTAL : 4.530503 sec +EvtsPerSec[Rmb+ME] (23) = ( 6.582684e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.584567e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.584567e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187094e-05 +- 9.825664e-06 ) GeV^-6 +TOTAL : 1.633735 sec INFO: No Floating Point Exceptions have been reported - 15,534,265,664 cycles:u # 3.416 GHz (74.91%) - 2,924,027 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.96%) - 7,136,647 stalled-cycles-backend:u # 0.05% backend cycles idle (74.90%) - 12,595,844,127 instructions:u # 0.81 insn per cycle - # 0.00 stalled cycles per insn (75.05%) - 4.579830358 seconds time elapsed + 5,453,177,396 cycles # 2.847 GHz + 11,627,188,509 instructions # 2.13 insn per cycle + 1.971399165 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 --bridge +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) +WARNING! 
Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.234213e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.249504e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.249504e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.258769e+00 +- 1.256832e+00 ) GeV^-6 -TOTAL : 4.738620 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.292389e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.306150e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.306150e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856441e-04 +- 8.331096e-05 ) GeV^-6 +TOTAL : 1.932378 sec INFO: No Floating Point Exceptions have been reported - 16,253,633,523 cycles:u # 3.414 GHz (74.97%) - 3,512,403 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.96%) - 48,806,959 stalled-cycles-backend:u # 0.30% backend cycles idle (74.95%) - 13,176,010,291 instructions:u # 0.81 insn per cycle - # 0.00 stalled cycles per insn (74.95%) - 4.783743963 seconds time elapsed + 6,313,737,946 cycles # 2.848 GHz + 13,568,150,990 instructions # 2.15 insn per cycle + 2.274068662 seconds time elapsed ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/GPU) = 9.855155e-03 -Avg ME (F77/GPU) = 9.8696023209835834E-003 -Relative difference = 0.0014659658811639687 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 9.849636e-03 +Avg ME (F77/GPU) = 9.8712405367667715E-003 +Relative difference = 0.0021934350433631634 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.091365e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.091399e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.091399e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.924324e-03 +- 4.918778e-03 ) GeV^-6 -TOTAL : 4.842836 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.447529e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.447785e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.447785e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 +TOTAL : 6.255519 sec INFO: No Floating Point Exceptions have been reported - 16,852,179,806 cycles:u # 3.487 GHz (74.97%) - 17,552,300 stalled-cycles-frontend:u # 0.10% frontend cycles idle (74.98%) - 1,836,450,258 stalled-cycles-backend:u # 10.90% backend cycles idle (75.06%) - 51,702,002,628 instructions:u # 3.07 insn per cycle - # 0.04 stalled cycles per insn (75.05%) - 4.849558254 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:27812) (avx2: 0) (512y: 0) (512z: 0) + 17,871,844,477 cycles # 2.856 GHz + 53,590,423,890 instructions # 3.00 insn per cycle + 6.259496797 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:20208) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe +runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087414119E-003 -Relative difference = 2.1196409216982896e-08 +Avg ME (F77/C++) = 9.8479612087541066E-003 +Relative difference = 2.1197698286506752e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 4.613972e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.614519e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.614519e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.924322e-03 +- 4.918776e-03 ) GeV^-6 -TOTAL : 1.150231 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.319002e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.319403e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.319403e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 +TOTAL : 1.595862 sec INFO: No Floating Point Exceptions have been reported - 4,027,365,469 cycles:u # 3.491 GHz (74.99%) - 911,377 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.04%) - 404,696,030 stalled-cycles-backend:u # 10.05% backend cycles idle (75.04%) - 13,704,959,643 instructions:u # 3.40 insn per cycle - # 0.03 stalled cycles per insn (75.04%) - 1.157294154 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:97762) (avx2: 0) (512y: 0) (512z: 0) + 4,573,738,949 cycles # 2.860 GHz + 13,762,785,828 instructions # 3.01 insn per cycle + 1.599904345 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:96986) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe +runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.847957e-03 -Avg ME (F77/C++) = 9.8479574833965355E-003 -Relative difference = 4.9085971470122835e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.847955e-03 +Avg ME (F77/C++) = 9.8479546896527003E-003 +Relative difference = 3.151388282563952e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.002697e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.002960e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.002960e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6 -TOTAL : 0.532243 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.613525e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.615218e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.615218e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 +TOTAL : 0.804350 sec +INFO: No Floating Point Exceptions have been reported + 2,139,167,872 cycles # 2.648 GHz + 4,817,111,626 instructions # 2.25 insn per cycle + 0.808508083 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84904) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.892973e-03 +Avg ME (F77/C++) = 9.8929728161070551E-003 +Relative difference = 1.858823877057982e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +WARNING! Instantiate host Bridge (nevt=256) +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.603124e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.605216e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.605216e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 +TOTAL : 0.699524 sec INFO: No Floating Point Exceptions have been reported - 1,795,554,108 cycles:u # 3.409 GHz (74.73%) - 539,252 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.91%) - 147,823,568 stalled-cycles-backend:u # 8.23% backend cycles idle (75.66%) - 4,785,982,688 instructions:u # 2.67 insn per cycle - # 0.03 stalled cycles per insn (75.29%) - 0.538772971 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84831) (512y: 0) (512z: 0) + 1,862,402,974 cycles # 2.650 GHz + 4,274,167,467 instructions 
# 2.29 insn per cycle + 0.703628099 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80610) (512y: 46) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161012351E-003 -Relative difference = 1.8588827066662492e-08 +Avg ME (F77/C++) = 9.8929728161070551E-003 +Relative difference = 1.858823877057982e-08 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +WARNING! 
Instantiate host Bridge (nevt=256) +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.580110e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.582197e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.582197e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 +TOTAL : 0.807828 sec +INFO: No Floating Point Exceptions have been reported + 1,354,037,726 cycles # 1.669 GHz + 2,159,114,420 instructions # 1.59 insn per cycle + 0.811949308 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2890) (512y: 49) (512z:79305) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.892981e-03 +Avg ME (F77/C++) = 9.8929811982958280E-003 +Relative difference = 2.0044092642523172e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt index 73ac9e8d15..2d3f5a3740 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt @@ -1,193 +1,233 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
-make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-05-15_16:03:55 +DATE: 2024-05-16_14:46:34 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.781126e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.784111e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.784134e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.256794e-06 +- 4.775721e-07 ) GeV^-6 -TOTAL : 4.580585 sec +EvtsPerSec[Rmb+ME] (23) = ( 6.539024e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.539847e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.540280e+02 ) sec^-1 +MeanMatrixElemValue = ( 
1.186984e-05 +- 9.824899e-06 ) GeV^-6 +TOTAL : 1.689103 sec INFO: No Floating Point Exceptions have been reported - 15,715,377,910 cycles:u # 3.414 GHz (74.89%) - 2,705,528 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.87%) - 6,427,066 stalled-cycles-backend:u # 0.04% backend cycles idle (74.98%) - 12,774,760,863 instructions:u # 0.81 insn per cycle - # 0.00 stalled cycles per insn (75.07%) - 4.628245906 seconds time elapsed + 5,627,474,622 cycles # 2.847 GHz + 11,923,534,222 instructions # 2.12 insn per cycle + 2.035228412 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.241498e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.257857e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.257952e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.930014e-02 +- 1.363569e-02 ) GeV^-6 -TOTAL : 4.709271 sec +EvtsPerSec[Rmb+ME] (23) = ( 
2.303686e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.304329e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.304469e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 +TOTAL : 1.932451 sec INFO: No Floating Point Exceptions have been reported - 16,165,125,978 cycles:u # 3.418 GHz (74.97%) - 2,815,066 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.95%) - 5,618,964 stalled-cycles-backend:u # 0.03% backend cycles idle (74.97%) - 13,105,466,925 instructions:u # 0.81 insn per cycle - # 0.00 stalled cycles per insn (74.96%) - 4.755259043 seconds time elapsed + 6,311,455,519 cycles # 2.848 GHz + 13,762,708,375 instructions # 2.18 insn per cycle + 2.272906437 seconds time elapsed ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/GPU) = 9.855155e-03 -Avg ME (F77/GPU) = 9.8696023209835834E-003 -Relative difference = 0.0014659658811639687 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 9.849636e-03 +Avg ME (F77/GPU) = 9.8712405367667715E-003 +Relative difference = 0.0021934350433631634 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops 
fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.099947e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.099978e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.099978e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.924324e-03 +- 4.918778e-03 ) GeV^-6 -TOTAL : 4.805319 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.477402e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.477656e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.477656e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 +TOTAL : 6.233047 sec INFO: No Floating Point Exceptions have been reported - 16,844,145,885 cycles:u # 3.503 GHz (74.92%) - 16,245,262 stalled-cycles-frontend:u # 0.10% frontend cycles idle (75.00%) - 1,694,910,965 stalled-cycles-backend:u # 10.06% backend cycles idle (75.05%) - 51,703,247,218 instructions:u # 3.07 insn per cycle - # 0.03 stalled cycles per insn (75.05%) - 4.812318727 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:27678) (avx2: 0) (512y: 0) (512z: 0) + 17,803,580,317 cycles # 2.855 GHz + 53,580,069,164 instructions # 3.01 insn per cycle + 6.237030677 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:20207) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087396841E-003 -Relative difference = 2.119623377106246e-08 +Avg ME (F77/C++) = 9.8479612087582491E-003 +Relative difference = 2.1198118933954545e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 4.565168e+02 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 4.565735e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.565735e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.924322e-03 +- 4.918776e-03 ) GeV^-6 -TOTAL : 1.161648 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.307415e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.307805e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.307805e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 +TOTAL : 1.601039 sec INFO: No Floating Point Exceptions have been reported - 4,062,894,999 cycles:u # 3.488 GHz (74.63%) - 852,067 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.65%) - 357,776,995 stalled-cycles-backend:u # 8.81% backend cycles idle (74.99%) - 13,727,657,265 instructions:u # 3.38 insn per cycle - # 0.03 stalled cycles per insn (75.28%) - 1.168660469 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:97728) (avx2: 0) (512y: 0) (512z: 0) + 4,572,009,891 cycles # 2.850 GHz + 13,755,353,111 instructions # 3.01 insn per cycle + 1.605120576 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:96606) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.847957e-03 -Avg ME (F77/C++) = 9.8479574833965355E-003 -Relative difference = 4.9085971470122835e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.847955e-03 +Avg ME (F77/C++) = 9.8479546896225560E-003 +Relative difference = 3.151694379513441e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 
1.039836e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.040097e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.040097e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6 -TOTAL : 0.513060 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.585961e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.587683e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.587683e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 +TOTAL : 0.806516 sec INFO: No Floating Point Exceptions have been reported - 1,810,288,381 cycles:u # 3.506 GHz (74.39%) - 451,194 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.74%) - 157,116,579 stalled-cycles-backend:u # 8.68% backend cycles idle (75.21%) - 4,786,726,252 instructions:u # 2.64 insn per cycle - # 0.03 stalled cycles per insn (75.21%) - 0.520128633 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84793) (512y: 0) (512z: 0) + 2,141,220,761 cycles # 2.644 GHz + 4,818,439,860 instructions # 2.25 insn per cycle + 0.810543510 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85359) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161012351E-003 -Relative difference = 1.8588827066662492e-08 +Avg ME (F77/C++) = 9.8929728161070967E-003 +Relative difference = 1.8588234562202478e-08 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.583228e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.585349e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.585349e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 +TOTAL : 0.701072 sec +INFO: No Floating Point Exceptions have been reported + 1,870,651,613 cycles # 2.656 GHz + 4,275,203,774 instructions # 2.29 insn per cycle + 0.705038579 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2:81075) (512y: 26) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.892973e-03 +Avg ME (F77/C++) = 9.8929728161070967E-003 +Relative difference = 1.8588234562202478e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.570140e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.572065e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.572065e+02 ) sec^-1 
+MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 +TOTAL : 0.808794 sec +INFO: No Floating Point Exceptions have been reported + 1,356,929,556 cycles # 1.671 GHz + 2,164,613,956 instructions # 1.60 insn per cycle + 0.812781092 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3487) (512y: 34) (512z:79499) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.892981e-03 +Avg ME (F77/C++) = 9.8929811982955140E-003 +Relative difference = 2.0044060904369713e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 77978bf00a..dfab5870bc 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,193 +1,233 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for 
tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-05-15_16:04:42 +DATE: 2024-05-16_14:47:13 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.524260e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.528987e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.529052e+01 ) sec^-1 -MeanMatrixElemValue = ( 3.297256e-04 +- 2.011325e-04 ) GeV^-6 -TOTAL : 9.815536 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.689455e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.689959e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.690212e+02 ) sec^-1 +MeanMatrixElemValue = ( 
1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 2.179203 sec INFO: No Floating Point Exceptions have been reported - 34,043,824,193 cycles:u # 3.460 GHz (74.98%) - 3,636,313 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.99%) - 7,086,491 stalled-cycles-backend:u # 0.02% backend cycles idle (74.98%) - 26,859,805,206 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (75.01%) - 9.874473184 seconds time elapsed + 7,126,539,551 cycles # 2.849 GHz + 15,807,759,758 instructions # 2.22 insn per cycle + 2.559686036 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.282292e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.285215e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.285247e+03 ) sec^-1 -MeanMatrixElemValue = ( 7.973353e-04 +- 5.853892e-04 ) GeV^-6 -TOTAL : 9.357565 sec +EvtsPerSec[Rmb+ME] (23) = ( 
1.107889e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.108154e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.108192e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 +TOTAL : 3.424856 sec INFO: No Floating Point Exceptions have been reported - 32,418,132,505 cycles:u # 3.457 GHz (75.01%) - 3,562,303 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%) - 7,497,796 stalled-cycles-backend:u # 0.02% backend cycles idle (75.01%) - 25,645,021,335 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (75.00%) - 9.405477768 seconds time elapsed + 10,729,635,772 cycles # 2.852 GHz + 25,204,058,412 instructions # 2.35 insn per cycle + 3.820430433 seconds time elapsed ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 9.872263e-03 -Avg ME (F77/GPU) = 9.8722599015656533E-003 -Relative difference = 3.138524921691728e-07 +Avg ME (F77/GPU) = 9.8722599015656498E-003 +Relative difference = 3.1385249252060663e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': 
~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.019602e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.019628e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.019628e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 5.183972 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.303415e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.303607e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.303607e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 7.238072 sec INFO: No Floating Point Exceptions have been reported - 18,165,747,128 cycles:u # 3.502 GHz (74.99%) - 33,524,590 stalled-cycles-frontend:u # 0.18% frontend cycles idle (75.02%) - 2,322,453,909 stalled-cycles-backend:u # 12.78% backend cycles idle (75.02%) - 55,334,612,248 instructions:u # 3.05 insn per cycle - # 0.04 stalled cycles per insn (75.02%) - 5.190865459 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:44898) (avx2: 0) (512y: 0) (512z: 0) + 19,150,406,884 cycles # 2.645 GHz + 54,154,394,762 instructions # 2.83 insn per cycle + 7.242308052 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:32067) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595861831675E-003 Relative difference = 3.457988134687711e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.335635e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.335771e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.335771e+02 ) sec^-1 
-MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 2.266045 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.497154e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.497235e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.497235e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 3.531757 sec INFO: No Floating Point Exceptions have been reported - 7,931,224,922 cycles:u # 3.495 GHz (74.98%) - 1,871,765 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.97%) - 761,023,176 stalled-cycles-backend:u # 9.60% backend cycles idle (74.97%) - 25,815,110,620 instructions:u # 3.25 insn per cycle - # 0.03 stalled cycles per insn (74.97%) - 2.272824224 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:96804) (avx2: 0) (512y: 0) (512z: 0) + 9,343,938,644 cycles # 2.643 GHz + 26,158,830,842 instructions # 2.80 insn per cycle + 3.535758073 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:96007) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594844308162E-003 Relative difference = 3.5610570575237004e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 5.390146e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.390881e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.390881e+02 ) sec^-1 
-MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 0.985055 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.453828e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.454275e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.454275e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.534198 sec +INFO: No Floating Point Exceptions have been reported + 4,069,691,610 cycles # 2.648 GHz + 9,228,168,046 instructions # 2.27 insn per cycle + 1.538179495 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84155) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722594324461913E-003 +Relative difference = 3.613714310412983e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] 
+OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.986335e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.986931e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.986931e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.329366 sec INFO: No Floating Point Exceptions have been reported - 3,437,555,760 cycles:u # 3.478 GHz (74.93%) - 1,502,958 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.91%) - 308,584,077 stalled-cycles-backend:u # 8.98% backend cycles idle (74.91%) - 9,068,955,721 instructions:u # 2.64 insn per cycle - # 0.03 stalled cycles per insn (74.91%) - 0.991911378 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83820) (512y: 0) (512z: 0) + 3,528,184,184 cycles # 2.647 GHz + 8,174,614,993 instructions # 2.32 insn per cycle + 1.333451918 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:79844) (512y: 79) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.367967e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.368468e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.368468e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.573046 sec +INFO: No Floating Point Exceptions have been reported + 2,618,946,865 cycles # 1.661 GHz + 4,154,480,374 instructions # 1.59 insn per cycle + 1.577054610 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2046) (512y: 93) (512z:78760) 
------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722594324461913E-003 +Relative difference = 3.613714310412983e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt index e2edcec93b..3ddfb4805b 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt @@ -1,193 +1,233 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for 
tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-05-15_16:05:58 +DATE: 2024-05-16_14:48:06 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.460765e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.465659e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.465728e+01 ) sec^-1 -MeanMatrixElemValue = ( 3.297256e-04 +- 2.011325e-04 ) GeV^-6 -TOTAL : 9.907182 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.679279e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.679786e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.680054e+02 ) sec^-1 +MeanMatrixElemValue = ( 
1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 2.178048 sec INFO: No Floating Point Exceptions have been reported - 34,383,552,437 cycles:u # 3.463 GHz (74.95%) - 3,543,559 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.97%) - 8,052,686 stalled-cycles-backend:u # 0.02% backend cycles idle (75.00%) - 27,130,894,842 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (75.01%) - 9.954843245 seconds time elapsed + 7,134,880,740 cycles # 2.851 GHz + 15,434,594,866 instructions # 2.16 insn per cycle + 2.558453633 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.298172e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.301396e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.301429e+03 ) sec^-1 -MeanMatrixElemValue = ( 7.973353e-04 +- 5.853892e-04 ) GeV^-6 -TOTAL : 9.358072 sec +EvtsPerSec[Rmb+ME] (23) = ( 
1.104221e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.104483e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.104525e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 +TOTAL : 3.411600 sec INFO: No Floating Point Exceptions have been reported - 32,411,383,419 cycles:u # 3.456 GHz (75.01%) - 3,392,821 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.99%) - 6,644,841 stalled-cycles-backend:u # 0.02% backend cycles idle (75.00%) - 25,629,804,665 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (75.01%) - 9.405448142 seconds time elapsed + 10,672,973,002 cycles # 2.855 GHz + 24,521,846,399 instructions # 2.30 insn per cycle + 3.794724712 seconds time elapsed ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 9.872263e-03 -Avg ME (F77/GPU) = 9.8722599015656533E-003 -Relative difference = 3.138524921691728e-07 +Avg ME (F77/GPU) = 9.8722599015656498E-003 +Relative difference = 3.1385249252060663e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': 
~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.022890e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.022916e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.022916e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 5.166851 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.893217e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.893439e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.893439e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 6.694276 sec INFO: No Floating Point Exceptions have been reported - 18,110,860,966 cycles:u # 3.503 GHz (74.98%) - 29,566,243 stalled-cycles-frontend:u # 0.16% frontend cycles idle (74.94%) - 2,252,660,563 stalled-cycles-backend:u # 12.44% backend cycles idle (74.93%) - 55,408,521,673 instructions:u # 3.06 insn per cycle - # 0.04 stalled cycles per insn (74.93%) - 5.173774467 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:44806) (avx2: 0) (512y: 0) (512z: 0) + 19,121,414,788 cycles # 2.855 GHz + 54,156,458,090 instructions # 2.83 insn per cycle + 6.698138270 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:32244) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595861831675E-003 Relative difference = 3.457988134687711e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.352806e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.352944e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.352944e+02 ) sec^-1 
-MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 2.250388 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.495395e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.495480e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.495480e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 3.535528 sec INFO: No Floating Point Exceptions have been reported - 7,882,617,366 cycles:u # 3.498 GHz (74.80%) - 2,227,961 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.85%) - 819,605,329 stalled-cycles-backend:u # 10.40% backend cycles idle (75.03%) - 25,767,506,663 instructions:u # 3.27 insn per cycle - # 0.03 stalled cycles per insn (75.15%) - 2.257234001 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:96765) (avx2: 0) (512y: 0) (512z: 0) + 9,391,010,006 cycles # 2.654 GHz + 26,079,707,862 instructions # 2.78 insn per cycle + 3.539600596 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:95901) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594844308162E-003 Relative difference = 3.5610570575237004e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 5.481441e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.482179e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.482179e+02 ) sec^-1 
-MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 0.968592 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.518532e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.518969e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.518969e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.505616 sec +INFO: No Floating Point Exceptions have been reported + 4,001,150,405 cycles # 2.652 GHz + 9,212,868,850 instructions # 2.30 insn per cycle + 1.509560632 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:83776) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722594324461913E-003 +Relative difference = 3.613714310412983e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] 
+OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.985927e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.986486e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.986486e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.329709 sec INFO: No Floating Point Exceptions have been reported - 3,391,090,231 cycles:u # 3.489 GHz (74.49%) - 1,313,193 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.74%) - 318,676,750 stalled-cycles-backend:u # 9.40% backend cycles idle (75.15%) - 9,050,899,584 instructions:u # 2.67 insn per cycle - # 0.04 stalled cycles per insn (75.31%) - 0.975333813 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83378) (512y: 0) (512z: 0) + 3,529,740,112 cycles # 2.648 GHz + 8,168,252,869 instructions # 2.31 insn per cycle + 1.333651402 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:79373) (512y: 229) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.359545e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.360045e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.360045e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.576721 sec +INFO: No Floating Point Exceptions have been reported + 2,623,702,370 cycles # 1.660 GHz + 4,153,356,804 instructions # 1.58 insn per cycle + 1.580839869 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1493) (512y: 175) (512z:78776) 
------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722594324461913E-003 +Relative difference = 3.613714310412983e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 8b0482f7d4..38bc670a18 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -1,70 +1,233 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for 
tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-05-15_16:00:16 +DATE: 2024-05-16_14:42:48 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/gcheck.exe: Segmentation fault - 708,386,539 cycles:u # 0.721 GHz (73.83%) - 2,301,186 stalled-cycles-frontend:u # 0.32% frontend cycles idle (74.20%) - 5,237,056 stalled-cycles-backend:u # 0.74% backend cycles idle (75.42%) - 1,246,636,487 instructions:u # 1.76 insn per cycle - # 0.00 stalled cycles per insn (75.74%) - 1.012706895 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.927387e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.315718e+07 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 5.634653e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.452451 sec +INFO: No Floating Point Exceptions have been reported + 1,879,085,625 cycles # 2.815 GHz + 2,632,406,951 instructions # 1.40 insn per cycle + 0.724903288 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/gcheck.exe: Segmentation fault - 904,269,601 cycles:u # 2.111 GHz (75.74%) - 2,203,814 stalled-cycles-frontend:u # 0.24% frontend cycles idle (75.74%) - 5,510,229 stalled-cycles-backend:u # 0.61% backend cycles idle (74.91%) - 1,374,598,901 instructions:u # 1.52 insn per cycle - # 0.00 stalled cycles per insn (74.94%) - 0.454069271 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.675522e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.208336e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.557322e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.602505e+02 +- 
2.116328e+02 ) GeV^-2 +TOTAL : 0.534593 sec +INFO: No Floating Point Exceptions have been reported + 2,165,913,457 cycles # 2.812 GHz + 3,139,398,529 instructions # 1.45 insn per cycle + 0.827804422 seconds time elapsed +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 1.424749e-01 +Avg ME (F77/GPU) = 0.14247482467490466 +Relative difference = 5.286902838873106e-07 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.011560e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.033153e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.033153e+05 ) sec^-1 
+MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 1.642980 sec +INFO: No Floating Point Exceptions have been reported + 4,710,402,412 cycles # 2.861 GHz + 13,462,495,012 instructions # 2.86 insn per cycle + 1.647108070 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 860) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/runTest.exe -Memory access fault by GPU node-4 (Agent handle: 0x6025e0) on address 0x14609c00a000. Reason: Unknown. +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467499484 +Relative difference = 5.286896509487005e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] 
(23) = ( 1.839775e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.910542e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.910542e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.913158 sec +INFO: No Floating Point Exceptions have been reported + 2,620,816,977 cycles # 2.859 GHz + 7,551,970,333 instructions # 2.88 insn per cycle + 0.917276709 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3099) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467499478 +Relative difference = 5.28689651338321e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.970408e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.156692e+05 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.156692e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.574933 sec +INFO: No Floating Point Exceptions have been reported + 1,480,758,822 cycles # 2.560 GHz + 3,119,703,419 instructions # 2.11 insn per cycle + 0.579132992 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2917) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.428889e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.682851e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.682851e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 
+- 1.429474e+01 ) GeV^-2 +TOTAL : 0.501020 sec +INFO: No Floating Point Exceptions have been reported + 1,347,520,276 cycles # 2.670 GHz + 2,981,434,055 instructions # 2.21 insn per cycle + 0.505363497 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2694) (512y: 104) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.241546e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.347861e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.347861e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.756225 sec +INFO: No Floating Point Exceptions have 
been reported + 1,330,320,612 cycles # 1.751 GHz + 1,953,406,018 instructions # 1.47 insn per cycle + 0.760489864 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1372) (512y: 106) (512z: 2173) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt index a8089ff9c1..6f141963c0 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt @@ -1,78 +1,250 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for 
tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-05-15_16:25:24 +DATE: 2024-05-16_15:02:27 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/gcheck.exe: Segmentation fault - 768,592,038 cycles:u # 2.241 GHz (74.56%) - 2,610,445 stalled-cycles-frontend:u # 0.34% frontend cycles idle (75.01%) - 30,452,677 stalled-cycles-backend:u # 3.96% backend cycles idle (76.43%) - 1,205,736,295 instructions:u # 1.57 insn per cycle - # 0.03 stalled cycles per insn (76.56%) - 0.371032601 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.428295e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.103056e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.103056e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.480636 sec +INFO: No Floating Point Exceptions have been reported + 1,959,891,585 cycles # 2.818 GHz + 2,927,619,706 instructions # 1.49 insn per cycle + 0.752080667 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
-runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/gcheck.exe: Segmentation fault - 2,992,221,292 cycles:u # 2.788 GHz (75.58%) - 16,372,327 stalled-cycles-frontend:u # 0.55% frontend cycles idle (75.41%) - 841,167,524 stalled-cycles-backend:u # 28.11% backend cycles idle (75.41%) - 3,204,323,336 instructions:u # 1.07 insn per cycle - # 0.26 stalled cycles per insn (74.98%) - 1.094948928 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.157968e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.371122e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.371122e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 +TOTAL : 0.764001 sec +INFO: No Floating Point Exceptions have been reported + 2,873,640,599 cycles # 2.829 GHz + 4,407,079,803 instructions # 1.53 insn per cycle + 1.073816079 
seconds time elapsed +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 1.424749e-01 +Avg ME (F77/GPU) = 0.14247482467490466 +Relative difference = 5.286902838873106e-07 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +WARNING! 
Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.008642e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.030371e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.030371e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 1.654853 sec +INFO: No Floating Point Exceptions have been reported + 4,747,034,662 cycles # 2.862 GHz + 13,469,694,473 instructions # 2.84 insn per cycle + 1.659302078 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 860) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467499484 +Relative difference = 5.286896509487005e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +WARNING! 
Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.820348e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.892211e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.892211e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.930951 sec +INFO: No Floating Point Exceptions have been reported + 2,665,977,292 cycles # 2.852 GHz + 7,601,998,240 instructions # 2.85 insn per cycle + 0.935555380 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3099) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/runTest.exe -Memory access fault by GPU node-4 (Agent handle: 0x6025e0) on address 0x147dd511a000. Reason: Unknown. 
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467499478 +Relative difference = 5.28689651338321e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.068620e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.272960e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.272960e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.564348 sec +INFO: No Floating Point Exceptions have been reported + 1,513,664,570 cycles # 2.669 GHz + 3,168,463,518 instructions # 2.09 insn per cycle + 0.568761168 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2917) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.408389e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.655047e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.655047e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.511104 sec +INFO: No Floating Point Exceptions have been reported + 1,377,582,779 cycles # 2.675 GHz + 3,030,644,125 instructions # 2.20 insn per cycle + 0.515560343 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2694) (512y: 104) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.221799e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.329402e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.329402e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.769820 sec +INFO: No Floating Point Exceptions have been reported + 1,366,102,927 cycles # 1.765 GHz + 1,991,071,116 instructions # 1.46 insn per cycle + 0.774386560 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1372) (512y: 106) (512z: 2173) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt index 0ab4ed0959..25b8d3c885 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt @@ -1,70 +1,233 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-05-15_16:00:20 +DATE: 2024-05-16_14:43:01 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd1/gcheck.exe: Segmentation fault - 759,819,625 cycles:u # 2.319 GHz (70.92%) - 2,381,645 stalled-cycles-frontend:u # 0.31% frontend cycles idle (73.81%) - 5,666,350 stalled-cycles-backend:u # 0.75% backend cycles idle (76.18%) - 1,226,165,010 instructions:u # 1.61 insn per cycle - # 0.00 stalled cycles per insn (75.61%) - 0.353045104 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.907170e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.197971e+07 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 5.504611e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.455771 sec +INFO: No Floating Point Exceptions have been reported + 1,881,865,516 cycles # 2.813 GHz + 2,669,782,801 instructions # 1.42 insn per cycle + 0.727786761 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd1/gcheck.exe: Segmentation fault - 888,295,777 cycles:u # 2.066 GHz (75.99%) - 2,126,930 stalled-cycles-frontend:u # 0.24% frontend cycles idle (75.76%) - 5,545,150 stalled-cycles-backend:u # 0.62% backend cycles idle (75.88%) - 1,381,325,489 instructions:u # 1.56 insn per cycle - # 0.00 stalled cycles per insn (75.70%) - 0.454497732 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.641992e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.081273e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.416654e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.602505e+02 +- 
2.116328e+02 ) GeV^-2 +TOTAL : 0.532301 sec +INFO: No Floating Point Exceptions have been reported + 2,167,822,822 cycles # 2.823 GHz + 3,120,353,321 instructions # 1.44 insn per cycle + 0.825343283 seconds time elapsed +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/runTest_cuda.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 1.424749e-01 +Avg ME (F77/GPU) = 0.14247482467490466 +Relative difference = 5.286902838873106e-07 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.007784e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.029112e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.029112e+05 ) sec^-1 
+MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 1.648995 sec +INFO: No Floating Point Exceptions have been reported + 4,725,323,359 cycles # 2.860 GHz + 13,457,369,308 instructions # 2.85 insn per cycle + 1.653142214 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 849) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd1/runTest.exe -Memory access fault by GPU node-4 (Agent handle: 0x601530) on address 0x14af98559000. Reason: Unknown. +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467499484 +Relative difference = 5.286896509487005e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] 
(23) = ( 1.833913e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.904030e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.904030e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.915889 sec +INFO: No Floating Point Exceptions have been reported + 2,628,184,982 cycles # 2.858 GHz + 7,551,273,836 instructions # 2.87 insn per cycle + 0.920086997 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3092) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467499478 +Relative difference = 5.28689651338321e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.116183e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.320457e+05 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.320457e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.548674 sec +INFO: No Floating Point Exceptions have been reported + 1,476,841,675 cycles # 2.675 GHz + 3,117,924,257 instructions # 2.11 insn per cycle + 0.552738607 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2900) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.456247e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.706124e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.706124e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 
+- 1.429474e+01 ) GeV^-2 +TOTAL : 0.496887 sec +INFO: No Floating Point Exceptions have been reported + 1,340,057,166 cycles # 2.677 GHz + 2,978,732,248 instructions # 2.22 insn per cycle + 0.501058940 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2670) (512y: 104) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.241283e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.347840e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.347840e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.757118 sec +INFO: No Floating Point Exceptions have 
been reported + 1,329,966,748 cycles # 1.749 GHz + 1,951,787,640 instructions # 1.47 insn per cycle + 0.761356492 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1348) (512y: 106) (512z: 2173) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 51deeeaea3..88eaa7d80d 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -1,70 +1,233 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand 
(USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-05-15_16:00:24 +DATE: 2024-05-16_14:43:15 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/gcheck.exe: Segmentation fault - 704,059,057 cycles:u # 2.157 GHz (76.09%) - 2,115,916 stalled-cycles-frontend:u # 0.30% frontend cycles idle (75.51%) - 5,481,551 stalled-cycles-backend:u # 0.78% backend cycles idle (75.73%) - 1,161,996,833 instructions:u # 1.65 insn per cycle - # 0.00 stalled cycles per insn (77.74%) - 0.353298917 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.867335e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.223690e+08 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 1.343650e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 +TOTAL : 0.450195 sec +INFO: No Floating Point Exceptions have been reported + 1,886,543,936 cycles # 2.814 GHz + 2,627,629,254 instructions # 1.39 insn per cycle + 0.729554150 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/gcheck.exe: Segmentation fault - 798,428,609 cycles:u # 2.151 GHz (74.43%) - 2,320,340 stalled-cycles-frontend:u # 0.29% frontend cycles idle (72.28%) - 5,688,890 stalled-cycles-backend:u # 0.71% backend cycles idle (74.66%) - 1,253,832,398 instructions:u # 1.57 insn per cycle - # 0.00 stalled cycles per insn (76.66%) - 0.393426066 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 8.183442e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.842494e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.962990e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.571361e+02 +- 
2.114021e+02 ) GeV^-2 +TOTAL : 0.482910 sec +INFO: No Floating Point Exceptions have been reported + 1,994,696,147 cycles # 2.812 GHz + 2,828,466,882 instructions # 1.42 insn per cycle + 0.766894337 seconds time elapsed +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 1.424226e-01 +Avg ME (F77/GPU) = 0.14247488790821983 +Relative difference = 0.00036713209996037764 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.069532e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.093791e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.093791e+05 ) sec^-1 
+MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 +TOTAL : 1.553490 sec +INFO: No Floating Point Exceptions have been reported + 4,455,366,971 cycles # 2.862 GHz + 13,047,769,817 instructions # 2.93 insn per cycle + 1.557630020 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 745) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/runTest.exe -Memory access fault by GPU node-4 (Agent handle: 0x5ee5e0) on address 0x14c67eab5000. Reason: Unknown. +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424686e-01 +Avg ME (F77/C++) = 0.14246857540270419 +Relative difference = 1.7265064590569047e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] 
(23) = ( 2.869084e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.052765e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.052765e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 +TOTAL : 0.592151 sec +INFO: No Floating Point Exceptions have been reported + 1,701,146,602 cycles # 2.856 GHz + 4,512,165,265 instructions # 2.65 insn per cycle + 0.596248693 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3600) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424686e-01 +Avg ME (F77/C++) = 0.14246859631675157 +Relative difference = 2.5853054135974944e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.609679e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.315056e+05 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 6.315056e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.313413 sec +INFO: No Floating Point Exceptions have been reported + 850,737,642 cycles # 2.684 GHz + 1,895,945,890 instructions # 2.23 insn per cycle + 0.317546154 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3491) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247489318272599 +Relative difference = 4.784894739577799e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.973396e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.785303e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.785303e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018828e+01 +- 
1.429922e+01 ) GeV^-2 +TOTAL : 0.295859 sec +INFO: No Floating Point Exceptions have been reported + 801,819,935 cycles # 2.679 GHz + 1,819,229,849 instructions # 2.27 insn per cycle + 0.299944027 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3335) (512y: 22) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247489318272599 +Relative difference = 4.784894739577799e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.354956e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.770974e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.770974e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.400076 sec +INFO: No Floating Point Exceptions have been 
reported + 733,009,701 cycles # 1.817 GHz + 1,304,250,799 instructions # 1.78 insn per cycle + 0.404216975 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1973) (512y: 32) (512z: 2382) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247489383243206 +Relative difference = 4.32888033512879e-08 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt index a126c1faff..b62a8a0309 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt @@ -1,78 +1,250 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for 
tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-05-15_16:25:29 +DATE: 2024-05-16_15:02:40 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/gcheck.exe: Segmentation fault - 729,145,754 cycles:u # 2.125 GHz (76.71%) - 2,413,344 stalled-cycles-frontend:u # 0.33% frontend cycles idle (72.78%) - 41,308,952 stalled-cycles-backend:u # 5.67% backend cycles idle (70.29%) - 1,255,558,408 instructions:u # 1.72 insn per cycle - # 0.03 stalled cycles per insn (74.72%) - 0.364988449 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.337579e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.030007e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.030007e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.017654e+01 +- 1.429184e+01 ) GeV^-2 +TOTAL : 0.462122 sec +INFO: No Floating Point Exceptions have been reported + 1,901,719,201 cycles # 2.816 GHz + 2,811,032,752 instructions # 1.48 insn per cycle + 0.731978994 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
-runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/gcheck.exe: Segmentation fault - 2,823,378,972 cycles:u # 2.845 GHz (75.74%) - 16,838,298 stalled-cycles-frontend:u # 0.60% frontend cycles idle (74.91%) - 860,485,057 stalled-cycles-backend:u # 30.48% backend cycles idle (74.04%) - 3,205,772,176 instructions:u # 1.14 insn per cycle - # 0.27 stalled cycles per insn (74.68%) - 1.015525494 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.907303e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.566216e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.566216e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.609942e+02 +- 2.115590e+02 ) GeV^-2 +TOTAL : 0.637098 sec +INFO: No Floating Point Exceptions have been reported + 2,459,040,544 cycles # 2.824 GHz + 3,715,271,980 instructions # 1.51 insn per cycle + 0.927773682 
seconds time elapsed +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 1.424226e-01 +Avg ME (F77/GPU) = 0.14247488790821983 +Relative difference = 0.00036713209996037764 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +WARNING! 
Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.068152e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.092782e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.092782e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 +TOTAL : 1.559403 sec +INFO: No Floating Point Exceptions have been reported + 4,475,912,555 cycles # 2.864 GHz + 13,052,235,712 instructions # 2.92 insn per cycle + 1.563691095 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 745) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424686e-01 +Avg ME (F77/C++) = 0.14246857540270419 +Relative difference = 1.7265064590569047e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +WARNING! 
Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.856394e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.039884e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.039884e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 +TOTAL : 0.599599 sec +INFO: No Floating Point Exceptions have been reported + 1,723,185,860 cycles # 2.856 GHz + 4,560,285,596 instructions # 2.65 insn per cycle + 0.603925442 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3600) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/runTest.exe -Memory access fault by GPU node-4 (Agent handle: 0x5ee5e0) on address 0x150497215000. Reason: Unknown. 
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424686e-01 +Avg ME (F77/C++) = 0.14246859631675157 +Relative difference = 2.5853054135974944e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.545801e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.241062e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.241062e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.321141 sec +INFO: No Floating Point Exceptions have been reported + 871,513,310 cycles # 2.683 GHz + 1,932,959,243 instructions # 2.22 insn per cycle + 0.325378385 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3491) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247489318272599 +Relative difference = 4.784894739577799e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.891127e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.696072e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.696072e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.304221 sec +INFO: No Floating Point Exceptions have been reported + 825,995,486 cycles # 2.683 GHz + 1,856,161,781 instructions # 2.25 insn per cycle + 0.308416114 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3335) (512y: 22) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247489318272599 +Relative difference = 4.784894739577799e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.307702e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.720545e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.720545e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.408443 sec +INFO: No Floating Point Exceptions have been reported + 755,445,387 cycles # 1.833 GHz + 1,345,989,570 instructions # 1.78 insn per cycle + 0.412779323 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1973) (512y: 32) (512z: 2382) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247489383243206 +Relative difference = 4.32888033512879e-08 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt index 04f6ef75ef..f782cd39a5 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt @@ -1,70 +1,233 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-05-15_16:00:27 +DATE: 2024-05-16_14:43:27 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd1/gcheck.exe: Segmentation fault - 722,389,001 cycles:u # 2.213 GHz (73.26%) - 2,066,765 stalled-cycles-frontend:u # 0.29% frontend cycles idle (76.17%) - 5,958,775 stalled-cycles-backend:u # 0.82% backend cycles idle (75.52%) - 1,198,904,032 instructions:u # 1.66 insn per cycle - # 0.00 stalled cycles per insn (75.64%) - 0.353053631 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.882997e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.225822e+08 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 1.344729e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 +TOTAL : 0.447687 sec +INFO: No Floating Point Exceptions have been reported + 1,891,564,072 cycles # 2.816 GHz + 2,660,739,786 instructions # 1.41 insn per cycle + 0.729746219 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd1/gcheck.exe: Segmentation fault - 800,574,835 cycles:u # 2.111 GHz (75.14%) - 2,111,514 stalled-cycles-frontend:u # 0.26% frontend cycles idle (77.40%) - 4,989,204 stalled-cycles-backend:u # 0.62% backend cycles idle (75.40%) - 1,371,821,186 instructions:u # 1.71 insn per cycle - # 0.00 stalled cycles per insn (72.18%) - 0.402319566 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 8.107850e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.805980e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.921999e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.571361e+02 +- 
2.114021e+02 ) GeV^-2 +TOTAL : 0.485790 sec +INFO: No Floating Point Exceptions have been reported + 1,996,906,378 cycles # 2.807 GHz + 2,867,667,096 instructions # 1.44 insn per cycle + 0.769333150 seconds time elapsed +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/runTest_cuda.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 1.424226e-01 +Avg ME (F77/GPU) = 0.14247488790821983 +Relative difference = 0.00036713209996037764 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.069812e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.094168e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.094168e+05 ) sec^-1 
+MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 +TOTAL : 1.553096 sec +INFO: No Floating Point Exceptions have been reported + 4,454,505,799 cycles # 2.862 GHz + 13,029,391,838 instructions # 2.92 insn per cycle + 1.557292510 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 727) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd1/runTest.exe -Memory access fault by GPU node-4 (Agent handle: 0x5ed530) on address 0x146a8b46c000. Reason: Unknown. +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424686e-01 +Avg ME (F77/C++) = 0.14246857540270419 +Relative difference = 1.7265064590569047e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] 
(23) = ( 2.876347e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.060596e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.060596e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 +TOTAL : 0.590255 sec +INFO: No Floating Point Exceptions have been reported + 1,693,495,983 cycles # 2.852 GHz + 4,508,141,451 instructions # 2.66 insn per cycle + 0.594398488 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3588) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424686e-01 +Avg ME (F77/C++) = 0.14246859631675157 +Relative difference = 2.5853054135974944e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.574680e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.273652e+05 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 6.273652e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.315080 sec +INFO: No Floating Point Exceptions have been reported + 851,359,645 cycles # 2.672 GHz + 1,893,112,803 instructions # 2.22 insn per cycle + 0.319204462 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3461) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247489318272599 +Relative difference = 4.784894739577799e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.978403e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.785893e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.785893e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018828e+01 +- 
1.429922e+01 ) GeV^-2 +TOTAL : 0.295093 sec +INFO: No Floating Point Exceptions have been reported + 799,712,323 cycles # 2.678 GHz + 1,814,979,638 instructions # 2.27 insn per cycle + 0.299228201 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3298) (512y: 22) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247489318272599 +Relative difference = 4.784894739577799e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.317992e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.737735e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.737735e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.402763 sec +INFO: No Floating Point Exceptions have been 
reported + 736,511,578 cycles # 1.812 GHz + 1,302,115,541 instructions # 1.77 insn per cycle + 0.406867415 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1936) (512y: 32) (512z: 2382) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247489383243206 +Relative difference = 4.32888033512879e-08 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index 49a0535746..265a4a7626 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -1,70 +1,233 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR 
== 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-05-15_16:00:30 +DATE: 2024-05-16_14:43:39 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/gcheck.exe: Segmentation fault - 750,335,748 cycles:u # 2.276 GHz (70.51%) - 2,089,979 stalled-cycles-frontend:u # 0.28% frontend cycles idle (73.95%) - 5,698,519 stalled-cycles-backend:u # 0.76% backend cycles idle (76.35%) - 1,212,281,398 instructions:u # 1.62 insn per cycle - # 0.00 stalled cycles per insn (75.76%) - 0.353482563 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.940149e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.336219e+07 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 5.662963e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.452840 sec +INFO: No Floating Point Exceptions have been reported + 1,880,363,198 cycles # 2.808 GHz + 2,677,692,820 instructions # 1.42 insn per cycle + 0.726161506 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/gcheck.exe: Segmentation fault - 926,771,584 cycles:u # 2.147 GHz (75.14%) - 2,293,476 stalled-cycles-frontend:u # 0.25% frontend cycles idle (72.10%) - 5,492,410 stalled-cycles-backend:u # 0.59% backend cycles idle (73.93%) - 1,353,222,126 instructions:u # 1.46 insn per cycle - # 0.00 stalled cycles per insn (76.26%) - 0.453968445 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.684159e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.236315e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.588311e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.602505e+02 +- 
2.116328e+02 ) GeV^-2 +TOTAL : 0.530728 sec +INFO: No Floating Point Exceptions have been reported + 2,164,642,485 cycles # 2.821 GHz + 3,145,530,012 instructions # 1.45 insn per cycle + 0.824333778 seconds time elapsed +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 1.424749e-01 +Avg ME (F77/GPU) = 0.14247482577104625 +Relative difference = 5.209967070245855e-07 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.003476e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.024445e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.024445e+05 ) sec^-1 
+MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 1.655769 sec +INFO: No Floating Point Exceptions have been reported + 4,745,491,139 cycles # 2.860 GHz + 13,466,039,366 instructions # 2.84 insn per cycle + 1.659848552 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 840) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/runTest.exe -Memory access fault by GPU node-4 (Agent handle: 0x6025e0) on address 0x1537cffba000. Reason: Unknown. +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482734618697 +Relative difference = 5.099411406595165e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] 
(23) = ( 1.849332e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.920343e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.920343e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.908133 sec +INFO: No Floating Point Exceptions have been reported + 2,605,721,632 cycles # 2.858 GHz + 7,384,650,569 instructions # 2.83 insn per cycle + 0.912227813 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3073) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482734618697 +Relative difference = 5.099411406595165e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.133010e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.340359e+05 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.340359e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.546033 sec +INFO: No Floating Point Exceptions have been reported + 1,469,888,298 cycles # 2.674 GHz + 3,055,461,884 instructions # 2.08 insn per cycle + 0.550169150 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3013) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482643254802 +Relative difference = 5.163537715318965e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.544324e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.807645e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.807645e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 
+- 1.429474e+01 ) GeV^-2 +TOTAL : 0.485213 sec +INFO: No Floating Point Exceptions have been reported + 1,307,959,720 cycles # 2.676 GHz + 2,930,377,532 instructions # 2.24 insn per cycle + 0.489382978 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2799) (512y: 110) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482643254802 +Relative difference = 5.163537715318965e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.172350e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.272043e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.272043e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.779592 sec +INFO: No Floating Point Exceptions have been 
reported + 1,368,592,699 cycles # 1.747 GHz + 1,969,378,714 instructions # 1.44 insn per cycle + 0.783958712 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1700) (512y: 114) (512z: 2171) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482643254802 +Relative difference = 5.163537715318965e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt index f9c8bb0940..84e80111cc 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt @@ -1,70 +1,233 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand 
(USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-05-15_16:00:34 +DATE: 2024-05-16_14:43:53 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd1/gcheck.exe: Segmentation fault - 694,936,346 cycles:u # 2.113 GHz (75.86%) - 2,256,716 stalled-cycles-frontend:u # 0.32% frontend cycles idle (73.76%) - 5,727,296 stalled-cycles-backend:u # 0.82% backend cycles idle (71.33%) - 1,273,451,792 instructions:u # 1.83 insn per cycle - # 0.00 stalled cycles per insn (73.85%) - 0.356041379 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.890956e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.181054e+07 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 5.513059e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.453973 sec +INFO: No Floating Point Exceptions have been reported + 1,876,167,670 cycles # 2.808 GHz + 2,662,885,558 instructions # 1.42 insn per cycle + 0.726739496 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd1/gcheck.exe: Segmentation fault - 890,197,611 cycles:u # 2.060 GHz (76.21%) - 2,053,177 stalled-cycles-frontend:u # 0.23% frontend cycles idle (75.95%) - 5,438,472 stalled-cycles-backend:u # 0.61% backend cycles idle (76.00%) - 1,324,555,031 instructions:u # 1.49 insn per cycle - # 0.00 stalled cycles per insn (75.78%) - 0.454474968 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.642147e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.081360e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.416296e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.602505e+02 +- 
2.116328e+02 ) GeV^-2 +TOTAL : 0.533908 sec +INFO: No Floating Point Exceptions have been reported + 2,163,893,097 cycles # 2.818 GHz + 3,132,561,280 instructions # 1.45 insn per cycle + 0.826852700 seconds time elapsed +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/runTest_cuda.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 1.424749e-01 +Avg ME (F77/GPU) = 0.14247482577104625 +Relative difference = 5.209967070245855e-07 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.007176e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.028375e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.028375e+05 ) sec^-1 
+MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 1.650928 sec +INFO: No Floating Point Exceptions have been reported + 4,733,031,285 cycles # 2.861 GHz + 13,451,191,160 instructions # 2.84 insn per cycle + 1.655053441 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 827) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd1/runTest.exe -Memory access fault by GPU node-4 (Agent handle: 0x601530) on address 0x14e1e6159000. Reason: Unknown. +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482734618697 +Relative difference = 5.099411406595165e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] 
(23) = ( 1.847760e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.919370e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.919370e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.909156 sec +INFO: No Floating Point Exceptions have been reported + 2,606,818,939 cycles # 2.857 GHz + 7,388,977,556 instructions # 2.83 insn per cycle + 0.913243210 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3062) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482734618697 +Relative difference = 5.099411406595165e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.915489e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.093943e+05 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.093943e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.585235 sec +INFO: No Floating Point Exceptions have been reported + 1,469,957,671 cycles # 2.496 GHz + 3,055,084,256 instructions # 2.08 insn per cycle + 0.589443028 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2990) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482643254802 +Relative difference = 5.163537715318965e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.535422e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.797003e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.797003e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 
+- 1.429474e+01 ) GeV^-2 +TOTAL : 0.486063 sec +INFO: No Floating Point Exceptions have been reported + 1,306,700,125 cycles # 2.669 GHz + 2,930,583,524 instructions # 2.24 insn per cycle + 0.490171496 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2775) (512y: 110) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482643254802 +Relative difference = 5.163537715318965e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.173668e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.273111e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.273111e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.778991 sec +INFO: No Floating Point Exceptions have been 
reported + 1,367,910,665 cycles # 1.749 GHz + 1,969,371,455 instructions # 1.44 insn per cycle + 0.783143035 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1676) (512y: 114) (512z: 2171) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482643254802 +Relative difference = 5.163537715318965e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt index ae6d2951a4..8af6873425 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt @@ -1,175 +1,218 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for 
tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-05-15_16:44:17 +DATE: 2024-05-16_15:20:33 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.603796e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.458436e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.502036e+07 ) sec^-1 -MeanMatrixElemValue = ( 7.088120e+00 +- 1.629041e-01 ) GeV^0 -TOTAL : 0.383101 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.588343e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.081541e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.176224e+08 ) sec^-1 +MeanMatrixElemValue = ( 
7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 0.528808 sec INFO: No Floating Point Exceptions have been reported - 939,106,630 cycles:u # 2.343 GHz (74.25%) - 2,223,440 stalled-cycles-frontend:u # 0.24% frontend cycles idle (74.30%) - 5,823,072 stalled-cycles-backend:u # 0.62% backend cycles idle (74.65%) - 1,484,202,719 instructions:u # 1.58 insn per cycle - # 0.00 stalled cycles per insn (75.99%) - 0.435959895 seconds time elapsed + 2,192,111,166 cycles # 2.821 GHz + 3,135,008,318 instructions # 1.43 insn per cycle + 0.833908791 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 226 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 4.313472e+00 -Avg ME (F77/GPU) = 4.3134710926110271 -Relative difference = 2.1036162350152416e-07 +Avg ME (F77/GPU) = 4.3134710926110280 +Relative difference = 2.1036162329561614e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no 
SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.328488e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.384816e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.384816e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.211102e+00 +- 1.606204e-01 ) GeV^0 -TOTAL : 4.686474 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.865233e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.915227e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.915227e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 5.734356 sec INFO: No Floating Point Exceptions have been reported - 16,124,595,636 cycles:u # 3.433 GHz (74.93%) - 9,541,345 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.97%) - 3,653,068,162 stalled-cycles-backend:u # 22.66% backend cycles idle (74.97%) - 41,670,912,640 instructions:u # 2.58 insn per cycle - # 0.09 stalled cycles per insn (74.97%) - 4.701938773 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) + 16,430,057,220 cycles # 2.863 GHz + 42,484,854,801 instructions # 2.59 insn per cycle + 5.739849036 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 711) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926105795 Relative difference = 2.1036172727915933e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 4.051872e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.236166e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.236166e+05 ) sec^-1 
-MeanMatrixElemValue = ( 7.211102e+00 +- 1.606204e-01 ) GeV^0 -TOTAL : 2.774123 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.235376e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.401567e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.401567e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 3.353450 sec INFO: No Floating Point Exceptions have been reported - 9,335,501,881 cycles:u # 3.349 GHz (75.04%) - 8,987,352 stalled-cycles-frontend:u # 0.10% frontend cycles idle (75.04%) - 1,824,361,010 stalled-cycles-backend:u # 19.54% backend cycles idle (75.03%) - 26,239,572,430 instructions:u # 2.81 insn per cycle - # 0.07 stalled cycles per insn (75.04%) - 3.094497404 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2294) (avx2: 0) (512y: 0) (512z: 0) + 9,612,345,009 cycles # 2.863 GHz + 26,317,248,003 instructions # 2.74 insn per cycle + 3.358813940 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2388) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926105804 Relative difference = 2.103617270732513e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 7.359529e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.906295e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.906295e+05 ) sec^-1 
-MeanMatrixElemValue = ( 7.211102e+00 +- 1.606204e-01 ) GeV^0 -TOTAL : 1.611971 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.244474e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.678972e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.678972e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 2.111219 sec +INFO: No Floating Point Exceptions have been reported + 5,673,148,574 cycles # 2.682 GHz + 12,029,125,150 instructions # 2.12 insn per cycle + 2.116589548 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2532) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 4.313472e+00 +Avg ME (F77/C++) = 4.3134710926107935 +Relative difference = 2.103616776553298e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP 
threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.759844e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.282682e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.282682e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 1.934603 sec INFO: No Floating Point Exceptions have been reported - 5,295,374,231 cycles:u # 3.258 GHz (74.95%) - 7,540,695 stalled-cycles-frontend:u # 0.14% frontend cycles idle (74.90%) - 487,371,265 stalled-cycles-backend:u # 9.20% backend cycles idle (74.92%) - 12,305,551,328 instructions:u # 2.32 insn per cycle - # 0.04 stalled cycles per insn (74.92%) - 1.629855341 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2548) (512y: 0) (512z: 0) + 5,185,525,755 cycles # 2.675 GHz + 11,158,849,555 instructions # 2.15 insn per cycle + 1.940086470 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2195) (512y: 148) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926107935 Relative difference = 2.103616776553298e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.492671e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.676216e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.676216e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 3.114794 sec +INFO: No Floating Point Exceptions have been reported + 5,530,850,143 cycles # 1.773 GHz + 8,071,834,418 instructions # 1.46 insn per cycle + 3.120392658 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1471) (512y: 129) (512z: 1684) 
------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 4.313472e+00 +Avg ME (F77/C++) = 4.3134710926107935 +Relative difference = 2.103616776553298e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt index bbcfb1ad25..746b04ecac 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt @@ -1,175 +1,218 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for 
tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-05-15_16:44:31 +DATE: 2024-05-16_15:20:58 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.527768e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.394131e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.433838e+07 ) sec^-1 -MeanMatrixElemValue = ( 7.088120e+00 +- 1.629041e-01 ) GeV^0 -TOTAL : 0.402263 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.594523e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.092654e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.188255e+08 ) sec^-1 +MeanMatrixElemValue = ( 
7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 0.533303 sec INFO: No Floating Point Exceptions have been reported - 932,334,891 cycles:u # 2.280 GHz (74.47%) - 2,201,257 stalled-cycles-frontend:u # 0.24% frontend cycles idle (74.41%) - 5,652,546 stalled-cycles-backend:u # 0.61% backend cycles idle (75.14%) - 1,478,530,745 instructions:u # 1.59 insn per cycle - # 0.00 stalled cycles per insn (75.10%) - 0.458497357 seconds time elapsed + 2,159,610,833 cycles # 2.816 GHz + 3,095,961,302 instructions # 1.43 insn per cycle + 0.825364511 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 4.313472e+00 -Avg ME (F77/GPU) = 4.3134710926110271 -Relative difference = 2.1036162350152416e-07 +Avg ME (F77/GPU) = 4.3134710926110280 +Relative difference = 2.1036162329561614e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no 
SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.337798e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.394950e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.394950e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.211102e+00 +- 1.606204e-01 ) GeV^0 -TOTAL : 4.674231 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.884407e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.935333e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.935333e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 5.676327 sec INFO: No Floating Point Exceptions have been reported - 15,926,402,288 cycles:u # 3.398 GHz (74.95%) - 9,876,431 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.92%) - 35,516,406 stalled-cycles-backend:u # 0.22% backend cycles idle (74.91%) - 42,467,880,880 instructions:u # 2.67 insn per cycle - # 0.00 stalled cycles per insn (74.98%) - 4.691836216 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 568) (avx2: 0) (512y: 0) (512z: 0) + 16,262,813,557 cycles # 2.863 GHz + 43,266,807,177 instructions # 2.66 insn per cycle + 5.681729392 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 662) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926105795 Relative difference = 2.1036172727915933e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 4.273752e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.480636e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.480636e+05 ) sec^-1 
-MeanMatrixElemValue = ( 7.211102e+00 +- 1.606204e-01 ) GeV^0 -TOTAL : 2.640756 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.290556e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.463505e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.463505e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 3.298765 sec INFO: No Floating Point Exceptions have been reported - 8,858,395,530 cycles:u # 3.338 GHz (75.01%) - 9,289,255 stalled-cycles-frontend:u # 0.10% frontend cycles idle (74.98%) - 674,552,327 stalled-cycles-backend:u # 7.61% backend cycles idle (74.98%) - 25,064,616,066 instructions:u # 2.83 insn per cycle - # 0.03 stalled cycles per insn (74.99%) - 2.658338391 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2130) (avx2: 0) (512y: 0) (512z: 0) + 9,454,937,516 cycles # 2.862 GHz + 25,430,832,847 instructions # 2.69 insn per cycle + 3.304226277 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2268) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926105804 Relative difference = 2.103617270732513e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 6.611385e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.050599e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.050599e+05 ) sec^-1 
-MeanMatrixElemValue = ( 7.211102e+00 +- 1.606204e-01 ) GeV^0 -TOTAL : 1.772426 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.695348e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.042916e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.042916e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 2.345615 sec +INFO: No Floating Point Exceptions have been reported + 6,296,882,273 cycles # 2.679 GHz + 13,638,682,807 instructions # 2.17 insn per cycle + 2.351107442 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2629) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 4.313472e+00 +Avg ME (F77/C++) = 4.3134710926107935 +Relative difference = 2.103616776553298e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP 
threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.910957e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.286382e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.286382e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 2.246822 sec INFO: No Floating Point Exceptions have been reported - 5,851,359,243 cycles:u # 3.277 GHz (74.95%) - 9,366,635 stalled-cycles-frontend:u # 0.16% frontend cycles idle (74.91%) - 1,273,367,798 stalled-cycles-backend:u # 21.76% backend cycles idle (74.92%) - 13,603,531,303 instructions:u # 2.32 insn per cycle - # 0.09 stalled cycles per insn (74.93%) - 1.789679630 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2610) (512y: 0) (512z: 0) + 6,026,491,701 cycles # 2.677 GHz + 12,722,860,113 instructions # 2.11 insn per cycle + 2.252413644 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2146) (512y: 296) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926107935 Relative difference = 2.103616776553298e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.420299e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.596534e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.596534e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 3.177504 sec +INFO: No Floating Point Exceptions have been reported + 5,627,100,070 cycles # 1.769 GHz + 8,928,441,764 instructions # 1.59 insn per cycle + 3.183062200 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1357) (512y: 171) (512z: 1777) 
------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 4.313472e+00 +Avg ME (F77/C++) = 4.3134710926107935 +Relative difference = 2.103616776553298e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt index fbecf1d626..a9079e9716 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt @@ -1,181 +1,228 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for 
tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-05-15_16:44:45 +DATE: 2024-05-16_15:21:23 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.357473e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.206105e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.296255e+08 ) sec^-1 -MeanMatrixElemValue = ( 6.834692e+00 +- 1.463624e-01 ) GeV^0 -TOTAL : 0.325039 sec +EvtsPerSec[Rmb+ME] (23) = ( 9.566221e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.504693e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.775023e+08 ) sec^-1 +MeanMatrixElemValue = ( 7.154219e+00 
+- 1.620281e-01 ) GeV^0 +TOTAL : 0.485925 sec INFO: No Floating Point Exceptions have been reported - 766,423,261 cycles:u # 2.194 GHz (73.78%) - 2,101,254 stalled-cycles-frontend:u # 0.27% frontend cycles idle (73.18%) - 5,549,032 stalled-cycles-backend:u # 0.72% backend cycles idle (74.86%) - 1,246,921,500 instructions:u # 1.63 insn per cycle - # 0.00 stalled cycles per insn (75.94%) - 0.378180737 seconds time elapsed + 2,003,287,538 cycles # 2.816 GHz + 2,880,414,118 instructions # 1.44 insn per cycle + 0.769648039 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/GPU) = 4.313524e+00 -Avg ME (F77/GPU) = 4.3135525460820645 -Relative difference = 6.617809954082434e-06 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 4.313490e+00 +Avg ME (F77/GPU) = 4.3136695463908836 +Relative difference = 4.162439020000051e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = 
SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.691865e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.764922e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.764922e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.315915e+00 +- 1.953829e-01 ) GeV^0 -TOTAL : 4.045298 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.938364e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.994818e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.994818e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.175644e+00 +- 1.658767e-01 ) GeV^0 +TOTAL : 5.499884 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 13,816,308,539 cycles:u # 3.408 GHz (74.96%) - 17,510,892 stalled-cycles-frontend:u # 0.13% frontend cycles idle (74.94%) - 1,202,254,806 stalled-cycles-backend:u # 8.70% backend cycles idle (74.95%) - 40,754,489,444 instructions:u # 2.95 insn per cycle - # 0.03 stalled cycles per insn (74.95%) - 4.058642606 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 598) (avx2: 0) (512y: 0) (512z: 0) + 15,743,516,639 cycles # 2.861 GHz + 42,225,863,593 instructions # 2.68 insn per cycle + 5.505101290 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 601) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Avg ME (C++/C++) = 4.313574e+00 -Avg ME (F77/C++) = 4.3135737644042820 -Relative difference = 5.461728906135488e-08 +Avg ME (F77/C++) = 4.3135739049175754 +Relative difference = 2.2042608890083832e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] 
('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 5.928754e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.311008e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.311008e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.315915e+00 +- 1.953829e-01 ) GeV^0 -TOTAL : 1.917449 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.494085e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.834702e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.834702e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.175642e+00 +- 1.658767e-01 ) GeV^0 +TOTAL : 2.423560 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 6,451,480,565 cycles:u # 3.350 GHz (75.04%) - 12,106,927 stalled-cycles-frontend:u # 0.19% frontend cycles idle (75.08%) - 2,462,093,126 stalled-cycles-backend:u # 38.16% backend cycles idle (75.08%) - 16,252,754,111 instructions:u # 2.52 insn per cycle - # 0.15 stalled cycles per insn (75.08%) - 1.930581694 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2735) (avx2: 0) (512y: 0) (512z: 0) + 6,948,197,620 cycles # 2.861 GHz + 16,919,710,710 instructions # 2.44 insn per cycle + 2.428887408 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2983) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -Avg ME (C++/C++) = 4.313573e+00 -Avg ME (F77/C++) = 4.3135733148083091 -Relative difference = 7.298086973342306e-08 +Avg ME (C++/C++) = 4.313572e+00 +Avg ME (F77/C++) = 4.3135722205042839 +Relative difference = 5.111872113533787e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal 
loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.154310e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.294552e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.294552e+06 ) sec^-1 -MeanMatrixElemValue = ( 7.289197e+00 +- 1.809101e-01 ) GeV^0 -TOTAL : 1.057858 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.820914e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.816967e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.816967e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 +TOTAL : 1.429543 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 3,495,103,075 cycles:u # 3.277 GHz (74.56%) - 8,963,767 stalled-cycles-frontend:u # 0.26% frontend cycles idle (74.80%) - 1,195,464,933 stalled-cycles-backend:u # 34.20% backend cycles idle (75.17%) - 8,024,650,910 instructions:u # 2.30 insn per cycle - # 0.15 stalled cycles per insn (75.25%) - 1.070740395 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3309) (512y: 0) (512z: 0) + 3,855,960,900 cycles # 2.689 GHz + 7,989,689,028 instructions # 2.07 insn per cycle + 1.434693752 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3289) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Avg ME (C++/C++) = 4.313565e+00 -Avg ME (F77/C++) = 4.3135650876211002 -Relative difference = 2.03129199623388e-08 +Avg ME (F77/C++) = 4.3135645699221641 +Relative difference = 9.97035713074993e-08 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.282128e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.407558e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.407558e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 +TOTAL : 1.355217 sec +INFO: The following Floating Point Exceptions 
have been reported: FE_UNDERFLOW + 3,662,603,190 cycles # 2.693 GHz + 7,491,885,625 instructions # 2.05 insn per cycle + 1.360533114 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3036) (512y: 23) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +Avg ME (C++/C++) = 4.313565e+00 +Avg ME (F77/C++) = 4.3135645699221641 +Relative difference = 9.97035713074993e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT 
(NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.072932e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.653576e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.653576e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 +TOTAL : 1.816585 sec +INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW + 3,322,287,385 cycles # 1.825 GHz + 5,988,754,595 instructions # 1.80 insn per cycle + 1.821834164 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2418) (512y: 32) (512z: 2031) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +Avg ME (C++/C++) = 4.313564e+00 +Avg ME (F77/C++) = 4.3135643783025444 +Relative difference = 8.770069111236825e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt index 61935ba277..0359df7b77 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt +++ 
b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt @@ -1,181 +1,228 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-05-15_16:44:57 +DATE: 2024-05-16_15:21:44 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = 
SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.877469e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.434218e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.544198e+08 ) sec^-1 -MeanMatrixElemValue = ( 6.834692e+00 +- 1.463624e-01 ) GeV^0 -TOTAL : 0.328664 sec +EvtsPerSec[Rmb+ME] (23) = ( 9.575897e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.505600e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.778243e+08 ) sec^-1 +MeanMatrixElemValue = ( 7.154219e+00 +- 1.620281e-01 ) GeV^0 +TOTAL : 0.488373 sec INFO: No Floating Point Exceptions have been reported - 781,773,953 cycles:u # 2.213 GHz (75.17%) - 2,197,289 stalled-cycles-frontend:u # 0.28% frontend cycles idle (74.96%) - 4,528,950 stalled-cycles-backend:u # 0.58% backend cycles idle (74.26%) - 1,362,708,003 instructions:u # 1.74 insn per cycle - # 0.00 stalled cycles per insn (74.60%) - 0.381588531 seconds time elapsed + 2,007,752,645 cycles # 2.812 GHz + 2,828,437,251 instructions # 1.41 insn per cycle + 0.772837040 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/GPU) = 4.313524e+00 -Avg ME (F77/GPU) = 4.3135525460820645 -Relative difference = 6.617809954082434e-06 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 4.313490e+00 +Avg ME (F77/GPU) = 4.3136695463908836 +Relative difference = 4.162439020000051e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = 
SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.695325e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.769333e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.769333e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.315915e+00 +- 1.953829e-01 ) GeV^0 -TOTAL : 4.041029 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.991117e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.050649e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.050649e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.175644e+00 +- 1.658767e-01 ) GeV^0 +TOTAL : 5.356246 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 13,837,051,782 cycles:u # 3.417 GHz (74.88%) - 17,812,024 stalled-cycles-frontend:u # 0.13% frontend cycles idle (74.92%) - 460,256,248 stalled-cycles-backend:u # 3.33% backend cycles idle (74.92%) - 41,450,204,858 instructions:u # 3.00 insn per cycle - # 0.01 stalled cycles per insn (74.99%) - 4.054379804 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 555) (avx2: 0) (512y: 0) (512z: 0) + 15,339,535,429 cycles # 2.862 GHz + 42,474,905,629 instructions # 2.77 insn per cycle + 5.361339903 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 559) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Avg ME (C++/C++) = 4.313574e+00 -Avg ME (F77/C++) = 4.3135737563716248 -Relative difference = 5.647947044645654e-08 +Avg ME (F77/C++) = 4.3135739491553977 +Relative difference = 1.1787117204016727e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] 
('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 6.985228e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.519730e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.519730e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.315915e+00 +- 1.953829e-01 ) GeV^0 -TOTAL : 1.650076 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.134209e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.583662e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.583662e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.175642e+00 +- 1.658767e-01 ) GeV^0 +TOTAL : 2.132369 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 5,523,168,879 cycles:u # 3.330 GHz (74.94%) - 12,695,760 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.92%) - 1,257,259,572 stalled-cycles-backend:u # 22.76% backend cycles idle (74.92%) - 16,074,513,667 instructions:u # 2.91 insn per cycle - # 0.08 stalled cycles per insn (74.94%) - 1.662951394 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2557) (avx2: 0) (512y: 0) (512z: 0) + 6,119,263,046 cycles # 2.864 GHz + 16,261,701,502 instructions # 2.66 insn per cycle + 2.137647028 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2702) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -Avg ME (C++/C++) = 4.313573e+00 -Avg ME (F77/C++) = 4.3135733148083091 -Relative difference = 7.298086973342306e-08 +Avg ME (C++/C++) = 4.313572e+00 +Avg ME (F77/C++) = 4.3135722205042839 +Relative difference = 5.111872113533787e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal 
loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 8.429409e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.164470e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.164470e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.289198e+00 +- 1.809101e-01 ) GeV^0 -TOTAL : 1.392686 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.498649e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.173623e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.173623e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 +TOTAL : 1.703269 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 4,615,911,149 cycles:u # 3.294 GHz (74.95%) - 9,345,938 stalled-cycles-frontend:u # 0.20% frontend cycles idle (74.89%) - 1,844,391,987 stalled-cycles-backend:u # 39.96% backend cycles idle (74.89%) - 10,105,850,224 instructions:u # 2.19 insn per cycle - # 0.18 stalled cycles per insn (74.90%) - 1.405788920 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3914) (512y: 0) (512z: 0) + 4,581,699,390 cycles # 2.683 GHz + 9,041,394,873 instructions # 1.97 insn per cycle + 1.708700782 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3558) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Avg ME (C++/C++) = 4.313565e+00 -Avg ME (F77/C++) = 4.3135650896001607 -Relative difference = 2.0771719231865954e-08 +Avg ME (F77/C++) = 4.3135645687580109 +Relative difference = 9.997345323075056e-08 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.705142e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.424759e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.424759e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 +TOTAL : 1.652652 sec +INFO: The following Floating Point Exceptions 
have been reported: FE_UNDERFLOW + 4,411,023,052 cycles # 2.662 GHz + 8,532,140,610 instructions # 1.93 insn per cycle + 1.658018216 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3311) (512y: 10) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +Avg ME (C++/C++) = 4.313565e+00 +Avg ME (F77/C++) = 4.3135645687580109 +Relative difference = 9.997345323075056e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = 
FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.118773e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.709641e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.709641e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 +TOTAL : 1.803301 sec +INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW + 3,302,699,675 cycles # 1.827 GHz + 5,958,419,273 instructions # 1.80 insn per cycle + 1.808538430 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2357) (512y: 32) (512z: 2014) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +Avg ME (C++/C++) = 4.313564e+00 +Avg ME (F77/C++) = 4.3135643783025444 +Relative difference = 8.770069111236825e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt index 6fca09810a..4345b3c851 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt +++ 
b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt @@ -1,181 +1,228 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-05-15_16:45:09 +DATE: 2024-05-16_15:22:04 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = 
SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.555583e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.455825e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.497430e+07 ) sec^-1 -MeanMatrixElemValue = ( 7.088120e+00 +- 1.629041e-01 ) GeV^0 -TOTAL : 0.382816 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.596790e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.087710e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.182609e+08 ) sec^-1 +MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 0.531791 sec INFO: No Floating Point Exceptions have been reported - 928,372,758 cycles:u # 2.270 GHz (75.49%) - 2,253,270 stalled-cycles-frontend:u # 0.24% frontend cycles idle (74.70%) - 5,952,343 stalled-cycles-backend:u # 0.64% backend cycles idle (74.88%) - 1,478,001,511 instructions:u # 1.59 insn per cycle - # 0.00 stalled cycles per insn (74.89%) - 0.437350694 seconds time elapsed + 2,158,479,665 cycles # 2.816 GHz + 3,115,947,911 instructions # 1.44 insn per cycle + 0.824595914 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 226 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 4.313472e+00 Avg ME (F77/GPU) = 4.3134711012809239 Relative difference = 2.0835166567625394e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.308780e+05 ) 
sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.363378e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.363378e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.211102e+00 +- 1.606204e-01 ) GeV^0 -TOTAL : 4.728958 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.739846e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.783362e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.783362e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 6.139062 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 16,171,155,664 cycles:u # 3.411 GHz (74.92%) - 38,149,325 stalled-cycles-frontend:u # 0.24% frontend cycles idle (74.93%) - 2,405,972,446 stalled-cycles-backend:u # 14.88% backend cycles idle (75.01%) - 41,551,815,411 instructions:u # 2.57 insn per cycle - # 0.06 stalled cycles per insn (75.03%) - 4.746198211 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 635) (avx2: 0) (512y: 0) (512z: 0) + 17,579,172,412 cycles # 2.862 GHz + 41,767,715,738 instructions # 2.38 insn per cycle + 6.144566394 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 655) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134711778082178 Relative difference = 1.906102050071626e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 
-EvtsPerSec[Rmb+ME] (23) = ( 4.244198e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.448974e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.448974e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.211102e+00 +- 1.606204e-01 ) GeV^0 -TOTAL : 2.656590 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.944235e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.080846e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.080846e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 3.674671 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 8,978,630,703 cycles:u # 3.363 GHz (74.87%) - 15,441,593 stalled-cycles-frontend:u # 0.17% frontend cycles idle (74.83%) - 1,827,739,952 stalled-cycles-backend:u # 20.36% backend cycles idle (74.93%) - 25,970,891,941 instructions:u # 2.89 insn per cycle - # 0.07 stalled cycles per insn (75.08%) - 2.673866298 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2393) (avx2: 0) (512y: 0) (512z: 0) + 10,157,870,701 cycles # 2.761 GHz + 26,355,211,403 instructions # 2.59 insn per cycle + 3.680088821 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2438) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134711778082178 Relative difference = 1.906102050071626e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 
-EvtsPerSec[Rmb+ME] (23) = ( 7.363289e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.903132e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.903132e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.211102e+00 +- 1.606204e-01 ) GeV^0 -TOTAL : 1.612163 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.512494e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.830362e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.830362e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 2.435332 sec +INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW + 6,512,604,303 cycles # 2.669 GHz + 12,120,159,732 instructions # 1.86 insn per cycle + 2.440902409 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2718) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +Avg ME (C++/C++) = 4.313472e+00 +Avg ME (F77/C++) = 4.3134712319139954 +Relative difference = 1.7806676491157786e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.920988e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.300442e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.300442e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 2.244169 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 5,294,121,423 cycles:u # 3.258 GHz (74.95%) - 18,540,101 stalled-cycles-frontend:u # 0.35% frontend cycles idle (74.90%) - 1,333,246,582 stalled-cycles-backend:u # 25.18% backend cycles idle (74.91%) - 12,289,556,764 instructions:u # 2.32 insn per cycle - # 0.11 stalled cycles per insn (74.91%) - 1.629350250 seconds time elapsed 
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2788) (512y: 0) (512z: 0) + 6,018,583,564 cycles # 2.676 GHz + 11,228,279,694 instructions # 1.87 insn per cycle + 2.249711111 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2369) (512y: 150) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134712319139954 Relative difference = 1.7806676491157786e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] 
[inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.148571e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.297302e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.297302e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 3.442171 sec +INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW + 6,072,730,798 cycles # 1.762 GHz + 8,215,005,190 instructions # 1.35 insn per cycle + 3.447734816 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1787) (512y: 134) (512z: 1755) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +Avg ME (C++/C++) = 4.313472e+00 +Avg ME (F77/C++) = 4.3134712319139954 +Relative difference = 1.7806676491157786e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt index 2e6d2d9565..fc67fec042 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt @@ -1,181 +1,228 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
-make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-05-15_16:45:24 +DATE: 2024-05-16_15:22:31 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.543953e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.380252e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.419621e+07 ) sec^-1 -MeanMatrixElemValue = ( 7.088120e+00 +- 1.629041e-01 ) GeV^0 -TOTAL : 0.380255 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.615689e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.096145e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.193163e+08 ) sec^-1 +MeanMatrixElemValue = ( 7.148017e+00 
+- 1.609110e-01 ) GeV^0 +TOTAL : 0.527662 sec INFO: No Floating Point Exceptions have been reported - 914,534,531 cycles:u # 2.249 GHz (74.00%) - 2,062,532 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.34%) - 5,541,049 stalled-cycles-backend:u # 0.61% backend cycles idle (74.67%) - 1,426,424,976 instructions:u # 1.56 insn per cycle - # 0.00 stalled cycles per insn (76.24%) - 0.436887536 seconds time elapsed + 2,187,091,067 cycles # 2.822 GHz + 3,143,599,790 instructions # 1.44 insn per cycle + 0.831715891 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 4.313472e+00 Avg ME (F77/GPU) = 4.3134711012809239 Relative difference = 2.0835166567625394e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.297575e+05 ) 
sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.351788e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.351788e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.211102e+00 +- 1.606204e-01 ) GeV^0 -TOTAL : 4.749421 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.750132e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.794255e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.794255e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 6.103500 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 16,188,336,443 cycles:u # 3.399 GHz (74.99%) - 32,936,332 stalled-cycles-frontend:u # 0.20% frontend cycles idle (74.97%) - 56,339,433 stalled-cycles-backend:u # 0.35% backend cycles idle (74.97%) - 42,758,696,712 instructions:u # 2.64 insn per cycle - # 0.00 stalled cycles per insn (74.98%) - 4.766796730 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 577) (avx2: 0) (512y: 0) (512z: 0) + 17,473,867,626 cycles # 2.861 GHz + 43,052,630,037 instructions # 2.46 insn per cycle + 6.108967949 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 651) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134711778082178 Relative difference = 1.906102050071626e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 
-EvtsPerSec[Rmb+ME] (23) = ( 4.205143e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.404626e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.404626e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.211102e+00 +- 1.606204e-01 ) GeV^0 -TOTAL : 2.678632 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.176372e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.336517e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.336517e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 3.414423 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 9,037,296,715 cycles:u # 3.357 GHz (74.97%) - 15,503,386 stalled-cycles-frontend:u # 0.17% frontend cycles idle (75.04%) - 1,245,986,582 stalled-cycles-backend:u # 13.79% backend cycles idle (75.04%) - 24,980,136,778 instructions:u # 2.76 insn per cycle - # 0.05 stalled cycles per insn (75.04%) - 2.695735212 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2226) (avx2: 0) (512y: 0) (512z: 0) + 9,783,940,024 cycles # 2.862 GHz + 25,167,910,576 instructions # 2.57 insn per cycle + 3.420037518 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2276) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134711778082178 Relative difference = 1.906102050071626e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 
-EvtsPerSec[Rmb+ME] (23) = ( 6.402147e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.813045e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.813045e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.211102e+00 +- 1.606204e-01 ) GeV^0 -TOTAL : 1.823690 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.178030e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.451835e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.451835e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 2.622185 sec +INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW + 7,019,924,583 cycles # 2.672 GHz + 12,790,606,448 instructions # 1.82 insn per cycle + 2.627804246 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2699) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +Avg ME (C++/C++) = 4.313472e+00 +Avg ME (F77/C++) = 4.3134712319139954 +Relative difference = 1.7806676491157786e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.488078e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.801083e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.801083e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 2.447720 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 6,060,962,198 cycles:u # 3.300 GHz (74.81%) - 18,276,032 stalled-cycles-frontend:u # 0.30% frontend cycles idle (75.03%) - 697,613,701 stalled-cycles-backend:u # 11.51% backend cycles idle (75.17%) - 13,609,061,030 instructions:u # 2.25 insn per cycle - # 0.05 stalled cycles per insn (75.17%) - 1.840736491 seconds time elapsed 
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2930) (512y: 0) (512z: 0) + 6,546,937,322 cycles # 2.670 GHz + 12,109,881,739 instructions # 1.85 insn per cycle + 2.453162643 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2351) (512y: 227) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134712319139954 Relative difference = 1.7806676491157786e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] 
[inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.983756e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.117708e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.117708e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 +TOTAL : 3.627336 sec +INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW + 6,377,996,877 cycles # 1.756 GHz + 8,984,744,450 instructions # 1.41 insn per cycle + 3.632964633 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1892) (512y: 178) (512z: 2083) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +Avg ME (C++/C++) = 4.313472e+00 +Avg ME (F77/C++) = 4.3134712319139954 +Relative difference = 1.7806676491157786e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt index 88c906df0a..f2a95b68c4 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt @@ -1,193 +1,233 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 
'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-05-15_16:43:44 +DATE: 2024-05-16_15:19:32 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.571380e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.274438e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.280380e+03 ) sec^-1 -MeanMatrixElemValue = ( 5.989810e-05 +- 3.867612e-05 ) GeV^-4 -TOTAL : 0.397939 sec +EvtsPerSec[Rmb+ME] (23) = ( 3.205899e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.229515e+04 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.233614e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.467479 sec INFO: No Floating Point Exceptions have been reported - 1,042,961,084 cycles:u # 2.645 GHz (72.61%) - 2,254,086 stalled-cycles-frontend:u # 0.22% frontend cycles idle (73.57%) - 5,624,084 stalled-cycles-backend:u # 0.54% backend cycles idle (75.57%) - 1,460,383,473 instructions:u # 1.40 insn per cycle - # 0.00 stalled cycles per insn (75.70%) - 0.444657235 seconds time elapsed + 1,929,394,895 cycles # 2.809 GHz + 2,774,653,842 instructions # 1.44 insn per cycle + 0.745241861 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
-runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.637055e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.860714e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.862696e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.402315e-01 +- 3.184905e-01 ) GeV^-4 -TOTAL : 0.446265 sec +EvtsPerSec[Rmb+ME] (23) = ( 7.854750e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.994181e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.003911e+05 ) sec^-1 +MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 +TOTAL : 0.485365 sec INFO: No Floating Point Exceptions have been reported - 1,096,603,690 cycles:u # 2.605 GHz (74.76%) - 2,298,126 stalled-cycles-frontend:u # 0.21% frontend cycles idle (75.32%) - 5,734,466 stalled-cycles-backend:u # 0.52% backend cycles idle (75.32%) - 1,520,111,994 instructions:u # 1.39 insn per cycle - # 0.00 stalled cycles per insn (75.47%) - 0.492332333 seconds time elapsed + 1,990,830,698 cycles # 2.816 GHz + 2,942,277,354 instructions # 1.48 insn per cycle + 0.765598417 seconds time elapsed ------------------------------------------------------------------------- -runExe 
/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 8.127459e-06 -Avg ME (F77/GPU) = 8.1274562860176587E-006 -Relative difference = 3.3392753387325367e-07 +Avg ME (F77/GPU) = 8.1274562860176604E-006 +Relative difference = 3.3392753366481633e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = 
SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 4.055534e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.059258e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.059258e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.266821e-01 +- 1.264895e-01 ) GeV^-4 -TOTAL : 0.136200 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.339413e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.342602e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.342602e+03 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.163415 sec INFO: No Floating Point Exceptions have been reported - 469,638,655 cycles:u # 3.358 GHz (72.85%) - 305,767 stalled-cycles-frontend:u # 0.07% frontend cycles idle (75.72%) - 73,244,886 stalled-cycles-backend:u # 15.60% backend cycles idle (77.13%) - 1,384,227,102 instructions:u # 2.95 insn per cycle - # 0.05 stalled cycles per insn (77.13%) - 0.143889231 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1926) (avx2: 0) (512y: 0) (512z: 0) + 474,956,853 cycles # 2.847 GHz + 1,396,923,375 instructions # 2.94 insn per cycle + 0.167372542 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3991) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/runTest.exe +runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 -Avg ME (F77/C++) = 8.1274562860167168E-006 -Relative difference = 3.3392764976441195e-07 +Avg ME (F77/C++) = 8.1274562860167185E-006 +Relative difference = 3.339276495559746e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 9.104355e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.122659e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.122659e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.266821e-01 +- 1.264895e-01 ) GeV^-4 -TOTAL : 0.063783 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.350685e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.362490e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.362490e+03 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.088282 sec INFO: No Floating Point Exceptions have been reported - 221,158,312 cycles:u # 3.286 GHz (72.89%) - 91,517 stalled-cycles-frontend:u # 0.04% frontend cycles idle (76.27%) - 21,802,614 stalled-cycles-backend:u # 9.86% backend cycles idle (76.25%) - 658,008,213 instructions:u # 2.98 insn per cycle - # 0.03 stalled cycles per insn (76.25%) - 0.071927257 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 9270) (avx2: 0) (512y: 0) (512z: 0) + 246,129,842 cycles # 2.680 GHz + 699,160,574 instructions # 2.84 insn per cycle + 0.092454839 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 9501) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860167168E-006 Relative difference = 3.3392764976441195e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.092186e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = 
( 2.101709e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.101709e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.266821e-01 +- 1.264895e-01 ) GeV^-4 -TOTAL : 0.031010 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.421076e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.426847e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.426847e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.042262 sec +INFO: No Floating Point Exceptions have been reported + 120,513,094 cycles # 2.641 GHz + 260,079,134 instructions # 2.16 insn per cycle + 0.046206481 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8227) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127459e-06 +Avg ME (F77/C++) = 8.1274562860174791E-006 +Relative difference = 3.3392755596761116e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP 
precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.614262e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.622122e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.622122e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.037855 sec INFO: No Floating Point Exceptions have been reported - 90,316,893 cycles:u # 2.613 GHz (76.96%) - 123,588 stalled-cycles-frontend:u # 0.14% frontend cycles idle (76.90%) - 11,871,527 stalled-cycles-backend:u # 13.14% backend cycles idle (76.90%) - 231,305,307 instructions:u # 2.56 insn per cycle - # 0.05 stalled cycles per insn (76.90%) - 0.038968170 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 8240) (512y: 0) (512z: 0) + 109,022,775 cycles # 2.645 GHz + 240,308,972 instructions # 2.20 insn per cycle + 0.041904895 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7348) (512y: 150) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860174791E-006 Relative difference = 3.3392755596761116e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.170349e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.175260e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.175260e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.050252 sec +INFO: No Floating Point Exceptions have been reported + 96,595,554 cycles # 1.802 GHz + 138,452,128 instructions # 1.43 insn per cycle + 0.054148545 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1692) 
(512y: 126) (512z: 6592) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127459e-06 +Avg ME (F77/C++) = 8.1274562860174791E-006 +Relative difference = 3.3392755596761116e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt index e73b1c13cb..ca894b0a6d 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt @@ -1,193 +1,233 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand 
+HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-05-15_16:43:50 +DATE: 2024-05-16_15:19:42 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.397596e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.037256e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.048842e+03 ) sec^-1 -MeanMatrixElemValue = ( 5.989810e-05 +- 3.867612e-05 ) GeV^-4 -TOTAL : 0.373237 sec +EvtsPerSec[Rmb+ME] (23) = ( 3.237277e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.263102e+04 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.267367e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.467317 sec INFO: No Floating Point Exceptions have been reported - 996,488,380 cycles:u # 2.539 GHz (73.77%) - 2,078,558 stalled-cycles-frontend:u # 0.21% frontend cycles idle (75.45%) - 5,393,212 stalled-cycles-backend:u # 0.54% backend cycles idle (75.57%) - 1,406,194,147 instructions:u # 1.41 insn per cycle - # 0.00 stalled cycles per insn (75.46%) - 0.418570908 seconds time elapsed + 1,933,877,717 cycles # 2.813 GHz + 2,829,779,417 instructions # 1.46 insn per cycle + 0.746133258 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
-runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.636419e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.866260e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.868188e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.402315e-01 +- 3.184905e-01 ) GeV^-4 -TOTAL : 0.398664 sec +EvtsPerSec[Rmb+ME] (23) = ( 7.945887e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.087010e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.096853e+05 ) sec^-1 +MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 +TOTAL : 0.483889 sec INFO: No Floating Point Exceptions have been reported - 1,054,749,544 cycles:u # 2.525 GHz (75.23%) - 2,235,164 stalled-cycles-frontend:u # 0.21% frontend cycles idle (75.15%) - 5,745,937 stalled-cycles-backend:u # 0.54% backend cycles idle (75.06%) - 1,536,892,171 instructions:u # 1.46 insn per cycle - # 0.00 stalled cycles per insn (75.07%) - 0.444437629 seconds time elapsed + 2,005,783,112 cycles # 2.816 GHz + 2,927,359,248 instructions # 1.46 insn per cycle + 0.768925329 seconds time elapsed ------------------------------------------------------------------------- -runExe 
/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 8.127459e-06 -Avg ME (F77/GPU) = 8.1274562860176587E-006 -Relative difference = 3.3392753387325367e-07 +Avg ME (F77/GPU) = 8.1274562860176604E-006 +Relative difference = 3.3392753366481633e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = 
SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 4.046938e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.050659e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.050659e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.266821e-01 +- 1.264895e-01 ) GeV^-4 -TOTAL : 0.135474 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.344408e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.347652e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.347652e+03 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.162339 sec INFO: No Floating Point Exceptions have been reported - 467,381,959 cycles:u # 3.358 GHz (71.77%) - 243,713 stalled-cycles-frontend:u # 0.05% frontend cycles idle (72.79%) - 63,791,087 stalled-cycles-backend:u # 13.65% backend cycles idle (75.68%) - 1,382,666,807 instructions:u # 2.96 insn per cycle - # 0.05 stalled cycles per insn (77.02%) - 0.143497691 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1902) (avx2: 0) (512y: 0) (512z: 0) + 471,806,818 cycles # 2.848 GHz + 1,391,948,601 instructions # 2.95 insn per cycle + 0.166295977 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3869) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/runTest.exe +runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 -Avg ME (F77/C++) = 8.1274562860167168E-006 -Relative difference = 3.3392764976441195e-07 +Avg ME (F77/C++) = 8.1274562860167185E-006 +Relative difference = 3.339276495559746e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = 
CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 9.203739e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.221862e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.221862e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.266821e-01 +- 1.264895e-01 ) GeV^-4 -TOTAL : 0.062385 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.367799e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.379601e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.379601e+03 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.087176 sec INFO: No Floating Point Exceptions have been reported - 229,383,102 cycles:u # 3.477 GHz (72.84%) - 60,907 stalled-cycles-frontend:u # 0.03% frontend cycles idle (76.09%) - 26,884,644 stalled-cycles-backend:u # 11.72% backend cycles idle (75.77%) - 657,622,182 instructions:u # 2.87 insn per cycle - # 0.04 stalled cycles per insn (75.77%) - 0.069942425 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 9325) (avx2: 0) (512y: 0) (512z: 0) + 243,999,829 cycles # 2.694 GHz + 695,186,413 instructions # 2.85 insn per cycle + 0.091139423 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 9537) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860167168E-006 Relative difference = 3.3392764976441195e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.064693e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = 
( 2.073925e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.073925e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.266821e-01 +- 1.264895e-01 ) GeV^-4 -TOTAL : 0.030580 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.395387e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.400899e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.400899e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.042187 sec +INFO: No Floating Point Exceptions have been reported + 119,801,052 cycles # 2.624 GHz + 255,741,591 instructions # 2.13 insn per cycle + 0.046174431 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8181) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127459e-06 +Avg ME (F77/C++) = 8.1274562860174791E-006 +Relative difference = 3.3392755596761116e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP 
precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.613988e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.621406e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.621406e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.037041 sec INFO: No Floating Point Exceptions have been reported - 119,094,038 cycles:u # 3.487 GHz (70.73%) - 108,700 stalled-cycles-frontend:u # 0.09% frontend cycles idle (75.15%) - 12,753,984 stalled-cycles-backend:u # 10.71% backend cycles idle (76.62%) - 227,488,187 instructions:u # 1.91 insn per cycle - # 0.06 stalled cycles per insn (76.62%) - 0.039136656 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 8196) (512y: 0) (512z: 0) + 106,534,081 cycles # 2.639 GHz + 235,917,118 instructions # 2.21 insn per cycle + 0.041041363 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7301) (512y: 150) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860174791E-006 Relative difference = 3.3392755596761116e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.167962e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.172897e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.172897e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.049548 sec +INFO: No Floating Point Exceptions have been reported + 94,554,513 cycles # 1.786 GHz + 133,899,064 instructions # 1.42 insn per cycle + 0.053428613 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1641) 
(512y: 126) (512z: 6597) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127459e-06 +Avg ME (F77/C++) = 8.1274562860174791E-006 +Relative difference = 3.3392755596761116e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt index 893d812320..f86e27869e 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt @@ -1,193 +1,233 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand 
+HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-05-15_17:28:05 +DATE: 2024-05-16_15:19:53 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.560113e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.781566e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.784057e+04 ) sec^-1 -MeanMatrixElemValue = ( 3.100300e-04 +- 2.256635e-04 ) GeV^-4 -TOTAL : 0.704020 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.541598e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.553658e+04 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 2.556693e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.188142e-04 +- 6.565203e-04 ) GeV^-4 +TOTAL : 0.467629 sec INFO: No Floating Point Exceptions have been reported - 880,874,724 cycles:u # 0.979 GHz (74.04%) - 2,147,177 stalled-cycles-frontend:u # 0.24% frontend cycles idle (74.75%) - 5,547,190 stalled-cycles-backend:u # 0.63% backend cycles idle (74.61%) - 1,325,567,626 instructions:u # 1.50 insn per cycle - # 0.00 stalled cycles per insn (76.31%) - 1.282830870 seconds time elapsed + 1,964,166,954 cycles # 2.815 GHz + 2,823,406,286 instructions # 1.44 insn per cycle + 0.754117473 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
-runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.245493e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.054672e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.062488e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.017784e-02 +- 5.681015e-02 ) GeV^-4 -TOTAL : 0.343683 sec +EvtsPerSec[Rmb+ME] (23) = ( 9.614317e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.731134e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.742615e+05 ) sec^-1 +MeanMatrixElemValue = ( 8.020493e-03 +- 4.025604e-03 ) GeV^-4 +TOTAL : 0.468434 sec INFO: No Floating Point Exceptions have been reported - 894,309,134 cycles:u # 2.418 GHz (74.08%) - 2,093,905 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.07%) - 5,318,017 stalled-cycles-backend:u # 0.59% backend cycles idle (74.98%) - 1,318,810,937 instructions:u # 1.47 insn per cycle - # 0.00 stalled cycles per insn (76.05%) - 0.392355820 seconds time elapsed + 1,946,164,211 cycles # 2.817 GHz + 2,847,399,547 instructions # 1.46 insn per cycle + 0.748191861 seconds time elapsed ------------------------------------------------------------------------- -runExe 
/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/GPU) = 8.127320e-06 -Avg ME (F77/GPU) = 8.1275379236374627E-006 -Relative difference = 2.681371441780168e-05 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 8.127250e-06 +Avg ME (F77/GPU) = 8.1272870954487585E-006 +Relative difference = 4.564329725014175e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = 
SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 4.487502e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.491674e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.491674e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.274747e-01 +- 1.272814e-01 ) GeV^-4 -TOTAL : 0.130349 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.448019e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.451516e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.451516e+03 ) sec^-1 +MeanMatrixElemValue = ( 7.177153e-04 +- 6.554185e-04 ) GeV^-4 +TOTAL : 0.158474 sec INFO: No Floating Point Exceptions have been reported - 425,670,365 cycles:u # 3.266 GHz (74.18%) - 93,417 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.47%) - 52,087,766 stalled-cycles-backend:u # 12.24% backend cycles idle (75.47%) - 1,281,047,178 instructions:u # 3.01 insn per cycle - # 0.04 stalled cycles per insn (75.47%) - 0.138010594 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1611) (avx2: 0) (512y: 0) (512z: 0) + 461,638,972 cycles # 2.852 GHz + 1,393,493,000 instructions # 3.02 insn per cycle + 0.162490485 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3070) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/runTest.exe +runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 8.127810e-06 -Avg ME (F77/C++) = 8.1278100323291073E-006 -Relative difference = 3.977591502689147e-09 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127811e-06 +Avg ME (F77/C++) = 8.1278105211728276E-006 +Relative difference = 5.891219330978222e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.744919e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.751382e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.751382e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.274746e-01 +- 1.272813e-01 ) GeV^-4 -TOTAL : 0.035846 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.201120e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.205395e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.205395e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.177152e-04 +- 6.554185e-04 ) GeV^-4 +TOTAL : 0.048902 sec INFO: No Floating Point Exceptions have been reported - 116,936,791 cycles:u # 2.993 GHz (71.07%) - 85,599 stalled-cycles-frontend:u # 0.07% frontend cycles idle (79.55%) - 17,084,457 stalled-cycles-backend:u # 14.61% backend cycles idle (79.55%) - 343,234,317 instructions:u # 2.94 insn per cycle - # 0.05 stalled cycles per insn (79.55%) - 0.042363924 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 9799) (avx2: 0) (512y: 0) (512z: 0) + 138,099,810 cycles # 2.644 GHz + 375,723,801 instructions # 2.72 insn per cycle + 0.052805368 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:10134) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 8.127807e-06 -Avg ME (F77/C++) = 8.1278071680283782E-006 -Relative difference = 2.0673273707686565e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127809e-06 +Avg ME (F77/C++) = 8.1278090510674588E-006 +Relative difference = 6.2830535070193674e-09 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) 
[cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 4.078259e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.113101e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.113101e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.275185e-01 +- 1.273251e-01 ) GeV^-4 -TOTAL : 0.018458 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.699468e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.721720e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.721720e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 +TOTAL : 0.024500 sec +INFO: No Floating Point Exceptions have been reported + 72,431,086 cycles # 2.595 GHz + 146,734,646 instructions # 2.03 insn per cycle + 0.028413255 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8933) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127537e-06 +Avg ME (F77/C++) = 8.1275366216540664E-006 +Relative difference = 4.655111786058001e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.950281e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.979563e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.979563e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 +TOTAL : 0.023199 sec INFO: No Floating Point Exceptions have been reported - 51,426,519 cycles:u # 2.372 GHz (54.23%) - 85,158 stalled-cycles-frontend:u # 0.17% frontend cycles idle (63.18%) - 5,142,747 stalled-cycles-backend:u # 10.00% backend cycles idle (63.06%) - 161,517,247 instructions:u # 3.14 insn per cycle - # 0.03 stalled cycles per insn (65.21%) - 0.024774594 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 8980) (512y: 0) (512z: 0) + 67,511,576 cycles # 2.517 GHz + 136,466,222 instructions # 2.02 insn per cycle + 
0.027372188 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8164) (512y: 28) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 8.127535e-06 -Avg ME (F77/C++) = 8.1275352476332691E-006 -Relative difference = 3.04684346075092e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127537e-06 +Avg ME (F77/C++) = 8.1275366216540664E-006 +Relative difference = 4.655111786058001e-08 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT 
(NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.260359e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.280493e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.280493e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.165747e-04 +- 6.542824e-04 ) GeV^-4 +TOTAL : 0.028479 sec +INFO: No Floating Point Exceptions have been reported + 59,124,236 cycles # 1.860 GHz + 85,286,285 instructions # 1.44 insn per cycle + 0.032355670 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2572) (512y: 32) (512z: 6935) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127537e-06 +Avg ME (F77/C++) = 8.1275369863475849E-006 +Relative difference = 1.6797726498700304e-09 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt index a9e0aa7328..2af7dd76f9 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt @@ -1,193 +1,233 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
-make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-05-15_17:35:17 +DATE: 2024-05-16_15:20:03 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.506626e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.731000e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.733224e+04 ) sec^-1 -MeanMatrixElemValue = ( 3.100300e-04 +- 2.256635e-04 ) GeV^-4 -TOTAL : 0.662830 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.561126e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.572400e+04 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 2.575387e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.188142e-04 +- 6.565203e-04 ) GeV^-4 +TOTAL : 0.469592 sec INFO: No Floating Point Exceptions have been reported - 841,102,010 cycles:u # 0.952 GHz (74.79%) - 2,443,961 stalled-cycles-frontend:u # 0.29% frontend cycles idle (75.15%) - 9,715,686 stalled-cycles-backend:u # 1.16% backend cycles idle (74.73%) - 1,393,812,582 instructions:u # 1.66 insn per cycle - # 0.01 stalled cycles per insn (74.73%) - 1.431059788 seconds time elapsed + 1,933,901,131 cycles # 2.816 GHz + 2,803,636,036 instructions # 1.45 insn per cycle + 0.744726293 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
-runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.112275e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.921648e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.928652e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.017784e-02 +- 5.681015e-02 ) GeV^-4 -TOTAL : 0.341706 sec +EvtsPerSec[Rmb+ME] (23) = ( 9.901730e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.003706e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.005157e+06 ) sec^-1 +MeanMatrixElemValue = ( 8.020495e-03 +- 4.025606e-03 ) GeV^-4 +TOTAL : 0.471774 sec INFO: No Floating Point Exceptions have been reported - 887,566,745 cycles:u # 2.451 GHz (73.80%) - 2,141,582 stalled-cycles-frontend:u # 0.24% frontend cycles idle (75.53%) - 5,143,326 stalled-cycles-backend:u # 0.58% backend cycles idle (75.73%) - 1,324,273,556 instructions:u # 1.49 insn per cycle - # 0.00 stalled cycles per insn (76.07%) - 0.387872508 seconds time elapsed + 1,934,886,385 cycles # 2.815 GHz + 2,830,776,229 instructions # 1.46 insn per cycle + 0.746474254 seconds time elapsed ------------------------------------------------------------------------- -runExe 
/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/GPU) = 8.127320e-06 -Avg ME (F77/GPU) = 8.1275379236391975E-006 -Relative difference = 2.681371463124516e-05 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 8.127250e-06 +Avg ME (F77/GPU) = 8.1272870252982758E-006 +Relative difference = 4.555698209723637e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = 
SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 4.488556e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.492713e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.492713e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.274747e-01 +- 1.272814e-01 ) GeV^-4 -TOTAL : 0.129216 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.452227e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.455705e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.455705e+03 ) sec^-1 +MeanMatrixElemValue = ( 7.177153e-04 +- 6.554185e-04 ) GeV^-4 +TOTAL : 0.157329 sec INFO: No Floating Point Exceptions have been reported - 407,181,652 cycles:u # 3.136 GHz (75.38%) - 88,691 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.38%) - 54,009,549 stalled-cycles-backend:u # 13.26% backend cycles idle (75.38%) - 1,287,120,920 instructions:u # 3.16 insn per cycle - # 0.04 stalled cycles per insn (75.36%) - 0.137386430 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1591) (avx2: 0) (512y: 0) (512z: 0) + 458,573,657 cycles # 2.854 GHz + 1,388,574,447 instructions # 3.03 insn per cycle + 0.161242660 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2959) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/runTest.exe +runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 8.127810e-06 -Avg ME (F77/C++) = 8.1278100323291073E-006 -Relative difference = 3.977591502689147e-09 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127811e-06 +Avg ME (F77/C++) = 8.1278105211728276E-006 +Relative difference = 5.891219330978222e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.738379e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.744914e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.744914e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.274746e-01 +- 1.272813e-01 ) GeV^-4 -TOTAL : 0.035335 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.204538e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.208976e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.208976e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.177152e-04 +- 6.554185e-04 ) GeV^-4 +TOTAL : 0.047932 sec INFO: No Floating Point Exceptions have been reported - 133,748,688 cycles:u # 3.469 GHz (63.03%) - 11,172 stalled-cycles-frontend:u # 0.01% frontend cycles idle (68.73%) - 17,933,333 stalled-cycles-backend:u # 13.41% backend cycles idle (79.28%) - 337,611,906 instructions:u # 2.52 insn per cycle - # 0.05 stalled cycles per insn (79.28%) - 0.041878350 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 9782) (avx2: 0) (512y: 0) (512z: 0) + 136,097,535 cycles # 2.652 GHz + 371,027,952 instructions # 2.73 insn per cycle + 0.051946079 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:10117) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 8.127807e-06 -Avg ME (F77/C++) = 8.1278071680283782E-006 -Relative difference = 2.0673273707686565e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127809e-06 +Avg ME (F77/C++) = 8.1278090510674588E-006 +Relative difference = 6.2830535070193674e-09 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) 
[cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 4.065045e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.100324e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.100324e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.275185e-01 +- 1.273251e-01 ) GeV^-4 -TOTAL : 0.017937 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.559391e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.580217e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.580217e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 +TOTAL : 0.024960 sec +INFO: No Floating Point Exceptions have been reported + 71,167,021 cycles # 2.517 GHz + 142,031,155 instructions # 2.00 insn per cycle + 0.028974311 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8887) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127537e-06 +Avg ME (F77/C++) = 8.1275366216540664E-006 +Relative difference = 4.655111786058001e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.102195e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.131341e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.131341e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 +TOTAL : 0.021142 sec INFO: No Floating Point Exceptions have been reported - 37,451,256 cycles:u # 1.775 GHz (62.17%) - 86,031 stalled-cycles-frontend:u # 0.23% frontend cycles idle (62.04%) - 8,141,981 stalled-cycles-backend:u # 21.74% backend cycles idle (61.89%) - 159,221,307 instructions:u # 4.25 insn per cycle - # 0.05 stalled cycles per insn (80.94%) - 0.024327932 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 8934) (512y: 0) (512z: 0) + 63,906,261 cycles # 2.611 GHz + 131,729,034 instructions # 2.06 insn per cycle + 
0.025029577 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8117) (512y: 28) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 8.127535e-06 -Avg ME (F77/C++) = 8.1275352476332691E-006 -Relative difference = 3.04684346075092e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127537e-06 +Avg ME (F77/C++) = 8.1275366216540664E-006 +Relative difference = 4.655111786058001e-08 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT 
(NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.321655e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.342179e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.342179e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.165747e-04 +- 6.542824e-04 ) GeV^-4 +TOTAL : 0.027241 sec +INFO: No Floating Point Exceptions have been reported + 57,621,926 cycles # 1.879 GHz + 80,488,160 instructions # 1.40 insn per cycle + 0.031258526 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2521) (512y: 32) (512z: 6939) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127537e-06 +Avg ME (F77/C++) = 8.1275369863475849E-006 +Relative difference = 1.6797726498700304e-09 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt index 2529d656d6..16ac12981a 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt @@ -1,193 +1,233 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
-make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-05-15_16:44:04 +DATE: 2024-05-16_15:20:13 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.185953e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.865624e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.868687e+03 ) sec^-1 -MeanMatrixElemValue = ( 5.989810e-05 +- 3.867612e-05 ) GeV^-4 -TOTAL : 0.375613 sec +EvtsPerSec[Rmb+ME] (23) = ( 3.172533e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.195464e+04 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.199217e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.467059 sec INFO: No Floating Point Exceptions have been reported - 996,156,178 cycles:u # 2.505 GHz (74.91%) - 2,179,510 stalled-cycles-frontend:u # 0.22% frontend cycles idle (75.89%) - 5,885,621 stalled-cycles-backend:u # 0.59% backend cycles idle (75.91%) - 1,405,535,472 instructions:u # 1.41 insn per cycle - # 0.00 stalled cycles per insn (75.88%) - 0.423052294 seconds time elapsed + 1,929,783,722 cycles # 2.812 GHz + 2,830,067,082 instructions # 1.47 insn per cycle + 0.744348567 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
-runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.667341e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.862376e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.864834e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.402315e-01 +- 3.184905e-01 ) GeV^-4 -TOTAL : 0.400383 sec +EvtsPerSec[Rmb+ME] (23) = ( 7.817494e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.954472e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.963776e+05 ) sec^-1 +MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 +TOTAL : 0.485210 sec INFO: No Floating Point Exceptions have been reported - 1,097,760,192 cycles:u # 2.596 GHz (74.90%) - 2,197,550 stalled-cycles-frontend:u # 0.20% frontend cycles idle (75.43%) - 5,522,644 stalled-cycles-backend:u # 0.50% backend cycles idle (75.43%) - 1,523,723,948 instructions:u # 1.39 insn per cycle - # 0.00 stalled cycles per insn (75.13%) - 0.445687936 seconds time elapsed + 1,989,265,248 cycles # 2.816 GHz + 2,972,405,087 instructions # 1.49 insn per cycle + 0.764721680 seconds time elapsed ------------------------------------------------------------------------- -runExe 
/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 8.127459e-06 -Avg ME (F77/GPU) = 8.1274562879405183E-006 -Relative difference = 3.336909458255062e-07 +Avg ME (F77/GPU) = 8.1274562879405200E-006 +Relative difference = 3.3369094561706885e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = 
SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 4.077506e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.080985e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.080985e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.266821e-01 +- 1.264895e-01 ) GeV^-4 -TOTAL : 0.135125 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.312127e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.315249e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.315249e+03 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.164695 sec INFO: No Floating Point Exceptions have been reported - 471,766,950 cycles:u # 3.409 GHz (73.17%) - 109,243 stalled-cycles-frontend:u # 0.02% frontend cycles idle (76.08%) - 67,422,540 stalled-cycles-backend:u # 14.29% backend cycles idle (76.89%) - 1,398,845,419 instructions:u # 2.97 insn per cycle - # 0.05 stalled cycles per insn (76.89%) - 0.141858850 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1922) (avx2: 0) (512y: 0) (512z: 0) + 479,517,658 cycles # 2.854 GHz + 1,405,303,424 instructions # 2.93 insn per cycle + 0.168655160 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3977) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/runTest.exe +runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562948736117E-006 Relative difference = 3.32837900190667e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': 
SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 9.209901e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.228330e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.228330e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.266821e-01 +- 1.264895e-01 ) GeV^-4 -TOTAL : 0.063049 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.589174e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.601629e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.601629e+03 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.085009 sec INFO: No Floating Point Exceptions have been reported - 225,759,845 cycles:u # 3.383 GHz (72.64%) - 69,036 stalled-cycles-frontend:u # 0.03% frontend cycles idle (76.15%) - 22,005,399 stalled-cycles-backend:u # 9.75% backend cycles idle (76.05%) - 653,313,185 instructions:u # 2.89 insn per cycle - # 0.03 stalled cycles per insn (76.05%) - 0.069812764 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 9115) (avx2: 0) (512y: 0) (512z: 0) + 242,672,694 cycles # 2.748 GHz + 691,102,866 instructions # 2.85 insn per cycle + 0.088915527 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 9324) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274563175290919E-006 Relative difference = 3.3005037703909805e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.099127e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = 
( 2.108206e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.108206e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.266821e-01 +- 1.264895e-01 ) GeV^-4 -TOTAL : 0.030729 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.402863e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.409241e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.409241e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.042757 sec +INFO: No Floating Point Exceptions have been reported + 119,836,607 cycles # 2.596 GHz + 257,882,084 instructions # 2.15 insn per cycle + 0.046733316 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8244) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127459e-06 +Avg ME (F77/C++) = 8.1274563450143301E-006 +Relative difference = 3.266686019634872e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP 
precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.611690e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.620124e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.620124e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.037856 sec INFO: No Floating Point Exceptions have been reported - 100,785,402 cycles:u # 2.976 GHz (70.80%) - 100,573 stalled-cycles-frontend:u # 0.10% frontend cycles idle (76.41%) - 11,158,072 stalled-cycles-backend:u # 11.07% backend cycles idle (76.41%) - 230,048,069 instructions:u # 2.28 insn per cycle - # 0.05 stalled cycles per insn (76.41%) - 0.037203826 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 8195) (512y: 0) (512z: 0) + 108,462,768 cycles # 2.631 GHz + 238,127,423 instructions # 2.20 insn per cycle + 0.041890123 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7342) (512y: 146) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274563450143301E-006 Relative difference = 3.266686019634872e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.150674e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.155466e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.155466e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.051477 sec +INFO: No Floating Point Exceptions have been reported + 99,538,839 cycles # 1.810 GHz + 139,339,349 instructions # 1.40 insn per cycle + 0.055665824 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1953) (512y: 
122) (512z: 6323) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127459e-06 +Avg ME (F77/C++) = 8.1274563450143301E-006 +Relative difference = 3.266686019634872e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt index a0ef7ed92e..96180e8a09 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt @@ -1,193 +1,233 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand 
+HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-05-15_16:44:10 +DATE: 2024-05-16_15:20:23 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.171905e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.718059e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.723486e+03 ) sec^-1 -MeanMatrixElemValue = ( 5.989810e-05 +- 3.867612e-05 ) GeV^-4 -TOTAL : 0.372303 sec +EvtsPerSec[Rmb+ME] (23) = ( 3.207087e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.230616e+04 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 3.234507e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.468179 sec INFO: No Floating Point Exceptions have been reported - 1,032,364,763 cycles:u # 2.623 GHz (73.57%) - 2,107,596 stalled-cycles-frontend:u # 0.20% frontend cycles idle (75.49%) - 5,375,214 stalled-cycles-backend:u # 0.52% backend cycles idle (75.64%) - 1,465,343,007 instructions:u # 1.42 insn per cycle - # 0.00 stalled cycles per insn (75.65%) - 0.416772045 seconds time elapsed + 1,938,727,271 cycles # 2.813 GHz + 2,835,562,501 instructions # 1.46 insn per cycle + 0.747262841 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
-runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.675908e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.870343e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.872814e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.402315e-01 +- 3.184905e-01 ) GeV^-4 -TOTAL : 0.396176 sec +EvtsPerSec[Rmb+ME] (23) = ( 7.924846e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.065621e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.075056e+05 ) sec^-1 +MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 +TOTAL : 0.482793 sec INFO: No Floating Point Exceptions have been reported - 1,063,363,462 cycles:u # 2.553 GHz (75.05%) - 2,257,964 stalled-cycles-frontend:u # 0.21% frontend cycles idle (75.21%) - 5,834,320 stalled-cycles-backend:u # 0.55% backend cycles idle (75.36%) - 1,534,492,292 instructions:u # 1.44 insn per cycle - # 0.00 stalled cycles per insn (75.16%) - 0.437685435 seconds time elapsed + 2,011,507,022 cycles # 2.818 GHz + 2,962,288,052 instructions # 1.47 insn per cycle + 0.770325801 seconds time elapsed ------------------------------------------------------------------------- -runExe 
/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 8.127459e-06 -Avg ME (F77/GPU) = 8.1274562879405183E-006 -Relative difference = 3.336909458255062e-07 +Avg ME (F77/GPU) = 8.1274562879405200E-006 +Relative difference = 3.3369094561706885e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = 
SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 4.041187e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.044595e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.044595e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.266821e-01 +- 1.264895e-01 ) GeV^-4 -TOTAL : 0.135649 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.325014e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.328184e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.328184e+03 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.163699 sec INFO: No Floating Point Exceptions have been reported - 472,699,488 cycles:u # 3.402 GHz (71.30%) - 418,182 stalled-cycles-frontend:u # 0.09% frontend cycles idle (73.66%) - 55,247,137 stalled-cycles-backend:u # 11.69% backend cycles idle (76.54%) - 1,395,574,098 instructions:u # 2.95 insn per cycle - # 0.04 stalled cycles per insn (76.98%) - 0.142378440 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1898) (avx2: 0) (512y: 0) (512z: 0) + 475,740,171 cycles # 2.851 GHz + 1,400,755,519 instructions # 2.94 insn per cycle + 0.167716370 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3871) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/runTest.exe +runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562948736117E-006 Relative difference = 3.32837900190667e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': 
SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 9.298475e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.316543e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.316543e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.266821e-01 +- 1.264895e-01 ) GeV^-4 -TOTAL : 0.061721 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.586616e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.599028e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.599028e+03 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.084604 sec INFO: No Floating Point Exceptions have been reported - 209,423,840 cycles:u # 3.228 GHz (72.60%) - 182,070 stalled-cycles-frontend:u # 0.09% frontend cycles idle (75.36%) - 23,040,744 stalled-cycles-backend:u # 11.00% backend cycles idle (75.36%) - 649,629,538 instructions:u # 3.10 insn per cycle - # 0.04 stalled cycles per insn (75.36%) - 0.068157982 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 9168) (avx2: 0) (512y: 0) (512z: 0) + 242,310,895 cycles # 2.753 GHz + 687,440,781 instructions # 2.84 insn per cycle + 0.088664129 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 9365) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274563175290919E-006 Relative difference = 3.3005037703909805e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.122127e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = 
( 2.131047e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.131047e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.266821e-01 +- 1.264895e-01 ) GeV^-4 -TOTAL : 0.029826 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.421509e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.427219e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.427219e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.041396 sec +INFO: No Floating Point Exceptions have been reported + 117,633,598 cycles # 2.630 GHz + 253,582,281 instructions # 2.16 insn per cycle + 0.045344869 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8196) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127459e-06 +Avg ME (F77/C++) = 8.1274563450143301E-006 +Relative difference = 3.266686019634872e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP 
precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.533249e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.540083e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.540083e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.038782 sec INFO: No Floating Point Exceptions have been reported - 105,559,399 cycles:u # 3.200 GHz (74.06%) - 71,240 stalled-cycles-frontend:u # 0.07% frontend cycles idle (76.17%) - 11,419,080 stalled-cycles-backend:u # 10.82% backend cycles idle (75.79%) - 225,155,426 instructions:u # 2.13 insn per cycle - # 0.05 stalled cycles per insn (75.79%) - 0.036510289 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 8148) (512y: 0) (512z: 0) + 106,121,372 cycles # 2.518 GHz + 233,883,831 instructions # 2.20 insn per cycle + 0.042791740 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7292) (512y: 146) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274563450143301E-006 Relative difference = 3.266686019634872e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.148151e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.152898e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.152898e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 +TOTAL : 0.050273 sec +INFO: No Floating Point Exceptions have been reported + 95,562,086 cycles # 1.781 GHz + 134,760,547 instructions # 1.41 insn per cycle + 0.054201969 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1904) (512y: 
122) (512z: 6323) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127459e-06 +Avg ME (F77/C++) = 8.1274563450143301E-006 +Relative difference = 3.266686019634872e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt index c7f800787e..15f8e8659d 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt @@ -1,175 +1,218 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand 
HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2024-05-15_16:43:02 +DATE: 2024-05-16_15:18:21 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.476371e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.343902e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.771863e+08 ) sec^-1 -MeanMatrixElemValue = ( 4.239474e-01 +- 1.231432e-04 ) GeV^0 -TOTAL : 0.350571 sec +EvtsPerSec[Rmb+ME] (23) = ( 6.830621e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.798641e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.407520e+08 ) sec^-1 
+MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 +TOTAL : 0.518896 sec INFO: No Floating Point Exceptions have been reported - 786,304,797 cycles:u # 2.153 GHz (73.92%) - 2,116,401 stalled-cycles-frontend:u # 0.27% frontend cycles idle (75.89%) - 5,713,268 stalled-cycles-backend:u # 0.73% backend cycles idle (75.95%) - 1,300,001,206 instructions:u # 1.65 insn per cycle - # 0.00 stalled cycles per insn (76.12%) - 0.405219761 seconds time elapsed + 2,130,015,467 cycles # 2.824 GHz + 3,049,782,764 instructions # 1.43 insn per cycle + 0.811167083 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 132 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 4.232897e-01 -Avg ME (F77/GPU) = 0.42328961386341935 -Relative difference = 2.0349321196791385e-07 +Avg ME (F77/GPU) = 0.42328961386341946 +Relative difference = 2.034932117056294e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops 
fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.173463e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.328976e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.328976e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.241994e-01 +- 1.228720e-04 ) GeV^0 -TOTAL : 1.083150 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.652167e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.115593e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.115593e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 +TOTAL : 1.200987 sec INFO: No Floating Point Exceptions have been reported - 3,499,775,483 cycles:u # 3.190 GHz (74.72%) - 8,159,223 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.54%) - 9,448,324 stalled-cycles-backend:u # 0.27% backend cycles idle (74.88%) - 8,541,535,095 instructions:u # 2.44 insn per cycle - # 0.00 stalled cycles per insn (75.21%) - 1.100930856 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 422) (avx2: 0) (512y: 0) (512z: 0) + 3,451,141,340 cycles # 2.863 GHz + 8,714,346,508 instructions # 2.53 insn per cycle + 1.206502072 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 458) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.232897e-01 Avg ME (F77/C++) = 0.42328961386341951 Relative difference = 2.0349321157448718e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.166111e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.820275e+06 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 2.820275e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.241994e-01 +- 1.228720e-04 ) GeV^0 -TOTAL : 0.673909 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.615216e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.136998e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.136998e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 +TOTAL : 0.764589 sec INFO: No Floating Point Exceptions have been reported - 2,076,471,270 cycles:u # 3.022 GHz (74.43%) - 8,975,736 stalled-cycles-frontend:u # 0.43% frontend cycles idle (74.43%) - 13,351,309 stalled-cycles-backend:u # 0.64% backend cycles idle (74.99%) - 5,324,487,259 instructions:u # 2.56 insn per cycle - # 0.00 stalled cycles per insn (75.56%) - 0.690647397 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1199) (avx2: 0) (512y: 0) (512z: 0) + 2,197,801,743 cycles # 2.856 GHz + 5,465,338,789 instructions # 2.49 insn per cycle + 0.770190206 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1298) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.232897e-01 Avg ME (F77/C++) = 0.42328961386341951 Relative difference = 2.0349321157448718e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 3.504732e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.208368e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] 
(3a) = ( 5.208368e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.241994e-01 +- 1.228720e-04 ) GeV^0 -TOTAL : 0.488940 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.276018e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.408168e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.408168e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 +TOTAL : 0.576218 sec INFO: No Floating Point Exceptions have been reported - 1,428,607,215 cycles:u # 2.842 GHz (73.80%) - 8,481,300 stalled-cycles-frontend:u # 0.59% frontend cycles idle (73.79%) - 8,503,092 stalled-cycles-backend:u # 0.60% backend cycles idle (74.67%) - 3,090,620,907 instructions:u # 2.16 insn per cycle - # 0.00 stalled cycles per insn (75.47%) - 0.506051388 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1429) (512y: 0) (512z: 0) + 1,593,709,911 cycles # 2.743 GHz + 3,182,241,147 instructions # 2.00 insn per cycle + 0.581747530 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1459) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.232897e-01 Avg ME (F77/C++) = 0.42328961386341946 Relative difference = 2.034932117056294e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.349428e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.560869e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.560869e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 +TOTAL : 0.561533 sec +INFO: No Floating Point Exceptions have been reported + 1,552,006,209 cycles # 2.741 GHz + 3,083,871,547 instructions # 1.99 insn per cycle + 0.567100846 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1274) (512y: 95) (512z: 0) 
------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 4.232897e-01 +Avg ME (F77/C++) = 0.42328961386341946 +Relative difference = 2.034932117056294e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.103380e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.012957e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.012957e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.240339e-01 +- 
1.231178e-04 ) GeV^0 +TOTAL : 0.614313 sec +INFO: No Floating Point Exceptions have been reported + 1,344,567,311 cycles # 2.171 GHz + 2,376,857,450 instructions # 1.77 insn per cycle + 0.619905839 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 584) (512y: 62) (512z: 953) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 4.232897e-01 +Avg ME (F77/C++) = 0.42328961386341946 +Relative difference = 2.034932117056294e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt index ac566b37a5..6add239f16 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt @@ -1,175 +1,218 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for 
tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2024-05-15_16:43:09 +DATE: 2024-05-16_15:18:33 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.890074e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.965131e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.585286e+08 ) sec^-1 -MeanMatrixElemValue = ( 4.239474e-01 +- 1.231432e-04 ) GeV^0 -TOTAL : 0.337549 sec +EvtsPerSec[Rmb+ME] (23) = ( 6.948407e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.328423e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.761410e+08 ) sec^-1 
+MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 +TOTAL : 0.519601 sec INFO: No Floating Point Exceptions have been reported - 761,489,259 cycles:u # 2.100 GHz (74.96%) - 2,205,568 stalled-cycles-frontend:u # 0.29% frontend cycles idle (74.85%) - 6,065,189 stalled-cycles-backend:u # 0.80% backend cycles idle (74.84%) - 1,348,029,695 instructions:u # 1.77 insn per cycle - # 0.00 stalled cycles per insn (75.93%) - 0.389753644 seconds time elapsed + 2,123,926,879 cycles # 2.815 GHz + 2,991,717,095 instructions # 1.41 insn per cycle + 0.811782941 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 124 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 4.232897e-01 -Avg ME (F77/GPU) = 0.42328961386341935 -Relative difference = 2.0349321196791385e-07 +Avg ME (F77/GPU) = 0.42328961386341946 +Relative difference = 2.034932117056294e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops 
fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.182416e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.338716e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.338716e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.241994e-01 +- 1.228720e-04 ) GeV^0 -TOTAL : 1.075977 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.686449e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.122021e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.122021e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 +TOTAL : 1.196252 sec INFO: No Floating Point Exceptions have been reported - 3,463,923,038 cycles:u # 3.180 GHz (74.92%) - 8,991,710 stalled-cycles-frontend:u # 0.26% frontend cycles idle (75.03%) - 17,487,568 stalled-cycles-backend:u # 0.50% backend cycles idle (75.03%) - 8,553,903,431 instructions:u # 2.47 insn per cycle - # 0.00 stalled cycles per insn (75.03%) - 1.094262453 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 356) (avx2: 0) (512y: 0) (512z: 0) + 3,435,810,217 cycles # 2.862 GHz + 8,629,255,980 instructions # 2.51 insn per cycle + 1.201785163 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 403) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.232897e-01 Avg ME (F77/C++) = 0.42328961386341951 Relative difference = 2.0349321157448718e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.162784e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.819432e+06 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 2.819432e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.241994e-01 +- 1.228720e-04 ) GeV^0 -TOTAL : 0.673987 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.590372e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.090308e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.090308e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 +TOTAL : 0.773787 sec INFO: No Floating Point Exceptions have been reported - 2,083,329,582 cycles:u # 3.034 GHz (74.44%) - 9,077,629 stalled-cycles-frontend:u # 0.44% frontend cycles idle (74.40%) - 18,145,043 stalled-cycles-backend:u # 0.87% backend cycles idle (74.94%) - 5,268,672,147 instructions:u # 2.53 insn per cycle - # 0.00 stalled cycles per insn (75.52%) - 0.690232746 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1165) (avx2: 0) (512y: 0) (512z: 0) + 2,172,281,754 cycles # 2.790 GHz + 5,399,686,889 instructions # 2.49 insn per cycle + 0.779398624 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1258) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.232897e-01 Avg ME (F77/C++) = 0.42328961386341951 Relative difference = 2.0349321157448718e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 3.496308e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.186579e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] 
(3a) = ( 5.186579e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.241994e-01 +- 1.228720e-04 ) GeV^0 -TOTAL : 0.488787 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.283822e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.420214e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.420214e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 +TOTAL : 0.573489 sec INFO: No Floating Point Exceptions have been reported - 1,401,698,846 cycles:u # 2.796 GHz (74.69%) - 8,215,754 stalled-cycles-frontend:u # 0.59% frontend cycles idle (74.47%) - 16,938,722 stalled-cycles-backend:u # 1.21% backend cycles idle (74.52%) - 3,139,297,780 instructions:u # 2.24 insn per cycle - # 0.01 stalled cycles per insn (74.52%) - 0.504560606 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1373) (512y: 0) (512z: 0) + 1,585,769,603 cycles # 2.741 GHz + 3,149,146,191 instructions # 1.99 insn per cycle + 0.579182812 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1386) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.232897e-01 Avg ME (F77/C++) = 0.42328961386341946 Relative difference = 2.034932117056294e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.354137e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.604902e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.604902e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 +TOTAL : 0.559958 sec +INFO: No Floating Point Exceptions have been reported + 1,547,131,577 cycles # 2.739 GHz + 3,062,437,995 instructions # 1.98 insn per cycle + 0.565482274 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1220) (512y: 95) (512z: 0) 
------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 4.232897e-01 +Avg ME (F77/C++) = 0.42328961386341946 +Relative difference = 2.034932117056294e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.108481e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.023241e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.023241e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.240339e-01 +- 
1.231178e-04 ) GeV^0 +TOTAL : 0.612188 sec +INFO: No Floating Point Exceptions have been reported + 1,354,565,413 cycles # 2.195 GHz + 2,362,076,089 instructions # 1.74 insn per cycle + 0.617754113 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 557) (512y: 62) (512z: 944) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 4.232897e-01 +Avg ME (F77/C++) = 0.42328961386341946 +Relative difference = 2.034932117056294e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt index 03a0938477..35b822f8f6 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt @@ -1,175 +1,218 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for 
tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2024-05-15_16:43:16 +DATE: 2024-05-16_15:18:45 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.366550e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.323311e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.728963e+09 ) sec^-1 -MeanMatrixElemValue = ( 4.242352e-01 +- 1.228752e-04 ) GeV^0 -TOTAL : 0.326960 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.370205e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.202282e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.219119e+09 ) sec^-1 
+MeanMatrixElemValue = ( 4.240325e-01 +- 1.231174e-04 ) GeV^0 +TOTAL : 0.481970 sec INFO: No Floating Point Exceptions have been reported - 764,972,013 cycles:u # 2.296 GHz (73.85%) - 2,176,685 stalled-cycles-frontend:u # 0.28% frontend cycles idle (74.57%) - 5,473,175 stalled-cycles-backend:u # 0.72% backend cycles idle (74.11%) - 1,374,260,207 instructions:u # 1.80 insn per cycle - # 0.00 stalled cycles per insn (74.56%) - 0.375723598 seconds time elapsed + 1,992,725,828 cycles # 2.818 GHz + 2,868,294,521 instructions # 1.44 insn per cycle + 0.764321619 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 72 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/GPU) = 4.232895e-01 -Avg ME (F77/GPU) = 0.42328966126660816 -Relative difference = 3.80984192091939e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 4.232893e-01 +Avg ME (F77/GPU) = 0.42328959883889183 +Relative difference = 7.059920764700599e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT 
(NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.312229e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.504727e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.504727e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.241992e-01 +- 1.228718e-04 ) GeV^0 -TOTAL : 0.947967 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.685625e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.126627e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.126627e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.240336e-01 +- 1.231178e-04 ) GeV^0 +TOTAL : 1.173857 sec INFO: No Floating Point Exceptions have been reported - 3,116,480,134 cycles:u # 3.260 GHz (75.07%) - 6,210,591 stalled-cycles-frontend:u # 0.20% frontend cycles idle (74.91%) - 7,102,976 stalled-cycles-backend:u # 0.23% backend cycles idle (74.90%) - 8,524,848,265 instructions:u # 2.74 insn per cycle - # 0.00 stalled cycles per insn (74.89%) - 0.959379324 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 516) (avx2: 0) (512y: 0) (512z: 0) + 3,371,653,633 cycles # 2.862 GHz + 8,663,374,999 instructions # 2.57 insn per cycle + 1.179087797 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 464) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.232896e-01 Avg ME (F77/C++) = 0.42328961598104797 Relative difference = 3.775440734888737e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 3.375040e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.102188e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] 
(3a) = ( 5.102188e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.241992e-01 +- 1.228718e-04 ) GeV^0 -TOTAL : 0.460604 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.242831e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.476100e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.476100e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.240336e-01 +- 1.231178e-04 ) GeV^0 +TOTAL : 0.559869 sec INFO: No Floating Point Exceptions have been reported - 1,394,080,880 cycles:u # 2.969 GHz (74.96%) - 7,003,149 stalled-cycles-frontend:u # 0.50% frontend cycles idle (74.53%) - 6,087,454 stalled-cycles-backend:u # 0.44% backend cycles idle (74.45%) - 3,710,982,581 instructions:u # 2.66 insn per cycle - # 0.00 stalled cycles per insn (74.50%) - 0.473592012 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1393) (avx2: 0) (512y: 0) (512z: 0) + 1,544,628,517 cycles # 2.742 GHz + 3,687,558,281 instructions # 2.39 insn per cycle + 0.565253973 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1472) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.232896e-01 -Avg ME (F77/C++) = 0.42328960620216094 -Relative difference = 1.4652287586288606e-08 +Avg ME (F77/C++) = 0.42328960439772345 +Relative difference = 1.0389396439618597e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 4.664733e+06 
) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.284601e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.284601e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.241992e-01 +- 1.228719e-04 ) GeV^0 -TOTAL : 0.373733 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.072720e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.536969e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.536969e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.240336e-01 +- 1.231178e-04 ) GeV^0 +TOTAL : 0.431765 sec +INFO: No Floating Point Exceptions have been reported + 1,203,780,059 cycles # 2.758 GHz + 2,425,738,448 instructions # 2.02 insn per cycle + 0.436956710 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1835) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 4.232896e-01 +Avg ME (F77/C++) = 0.42328956670826301 +Relative difference = 7.865002347873079e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.171115e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.846212e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.846212e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.240336e-01 +- 1.231178e-04 ) GeV^0 +TOTAL : 0.420853 sec INFO: No Floating Point Exceptions have been reported - 1,096,363,607 cycles:u # 2.870 GHz (75.63%) - 6,601,459 stalled-cycles-frontend:u # 0.60% frontend cycles idle (75.08%) - 22,320,721 stalled-cycles-backend:u # 2.04% backend cycles idle (74.88%) - 2,334,473,718 instructions:u # 2.13 insn per cycle - # 0.01 stalled cycles per insn (74.88%) - 0.386229589 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1807) (512y: 0) (512z: 0) + 1,176,016,394 cycles # 2.764 GHz + 2,371,904,468 instructions # 2.02 insn per cycle + 0.426173333 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1716) (512y: 2) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.232896e-01 -Avg ME (F77/C++) = 0.42328956839628518 -Relative difference = 7.466215756732981e-08 +Avg ME (F77/C++) = 0.42328956670826301 +Relative difference = 7.865002347873079e-08 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.877260e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.908000e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.908000e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.240336e-01 +- 1.231178e-04 ) GeV^0 +TOTAL : 0.456855 sec +INFO: No Floating Point Exceptions have been reported + 1,057,659,631 cycles # 2.291 GHz + 2,045,594,279 instructions # 1.93 insn per cycle + 0.462305299 seconds time elapsed 
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1125) (512y: 5) (512z: 1216) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 4.232896e-01 +Avg ME (F77/C++) = 0.42328957567224279 +Relative difference = 5.7473080363015266e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt index b13effeb6c..7aff49b16c 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt @@ -1,175 +1,218 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' 
-HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2024-05-15_16:43:23 +DATE: 2024-05-16_15:18:56 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.323825e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.319656e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.719868e+09 ) sec^-1 -MeanMatrixElemValue = ( 4.242352e-01 +- 1.228752e-04 ) GeV^0 -TOTAL : 0.308823 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.371360e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.210950e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.256375e+09 ) sec^-1 
+MeanMatrixElemValue = ( 4.240325e-01 +- 1.231174e-04 ) GeV^0 +TOTAL : 0.480672 sec INFO: No Floating Point Exceptions have been reported - 704,306,697 cycles:u # 2.128 GHz (75.02%) - 2,137,974 stalled-cycles-frontend:u # 0.30% frontend cycles idle (75.50%) - 5,624,335 stalled-cycles-backend:u # 0.80% backend cycles idle (75.88%) - 1,274,415,055 instructions:u # 1.81 insn per cycle - # 0.00 stalled cycles per insn (76.11%) - 0.359510282 seconds time elapsed + 1,992,055,315 cycles # 2.814 GHz + 2,833,598,547 instructions # 1.42 insn per cycle + 0.764848194 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 71 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/GPU) = 4.232895e-01 -Avg ME (F77/GPU) = 0.42328966126660816 -Relative difference = 3.80984192091939e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 4.232893e-01 +Avg ME (F77/GPU) = 0.42328960436861962 +Relative difference = 7.190557844040413e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT 
(NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.317115e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.514597e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.514597e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.241992e-01 +- 1.228718e-04 ) GeV^0 -TOTAL : 0.944202 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.763702e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.137508e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.137508e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.240336e-01 +- 1.231178e-04 ) GeV^0 +TOTAL : 1.163446 sec INFO: No Floating Point Exceptions have been reported - 3,067,315,266 cycles:u # 3.220 GHz (74.81%) - 7,060,895 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.80%) - 8,136,724 stalled-cycles-backend:u # 0.27% backend cycles idle (74.83%) - 8,530,487,090 instructions:u # 2.78 insn per cycle - # 0.00 stalled cycles per insn (74.92%) - 0.955712297 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 379) (avx2: 0) (512y: 0) (512z: 0) + 3,338,476,373 cycles # 2.858 GHz + 8,537,550,948 instructions # 2.56 insn per cycle + 1.168736395 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 372) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.232896e-01 Avg ME (F77/C++) = 0.42328961598104797 Relative difference = 3.775440734888737e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 3.371365e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.089598e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] 
(3a) = ( 5.089598e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.241992e-01 +- 1.228718e-04 ) GeV^0 -TOTAL : 0.459025 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.260122e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.497908e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.497908e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.240336e-01 +- 1.231178e-04 ) GeV^0 +TOTAL : 0.555036 sec INFO: No Floating Point Exceptions have been reported - 1,408,227,078 cycles:u # 3.016 GHz (74.30%) - 6,825,925 stalled-cycles-frontend:u # 0.48% frontend cycles idle (74.36%) - 9,607,425 stalled-cycles-backend:u # 0.68% backend cycles idle (74.79%) - 3,619,594,432 instructions:u # 2.57 insn per cycle - # 0.00 stalled cycles per insn (75.58%) - 0.471202314 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1365) (avx2: 0) (512y: 0) (512z: 0) + 1,536,047,057 cycles # 2.745 GHz + 3,655,155,421 instructions # 2.38 insn per cycle + 0.560267212 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1417) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.232896e-01 -Avg ME (F77/C++) = 0.42328960620216094 -Relative difference = 1.4652287586288606e-08 +Avg ME (F77/C++) = 0.42328960439772345 +Relative difference = 1.0389396439618597e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 4.694915e+06 
) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.325213e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.325213e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.241992e-01 +- 1.228719e-04 ) GeV^0 -TOTAL : 0.370157 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.063874e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.501699e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.501699e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.240336e-01 +- 1.231178e-04 ) GeV^0 +TOTAL : 0.432903 sec +INFO: No Floating Point Exceptions have been reported + 1,210,141,290 cycles # 2.765 GHz + 2,409,755,736 instructions # 1.99 insn per cycle + 0.438252635 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1739) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 4.232896e-01 +Avg ME (F77/C++) = 0.42328956670826301 +Relative difference = 7.865002347873079e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = 
CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.166764e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.861571e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.861571e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.240336e-01 +- 1.231178e-04 ) GeV^0 +TOTAL : 0.420903 sec INFO: No Floating Point Exceptions have been reported - 1,082,261,478 cycles:u # 2.863 GHz (74.64%) - 7,067,276 stalled-cycles-frontend:u # 0.65% frontend cycles idle (74.61%) - 13,677,966 stalled-cycles-backend:u # 1.26% backend cycles idle (74.60%) - 2,386,959,225 instructions:u # 2.21 insn per cycle - # 0.01 stalled cycles per insn (74.67%) - 0.381523204 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1722) (512y: 0) (512z: 0) + 1,178,969,939 cycles # 2.770 GHz + 2,360,225,770 instructions # 2.00 insn per cycle + 0.426183474 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1639) (512y: 2) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.232896e-01 -Avg ME (F77/C++) = 0.42328956839628518 -Relative difference = 7.466215756732981e-08 +Avg ME (F77/C++) = 0.42328956670826301 +Relative difference = 7.865002347873079e-08 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.911284e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.009343e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.009343e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.240336e-01 +- 1.231178e-04 ) GeV^0 +TOTAL : 0.450059 sec +INFO: No Floating Point Exceptions have been reported + 1,050,992,336 cycles # 2.312 GHz + 2,030,439,704 instructions # 1.93 insn per cycle + 0.455402836 seconds time elapsed 
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1038) (512y: 5) (512z: 1206) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 4.232896e-01 +Avg ME (F77/C++) = 0.42328957567224279 +Relative difference = 5.7473080363015266e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt index 42c2c5fccc..abe970d6c3 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt @@ -1,175 +1,218 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' 
-HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2024-05-15_16:43:29 +DATE: 2024-05-16_15:19:08 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.404899e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.300485e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.791389e+08 ) sec^-1 -MeanMatrixElemValue = ( 4.239474e-01 +- 1.231432e-04 ) GeV^0 -TOTAL : 0.376624 sec +EvtsPerSec[Rmb+ME] (23) = ( 6.820532e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.774843e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.362520e+08 ) sec^-1 
+MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 +TOTAL : 0.522135 sec INFO: No Floating Point Exceptions have been reported - 801,858,601 cycles:u # 2.176 GHz (71.98%) - 2,301,473 stalled-cycles-frontend:u # 0.29% frontend cycles idle (73.67%) - 5,597,215 stalled-cycles-backend:u # 0.70% backend cycles idle (74.62%) - 1,342,545,263 instructions:u # 1.67 insn per cycle - # 0.00 stalled cycles per insn (76.05%) - 0.432204588 seconds time elapsed + 2,125,526,304 cycles # 2.816 GHz + 3,031,609,259 instructions # 1.43 insn per cycle + 0.813775431 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 132 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 4.232897e-01 -Avg ME (F77/GPU) = 0.42328961420809230 -Relative difference = 2.026789399531628e-07 +Avg ME (F77/GPU) = 0.42328961420809225 +Relative difference = 2.02678940084305e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops 
fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.159586e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.310118e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.310118e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.241994e-01 +- 1.228720e-04 ) GeV^0 -TOTAL : 1.093976 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.477506e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.093135e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.093135e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 +TOTAL : 1.221347 sec INFO: No Floating Point Exceptions have been reported - 3,499,257,249 cycles:u # 3.163 GHz (74.77%) - 8,141,037 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.72%) - 13,622,727 stalled-cycles-backend:u # 0.39% backend cycles idle (74.72%) - 8,671,149,251 instructions:u # 2.48 insn per cycle - # 0.00 stalled cycles per insn (74.86%) - 1.110653138 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 427) (avx2: 0) (512y: 0) (512z: 0) + 3,505,104,547 cycles # 2.859 GHz + 8,781,502,817 instructions # 2.51 insn per cycle + 1.226777715 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 466) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.232897e-01 Avg ME (F77/C++) = 0.42328962565639783 Relative difference = 1.7563291089600324e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.188052e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.865997e+06 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 2.865997e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.241994e-01 +- 1.228720e-04 ) GeV^0 -TOTAL : 0.668768 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.650256e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.201424e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.201424e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 +TOTAL : 0.750812 sec INFO: No Floating Point Exceptions have been reported - 2,047,248,570 cycles:u # 3.006 GHz (74.29%) - 8,783,902 stalled-cycles-frontend:u # 0.43% frontend cycles idle (74.69%) - 8,006,425 stalled-cycles-backend:u # 0.39% backend cycles idle (75.28%) - 5,269,378,246 instructions:u # 2.57 insn per cycle - # 0.00 stalled cycles per insn (75.33%) - 0.685132964 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1260) (avx2: 0) (512y: 0) (512z: 0) + 2,158,593,065 cycles # 2.858 GHz + 5,461,970,761 instructions # 2.53 insn per cycle + 0.756427517 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1315) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.232897e-01 Avg ME (F77/C++) = 0.42328962565639783 Relative difference = 1.7563291089600324e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 3.575206e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.372803e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] 
(3a) = ( 5.372803e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.241994e-01 +- 1.228720e-04 ) GeV^0 -TOTAL : 0.482122 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.173052e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.222124e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.222124e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 +TOTAL : 0.600946 sec +INFO: No Floating Point Exceptions have been reported + 1,584,857,703 cycles # 2.630 GHz + 3,130,453,718 instructions # 1.98 insn per cycle + 0.606559761 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1508) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 4.232897e-01 +Avg ME (F77/C++) = 0.42328962604218012 +Relative difference = 1.747215201983364e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = 
VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.444228e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.788523e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.788523e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 +TOTAL : 0.544040 sec INFO: No Floating Point Exceptions have been reported - 1,408,748,553 cycles:u # 2.850 GHz (73.78%) - 8,148,579 stalled-cycles-frontend:u # 0.58% frontend cycles idle (74.12%) - 9,562,412 stalled-cycles-backend:u # 0.68% backend cycles idle (74.85%) - 3,008,594,405 instructions:u # 2.14 insn per cycle - # 0.00 stalled cycles per insn (75.66%) - 0.498542187 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1494) (512y: 0) (512z: 0) + 1,507,653,377 cycles # 2.746 GHz + 2,979,978,086 instructions # 1.98 insn per cycle + 0.549733637 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1266) (512y: 104) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.232897e-01 -Avg ME (F77/C++) = 0.42328962559055894 -Relative difference = 1.757884518645067e-07 +Avg ME (F77/C++) = 0.42328962604218012 +Relative difference = 1.747215201983364e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.159766e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.131056e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.131056e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 +TOTAL : 0.601738 sec +INFO: No Floating Point Exceptions have been reported + 1,324,343,740 cycles # 2.183 GHz + 2,317,585,809 instructions # 1.75 insn per cycle + 0.607328338 seconds time elapsed 
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 708) (512y: 64) (512z: 1000) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 4.232897e-01 +Avg ME (F77/C++) = 0.42328962604218012 +Relative difference = 1.747215201983364e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt index 420f0fe548..91c7a883f0 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt @@ -1,175 +1,218 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' 
-HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2024-05-15_16:43:36 +DATE: 2024-05-16_15:19:20 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.139991e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.972399e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.594504e+08 ) sec^-1 -MeanMatrixElemValue = ( 4.239474e-01 +- 1.231432e-04 ) GeV^0 -TOTAL : 0.333772 sec +EvtsPerSec[Rmb+ME] (23) = ( 6.922874e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.310136e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.745093e+08 ) sec^-1 
+MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 +TOTAL : 0.518290 sec INFO: No Floating Point Exceptions have been reported - 753,695,407 cycles:u # 2.103 GHz (74.79%) - 2,155,969 stalled-cycles-frontend:u # 0.29% frontend cycles idle (75.40%) - 5,408,780 stalled-cycles-backend:u # 0.72% backend cycles idle (75.66%) - 1,329,703,636 instructions:u # 1.76 insn per cycle - # 0.00 stalled cycles per insn (75.09%) - 0.387999054 seconds time elapsed + 2,124,893,311 cycles # 2.820 GHz + 3,045,592,907 instructions # 1.43 insn per cycle + 0.810370808 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 124 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 4.232897e-01 -Avg ME (F77/GPU) = 0.42328961420809230 -Relative difference = 2.026789399531628e-07 +Avg ME (F77/GPU) = 0.42328961420809225 +Relative difference = 2.02678940084305e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops 
fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 1.167517e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.320669e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.320669e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.241994e-01 +- 1.228720e-04 ) GeV^0 -TOTAL : 1.085247 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.542081e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.100861e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.100861e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 +TOTAL : 1.212162 sec INFO: No Floating Point Exceptions have been reported - 3,495,870,402 cycles:u # 3.185 GHz (74.54%) - 8,419,320 stalled-cycles-frontend:u # 0.24% frontend cycles idle (74.84%) - 13,814,541 stalled-cycles-backend:u # 0.40% backend cycles idle (75.20%) - 8,553,626,945 instructions:u # 2.45 insn per cycle - # 0.00 stalled cycles per insn (75.22%) - 1.101702386 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 358) (avx2: 0) (512y: 0) (512z: 0) + 3,479,876,909 cycles # 2.860 GHz + 8,693,142,752 instructions # 2.50 insn per cycle + 1.217788949 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 408) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.232897e-01 Avg ME (F77/C++) = 0.42328962565639783 Relative difference = 1.7563291089600324e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.208789e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.872230e+06 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 2.872230e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.241994e-01 +- 1.228720e-04 ) GeV^0 -TOTAL : 0.662705 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.583309e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.076893e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.076893e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 +TOTAL : 0.776846 sec INFO: No Floating Point Exceptions have been reported - 1,998,727,385 cycles:u # 2.962 GHz (75.01%) - 8,096,924 stalled-cycles-frontend:u # 0.41% frontend cycles idle (75.11%) - 15,605,174 stalled-cycles-backend:u # 0.78% backend cycles idle (75.10%) - 5,333,491,692 instructions:u # 2.67 insn per cycle - # 0.00 stalled cycles per insn (75.10%) - 0.678888304 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1221) (avx2: 0) (512y: 0) (512z: 0) + 2,167,338,088 cycles # 2.773 GHz + 5,396,551,029 instructions # 2.49 insn per cycle + 0.782321373 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1286) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.232897e-01 Avg ME (F77/C++) = 0.42328962565639783 Relative difference = 1.7563291089600324e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 3.583996e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.397080e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] 
(3a) = ( 5.397080e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.241994e-01 +- 1.228720e-04 ) GeV^0 -TOTAL : 0.480459 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.326845e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.550286e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.550286e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 +TOTAL : 0.565802 sec +INFO: No Floating Point Exceptions have been reported + 1,565,712,129 cycles # 2.743 GHz + 3,096,211,416 instructions # 1.98 insn per cycle + 0.571442008 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1403) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 4.232897e-01 +Avg ME (F77/C++) = 0.42328962604218012 +Relative difference = 1.747215201983364e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = 
VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.453432e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.812851e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.812851e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 +TOTAL : 0.541762 sec INFO: No Floating Point Exceptions have been reported - 1,384,513,109 cycles:u # 2.811 GHz (74.26%) - 8,017,444 stalled-cycles-frontend:u # 0.58% frontend cycles idle (75.11%) - 16,159,141 stalled-cycles-backend:u # 1.17% backend cycles idle (75.64%) - 3,004,246,317 instructions:u # 2.17 insn per cycle - # 0.01 stalled cycles per insn (75.64%) - 0.496541853 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1430) (512y: 0) (512z: 0) + 1,501,240,710 cycles # 2.746 GHz + 2,962,583,104 instructions # 1.97 insn per cycle + 0.547343450 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1207) (512y: 104) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.232897e-01 -Avg ME (F77/C++) = 0.42328962559055894 -Relative difference = 1.757884518645067e-07 +Avg ME (F77/C++) = 0.42328962604218012 +Relative difference = 1.747215201983364e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.179755e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.168512e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.168512e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 +TOTAL : 0.595795 sec +INFO: No Floating Point Exceptions have been reported + 1,328,066,698 cycles # 2.210 GHz + 2,301,968,914 instructions # 1.73 insn per cycle + 0.601517736 seconds time elapsed 
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 669) (512y: 64) (512z: 987) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 4.232897e-01 +Avg ME (F77/C++) = 0.42328962604218012 +Relative difference = 1.747215201983364e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt index 302ed78a64..685cbca5b9 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt @@ -1,175 +1,218 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand 
+HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-15_16:41:45 +DATE: 2024-05-16_15:16:05 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.832898e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.951894e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.006384e+07 ) sec^-1 -MeanMatrixElemValue = ( 3.295059e+00 +- 3.224567e-03 ) GeV^0 -TOTAL : 0.382081 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.742150e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.168430e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.277843e+08 ) sec^-1 +MeanMatrixElemValue = ( 
3.294877e+00 +- 3.227953e-03 ) GeV^0 +TOTAL : 0.532609 sec INFO: No Floating Point Exceptions have been reported - 877,304,933 cycles:u # 2.198 GHz (74.42%) - 2,115,937 stalled-cycles-frontend:u # 0.24% frontend cycles idle (75.86%) - 5,183,719 stalled-cycles-backend:u # 0.59% backend cycles idle (76.00%) - 1,377,648,981 instructions:u # 1.57 insn per cycle - # 0.00 stalled cycles per insn (75.10%) - 0.435827075 seconds time elapsed + 2,187,320,510 cycles # 2.847 GHz + 3,138,661,758 instructions # 1.43 insn per cycle + 0.825533767 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 3.234080e+00 -Avg ME (F77/GPU) = 3.2340795799595181 -Relative difference = 1.298794346312088e-07 +Avg ME (F77/GPU) = 3.2340795799595186 +Relative difference = 1.2987943449389332e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': 
~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.524994e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.593507e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.593507e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.287639e+00 +- 3.218203e-03 ) GeV^0 -TOTAL : 4.336531 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.052254e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.112326e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.112326e+05 ) sec^-1 +MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 +TOTAL : 5.217611 sec INFO: No Floating Point Exceptions have been reported - 14,730,042,192 cycles:u # 3.388 GHz (74.98%) - 9,568,439 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.98%) - 1,140,928,798 stalled-cycles-backend:u # 7.75% backend cycles idle (74.99%) - 38,744,640,497 instructions:u # 2.63 insn per cycle - # 0.03 stalled cycles per insn (74.98%) - 4.352523083 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 726) (avx2: 0) (512y: 0) (512z: 0) + 15,171,088,318 cycles # 2.905 GHz + 38,379,828,637 instructions # 2.53 insn per cycle + 5.223033411 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 673) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 3.234080e+00 Avg ME (F77/C++) = 3.2340795799593964 Relative difference = 1.2987947225564713e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 4.312999e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.524318e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.524318e+05 ) 
sec^-1 -MeanMatrixElemValue = ( 3.287639e+00 +- 3.218203e-03 ) GeV^0 -TOTAL : 2.612223 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.483453e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.675957e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.675957e+05 ) sec^-1 +MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 +TOTAL : 3.119586 sec INFO: No Floating Point Exceptions have been reported - 8,782,783,014 cycles:u # 3.347 GHz (75.03%) - 9,188,407 stalled-cycles-frontend:u # 0.10% frontend cycles idle (75.00%) - 1,802,679,075 stalled-cycles-backend:u # 20.53% backend cycles idle (75.00%) - 24,413,813,857 instructions:u # 2.78 insn per cycle - # 0.07 stalled cycles per insn (75.00%) - 2.628225339 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) + 9,050,575,942 cycles # 2.897 GHz + 24,585,418,505 instructions # 2.72 insn per cycle + 3.125051862 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340795799593964 -Relative difference = 1.2987947225564713e-07 +Avg ME (F77/C++) = 3.2340795799593955 +Relative difference = 1.2987947253027805e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 7.576771e+05 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 8.160866e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.160866e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.287639e+00 +- 3.218203e-03 ) GeV^0 -TOTAL : 1.564392 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.531605e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.007383e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.007383e+05 ) sec^-1 +MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 +TOTAL : 2.004395 sec +INFO: No Floating Point Exceptions have been reported + 5,470,487,475 cycles # 2.723 GHz + 11,258,117,341 instructions # 2.06 insn per cycle + 2.009874159 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2379) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 3.234080e+00 +Avg ME (F77/C++) = 3.2340795799594546 +Relative difference = 1.2987945426732077e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = 
DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.034312e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.611178e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.611178e+05 ) sec^-1 +MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 +TOTAL : 1.846817 sec INFO: No Floating Point Exceptions have been reported - 5,138,956,908 cycles:u # 3.261 GHz (74.76%) - 8,020,795 stalled-cycles-frontend:u # 0.16% frontend cycles idle (74.77%) - 32,368,163 stalled-cycles-backend:u # 0.63% backend cycles idle (75.01%) - 11,470,668,323 instructions:u # 2.23 insn per cycle - # 0.00 stalled cycles per insn (75.13%) - 1.580105792 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2399) (512y: 0) (512z: 0) + 4,937,000,755 cycles # 2.666 GHz + 10,562,656,233 instructions # 2.14 insn per cycle + 1.852346867 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2077) (512y: 144) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340795799594542 -Relative difference = 1.2987945440463624e-07 +Avg ME (F77/C++) = 3.2340795799594546 +Relative difference = 1.2987945426732077e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.686069e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.892849e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.892849e+05 ) sec^-1 +MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 +TOTAL : 2.955560 sec +INFO: No Floating Point Exceptions have been reported + 5,363,967,162 cycles # 1.812 GHz + 7,798,816,647 instructions # 1.45 insn per cycle + 2.961128813 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1545) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 3.234080e+00 +Avg ME (F77/C++) = 3.2340795799594546 +Relative difference = 1.2987945426732077e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt index 9a93ce363a..e33bd01ef0 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt @@ -1,175 +1,218 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand 
HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-15_16:41:58 +DATE: 2024-05-16_15:16:29 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.658767e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.925934e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.978977e+07 ) sec^-1 -MeanMatrixElemValue = ( 3.295059e+00 +- 3.224567e-03 ) GeV^0 -TOTAL : 0.393694 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.734270e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.167895e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.277771e+08 ) sec^-1 +MeanMatrixElemValue = ( 
3.294877e+00 +- 3.227953e-03 ) GeV^0 +TOTAL : 0.531030 sec INFO: No Floating Point Exceptions have been reported - 903,828,872 cycles:u # 2.285 GHz (73.39%) - 2,128,293 stalled-cycles-frontend:u # 0.24% frontend cycles idle (74.25%) - 5,468,106 stalled-cycles-backend:u # 0.60% backend cycles idle (75.65%) - 1,391,348,192 instructions:u # 1.54 insn per cycle - # 0.00 stalled cycles per insn (75.95%) - 0.447396930 seconds time elapsed + 2,147,766,041 cycles # 2.808 GHz + 3,081,960,346 instructions # 1.43 insn per cycle + 0.823573588 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 3.234080e+00 -Avg ME (F77/GPU) = 3.2340795799595181 -Relative difference = 1.298794346312088e-07 +Avg ME (F77/GPU) = 3.2340795799595186 +Relative difference = 1.2987943449389332e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': 
~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.426481e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.486692e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.486692e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.287639e+00 +- 3.218203e-03 ) GeV^0 -TOTAL : 4.500856 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.072347e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.133952e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.133952e+05 ) sec^-1 +MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 +TOTAL : 5.167480 sec INFO: No Floating Point Exceptions have been reported - 15,390,828,690 cycles:u # 3.411 GHz (75.02%) - 8,990,756 stalled-cycles-frontend:u # 0.06% frontend cycles idle (75.01%) - 23,490,713 stalled-cycles-backend:u # 0.15% backend cycles idle (75.01%) - 39,479,847,281 instructions:u # 2.57 insn per cycle - # 0.00 stalled cycles per insn (75.00%) - 4.516979997 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 597) (avx2: 0) (512y: 0) (512z: 0) + 15,011,121,904 cycles # 2.902 GHz + 40,101,107,795 instructions # 2.67 insn per cycle + 5.172969591 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340795799593969 -Relative difference = 1.2987947211833165e-07 +Avg ME (F77/C++) = 3.2340795799593964 +Relative difference = 1.2987947225564713e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 4.366565e+05 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 4.582371e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.582371e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.287639e+00 +- 3.218203e-03 ) GeV^0 -TOTAL : 2.580975 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.643871e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.853935e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.853935e+05 ) sec^-1 +MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 +TOTAL : 2.986462 sec INFO: No Floating Point Exceptions have been reported - 8,665,784,444 cycles:u # 3.343 GHz (75.04%) - 8,737,692 stalled-cycles-frontend:u # 0.10% frontend cycles idle (75.01%) - 1,175,684,003 stalled-cycles-backend:u # 13.57% backend cycles idle (75.00%) - 23,537,357,898 instructions:u # 2.72 insn per cycle - # 0.05 stalled cycles per insn (75.00%) - 2.596226506 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1947) (avx2: 0) (512y: 0) (512z: 0) + 8,687,902,361 cycles # 2.905 GHz + 23,671,582,038 instructions # 2.72 insn per cycle + 2.991891761 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2071) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340795799593964 -Relative difference = 1.2987947225564713e-07 +Avg ME (F77/C++) = 3.2340795799593955 +Relative difference = 1.2987947253027805e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 6.889868e+05 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 7.366562e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.366562e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.287639e+00 +- 3.218203e-03 ) GeV^0 -TOTAL : 1.702727 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.688647e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.031946e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.031946e+05 ) sec^-1 +MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 +TOTAL : 2.347118 sec +INFO: No Floating Point Exceptions have been reported + 6,408,205,490 cycles # 2.726 GHz + 13,061,009,362 instructions # 2.04 insn per cycle + 2.352705794 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2545) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 3.234080e+00 +Avg ME (F77/C++) = 3.2340795799594546 +Relative difference = 1.2987945426732077e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = 
DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.217515e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.639971e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.639971e+05 ) sec^-1 +MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 +TOTAL : 2.116902 sec INFO: No Floating Point Exceptions have been reported - 5,646,738,380 cycles:u # 3.294 GHz (74.83%) - 9,540,611 stalled-cycles-frontend:u # 0.17% frontend cycles idle (74.82%) - 298,360,320 stalled-cycles-backend:u # 5.28% backend cycles idle (74.80%) - 13,134,319,700 instructions:u # 2.33 insn per cycle - # 0.02 stalled cycles per insn (75.02%) - 1.718575828 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2559) (512y: 0) (512z: 0) + 5,786,103,959 cycles # 2.728 GHz + 12,322,398,791 instructions # 2.13 insn per cycle + 2.122365893 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2092) (512y: 294) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340795799594542 -Relative difference = 1.2987945440463624e-07 +Avg ME (F77/C++) = 3.2340795799594546 +Relative difference = 1.2987945426732077e-07 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.391355e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.565589e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.565589e+05 ) sec^-1 +MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 +TOTAL : 3.201193 sec +INFO: No Floating Point Exceptions have been reported + 5,819,258,849 cycles # 1.816 GHz + 9,603,315,511 instructions # 1.65 insn per cycle + 3.206783116 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1509) (512y: 209) (512z: 1970) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 3.234080e+00 +Avg ME (F77/C++) = 3.2340795799594546 +Relative difference = 1.2987945426732077e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt index ccb760b2ba..fa2404eda0 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt @@ -1,175 +1,218 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand 
HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-15_16:42:12 +DATE: 2024-05-16_15:16:53 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.668767e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.923134e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.083730e+08 ) sec^-1 -MeanMatrixElemValue = ( 3.286435e+00 +- 3.209475e-03 ) GeV^0 -TOTAL : 0.319988 sec +EvtsPerSec[Rmb+ME] (23) = ( 9.806467e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.679043e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.988694e+08 ) sec^-1 +MeanMatrixElemValue = ( 
3.294909e+00 +- 3.228140e-03 ) GeV^0 +TOTAL : 0.484472 sec INFO: No Floating Point Exceptions have been reported - 801,327,175 cycles:u # 2.329 GHz (73.65%) - 2,161,896 stalled-cycles-frontend:u # 0.27% frontend cycles idle (74.62%) - 4,181,501 stalled-cycles-backend:u # 0.52% backend cycles idle (73.95%) - 1,363,211,462 instructions:u # 1.70 insn per cycle - # 0.00 stalled cycles per insn (75.01%) - 0.374378383 seconds time elapsed + 2,024,107,607 cycles # 2.847 GHz + 2,925,717,340 instructions # 1.45 insn per cycle + 0.767822860 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/GPU) = 3.234089e+00 -Avg ME (F77/GPU) = 3.2340912986546755 -Relative difference = 7.107580142328097e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 3.234085e+00 +Avg ME (F77/GPU) = 3.2341253389604390 +Relative difference = 1.2473067479392238e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops 
fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.996525e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.087124e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.087124e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.287818e+00 +- 3.220110e-03 ) GeV^0 -TOTAL : 3.647819 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.190102e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.263149e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.263149e+05 ) sec^-1 +MeanMatrixElemValue = ( 3.294973e+00 +- 3.228584e-03 ) GeV^0 +TOTAL : 4.875075 sec INFO: No Floating Point Exceptions have been reported - 12,622,398,192 cycles:u # 3.452 GHz (74.87%) - 6,907,551 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.94%) - 764,688,379 stalled-cycles-backend:u # 6.06% backend cycles idle (75.05%) - 36,996,500,373 instructions:u # 2.93 insn per cycle - # 0.02 stalled cycles per insn (75.06%) - 3.661293566 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 607) (avx2: 0) (512y: 0) (512z: 0) + 14,157,231,167 cycles # 2.902 GHz + 38,349,372,496 instructions # 2.71 insn per cycle + 4.880360280 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 3.234094e+00 -Avg ME (F77/C++) = 3.2340939850546420 -Relative difference = 4.621188450363643e-09 +Avg ME (F77/C++) = 3.2340941932052374 +Relative difference = 5.974014286114415e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 6.339494e+05 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 6.774882e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.774882e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.287818e+00 +- 3.220111e-03 ) GeV^0 -TOTAL : 1.800475 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.893708e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.295163e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.295163e+05 ) sec^-1 +MeanMatrixElemValue = ( 3.294972e+00 +- 3.228583e-03 ) GeV^0 +TOTAL : 2.231375 sec INFO: No Floating Point Exceptions have been reported - 6,126,928,844 cycles:u # 3.387 GHz (74.79%) - 7,098,604 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.81%) - 2,006,509,813 stalled-cycles-backend:u # 32.75% backend cycles idle (74.93%) - 15,157,055,875 instructions:u # 2.47 insn per cycle - # 0.13 stalled cycles per insn (75.13%) - 1.812595313 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2462) (avx2: 0) (512y: 0) (512z: 0) + 6,474,839,888 cycles # 2.896 GHz + 15,821,273,128 instructions # 2.44 insn per cycle + 2.236825857 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2693) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 3.234094e+00 -Avg ME (F77/C++) = 3.2340941177681088 -Relative difference = 3.641455970126884e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 3.234093e+00 +Avg ME (F77/C++) = 3.2340934062376618 +Relative difference = 1.2561100182708985e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 
1.220743e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.377837e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.377837e+06 ) sec^-1 -MeanMatrixElemValue = ( 3.287796e+00 +- 3.219543e-03 ) GeV^0 -TOTAL : 1.004275 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.952001e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.027533e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.027533e+06 ) sec^-1 +MeanMatrixElemValue = ( 3.295004e+00 +- 3.229072e-03 ) GeV^0 +TOTAL : 1.258720 sec INFO: No Floating Point Exceptions have been reported - 3,313,286,850 cycles:u # 3.273 GHz (74.73%) - 7,508,941 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.71%) - 1,105,690,829 stalled-cycles-backend:u # 33.37% backend cycles idle (74.74%) - 7,720,635,250 instructions:u # 2.33 insn per cycle - # 0.14 stalled cycles per insn (74.94%) - 1.016455975 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3076) (512y: 0) (512z: 0) + 3,454,982,692 cycles # 2.735 GHz + 7,599,041,128 instructions # 2.20 insn per cycle + 1.263980564 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3054) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 3.234093e+00 -Avg ME (F77/C++) = 3.2340926420874894 -Relative difference = 1.1066858953654753e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 3.234092e+00 +Avg ME (F77/C++) = 3.2340919882990420 +Relative difference = 3.6180040581126224e-09 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.592851e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.112843e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.112843e+06 ) sec^-1 +MeanMatrixElemValue = ( 3.295004e+00 +- 3.229072e-03 ) GeV^0 +TOTAL : 1.180051 sec +INFO: No Floating Point Exceptions have been reported + 3,244,154,820 cycles # 2.739 GHz + 7,208,080,032 instructions # 2.22 insn per cycle + 1.185371954 seconds 
time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2854) (512y: 23) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 3.234092e+00 +Avg ME (F77/C++) = 3.2340919882990420 +Relative difference = 3.6180040581126224e-09 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.861599e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.601056e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
7.601056e+05 ) sec^-1 +MeanMatrixElemValue = ( 3.295004e+00 +- 3.229072e-03 ) GeV^0 +TOTAL : 1.616099 sec +INFO: No Floating Point Exceptions have been reported + 3,061,871,050 cycles # 1.890 GHz + 5,840,738,200 instructions # 1.91 insn per cycle + 1.621459577 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2375) (512y: 24) (512z: 1889) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 3.234092e+00 +Avg ME (F77/C++) = 3.2340921289287508 +Relative difference = 3.986551736519174e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt index 780d26225f..17580b0829 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt @@ -1,175 +1,218 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in 
BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-15_16:42:23 +DATE: 2024-05-16_15:17:12 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.921251e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.081243e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.260252e+08 ) sec^-1 -MeanMatrixElemValue = ( 3.286435e+00 +- 3.209475e-03 ) GeV^0 -TOTAL : 0.320643 sec +EvtsPerSec[Rmb+ME] (23) = ( 9.907160e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.728602e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.048441e+08 ) sec^-1 +MeanMatrixElemValue = ( 
3.294909e+00 +- 3.228140e-03 ) GeV^0 +TOTAL : 0.485743 sec INFO: No Floating Point Exceptions have been reported - 782,507,868 cycles:u # 2.281 GHz (73.92%) - 2,270,647 stalled-cycles-frontend:u # 0.29% frontend cycles idle (74.12%) - 4,716,638 stalled-cycles-backend:u # 0.60% backend cycles idle (74.10%) - 1,294,011,543 instructions:u # 1.65 insn per cycle - # 0.00 stalled cycles per insn (76.06%) - 0.372039073 seconds time elapsed + 2,023,423,533 cycles # 2.849 GHz + 2,905,255,031 instructions # 1.44 insn per cycle + 0.768600730 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/GPU) = 3.234089e+00 -Avg ME (F77/GPU) = 3.2340912986546755 -Relative difference = 7.107580142328097e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 3.234085e+00 +Avg ME (F77/GPU) = 3.2341253389604390 +Relative difference = 1.2473067479392238e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops 
fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.982185e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.071485e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.071485e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.287818e+00 +- 3.220110e-03 ) GeV^0 -TOTAL : 3.662693 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.168782e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.238544e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.238544e+05 ) sec^-1 +MeanMatrixElemValue = ( 3.294973e+00 +- 3.228584e-03 ) GeV^0 +TOTAL : 4.921731 sec INFO: No Floating Point Exceptions have been reported - 12,648,718,745 cycles:u # 3.446 GHz (74.89%) - 7,351,614 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.94%) - 9,939,996 stalled-cycles-backend:u # 0.08% backend cycles idle (74.84%) - 37,497,463,017 instructions:u # 2.96 insn per cycle - # 0.00 stalled cycles per insn (74.91%) - 3.674792475 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 500) (avx2: 0) (512y: 0) (512z: 0) + 14,314,886,956 cycles # 2.906 GHz + 39,834,092,366 instructions # 2.78 insn per cycle + 4.927032591 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 567) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 3.234094e+00 -Avg ME (F77/C++) = 3.2340939850546420 -Relative difference = 4.621188450363643e-09 +Avg ME (F77/C++) = 3.2340941675938666 +Relative difference = 5.182096339328524e-08 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 7.348676e+05 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 7.942747e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.942747e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.287818e+00 +- 3.220111e-03 ) GeV^0 -TOTAL : 1.570234 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.713515e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.269520e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.269520e+05 ) sec^-1 +MeanMatrixElemValue = ( 3.294972e+00 +- 3.228583e-03 ) GeV^0 +TOTAL : 1.922771 sec INFO: No Floating Point Exceptions have been reported - 5,336,993,251 cycles:u # 3.383 GHz (74.71%) - 7,482,422 stalled-cycles-frontend:u # 0.14% frontend cycles idle (74.93%) - 1,350,151,950 stalled-cycles-backend:u # 25.30% backend cycles idle (75.15%) - 15,181,564,216 instructions:u # 2.84 insn per cycle - # 0.09 stalled cycles per insn (75.15%) - 1.581043852 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2329) (avx2: 0) (512y: 0) (512z: 0) + 5,581,497,918 cycles # 2.896 GHz + 15,286,085,618 instructions # 2.74 insn per cycle + 1.928038449 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2473) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 3.234094e+00 -Avg ME (F77/C++) = 3.2340941177681088 -Relative difference = 3.641455970126884e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 3.234093e+00 +Avg ME (F77/C++) = 3.2340934062376618 +Relative difference = 1.2561100182708985e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 
8.942098e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.758757e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.758757e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.287796e+00 +- 3.219543e-03 ) GeV^0 -TOTAL : 1.315652 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.348339e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.987488e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.987488e+05 ) sec^-1 +MeanMatrixElemValue = ( 3.295004e+00 +- 3.229072e-03 ) GeV^0 +TOTAL : 1.738529 sec INFO: No Floating Point Exceptions have been reported - 4,441,993,205 cycles:u # 3.357 GHz (74.68%) - 8,141,578 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.96%) - 1,679,672,585 stalled-cycles-backend:u # 37.81% backend cycles idle (75.21%) - 9,786,472,986 instructions:u # 2.20 insn per cycle - # 0.17 stalled cycles per insn (75.21%) - 1.327031748 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3749) (512y: 0) (512z: 0) + 4,748,584,350 cycles # 2.724 GHz + 9,734,762,909 instructions # 2.05 insn per cycle + 1.743720825 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3707) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 3.234093e+00 -Avg ME (F77/C++) = 3.2340926462784410 -Relative difference = 1.0937272340475427e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 3.234092e+00 +Avg ME (F77/C++) = 3.2340919817797840 +Relative difference = 5.633796441974414e-09 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.524514e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.201131e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.201131e+05 ) sec^-1 +MeanMatrixElemValue = ( 3.295004e+00 +- 3.229072e-03 ) GeV^0 +TOTAL : 1.693263 sec +INFO: No Floating Point Exceptions have been reported + 4,630,030,488 cycles # 2.727 GHz + 9,326,323,775 instructions # 2.01 insn per cycle + 1.698452247 seconds 
time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3495) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 3.234092e+00 +Avg ME (F77/C++) = 3.2340919817797840 +Relative difference = 5.633796441974414e-09 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.566237e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.043529e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.043529e+05 
) sec^-1 +MeanMatrixElemValue = ( 3.295004e+00 +- 3.229072e-03 ) GeV^0 +TOTAL : 1.970968 sec +INFO: No Floating Point Exceptions have been reported + 3,659,262,236 cycles # 1.853 GHz + 7,035,706,161 instructions # 1.92 insn per cycle + 1.976219857 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2608) (512y: 12) (512z: 2220) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 3.234092e+00 +Avg ME (F77/C++) = 3.2340921270661056 +Relative difference = 3.928957668408837e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt index 9650f66d4a..b504154b8b 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt @@ -1,175 +1,218 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for 
tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-15_16:42:35 +DATE: 2024-05-16_15:17:33 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.852469e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.026591e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.082432e+07 ) sec^-1 -MeanMatrixElemValue = ( 3.295059e+00 +- 3.224567e-03 ) GeV^0 -TOTAL : 0.369753 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.734753e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.166290e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.275672e+08 ) sec^-1 +MeanMatrixElemValue = ( 
3.294877e+00 +- 3.227953e-03 ) GeV^0 +TOTAL : 0.527580 sec INFO: No Floating Point Exceptions have been reported - 910,136,031 cycles:u # 2.309 GHz (74.36%) - 2,197,500 stalled-cycles-frontend:u # 0.24% frontend cycles idle (74.30%) - 5,789,107 stalled-cycles-backend:u # 0.64% backend cycles idle (74.11%) - 1,453,168,059 instructions:u # 1.60 insn per cycle - # 0.00 stalled cycles per insn (75.55%) - 0.424603823 seconds time elapsed + 2,184,025,819 cycles # 2.852 GHz + 3,120,664,968 instructions # 1.43 insn per cycle + 0.822365132 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 3.234080e+00 -Avg ME (F77/GPU) = 3.2340795839181671 -Relative difference = 1.2865539287460837e-07 +Avg ME (F77/GPU) = 3.2340795839181666 +Relative difference = 1.2865539301192385e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': 
~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.463673e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.525138e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.525138e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.287639e+00 +- 3.218203e-03 ) GeV^0 -TOTAL : 4.436706 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.032702e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.091464e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.091464e+05 ) sec^-1 +MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 +TOTAL : 5.267767 sec INFO: No Floating Point Exceptions have been reported - 15,156,001,082 cycles:u # 3.407 GHz (75.01%) - 9,664,661 stalled-cycles-frontend:u # 0.06% frontend cycles idle (75.01%) - 868,948,815 stalled-cycles-backend:u # 5.73% backend cycles idle (75.01%) - 39,292,243,946 instructions:u # 2.59 insn per cycle - # 0.02 stalled cycles per insn (75.01%) - 4.453135404 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 747) (avx2: 0) (512y: 0) (512z: 0) + 15,275,610,730 cycles # 2.898 GHz + 38,585,204,587 instructions # 2.53 insn per cycle + 5.273127531 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 677) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 3.234080e+00 Avg ME (F77/C++) = 3.2340796721168488 Relative difference = 1.0138374786539113e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 4.506627e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.734964e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.734964e+05 ) 
sec^-1 -MeanMatrixElemValue = ( 3.287639e+00 +- 3.218203e-03 ) GeV^0 -TOTAL : 2.507951 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.478780e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.672331e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.672331e+05 ) sec^-1 +MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 +TOTAL : 3.124457 sec INFO: No Floating Point Exceptions have been reported - 8,429,880,809 cycles:u # 3.347 GHz (74.92%) - 8,940,501 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.91%) - 1,301,822,604 stalled-cycles-backend:u # 15.44% backend cycles idle (74.92%) - 24,149,695,606 instructions:u # 2.86 insn per cycle - # 0.05 stalled cycles per insn (74.92%) - 2.523103375 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2102) (avx2: 0) (512y: 0) (512z: 0) + 8,951,368,692 cycles # 2.862 GHz + 24,230,346,765 instructions # 2.71 insn per cycle + 3.129932357 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2188) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 3.234080e+00 Avg ME (F77/C++) = 3.2340796721168488 Relative difference = 1.0138374786539113e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 7.750025e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.357405e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.357405e+05 ) 
sec^-1 -MeanMatrixElemValue = ( 3.287639e+00 +- 3.218203e-03 ) GeV^0 -TOTAL : 1.533704 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.646169e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.144963e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.144963e+05 ) sec^-1 +MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 +TOTAL : 1.966588 sec +INFO: No Floating Point Exceptions have been reported + 5,394,193,630 cycles # 2.737 GHz + 11,282,079,100 instructions # 2.09 insn per cycle + 1.972075346 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2483) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 3.234080e+00 +Avg ME (F77/C++) = 3.2340796772295590 +Relative difference = 9.980286234148268e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) 
[cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.312770e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.933844e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.933844e+05 ) sec^-1 +MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 +TOTAL : 1.769300 sec INFO: No Floating Point Exceptions have been reported - 5,071,742,396 cycles:u # 3.283 GHz (74.70%) - 12,094,843 stalled-cycles-frontend:u # 0.24% frontend cycles idle (74.84%) - 1,405,067,296 stalled-cycles-backend:u # 27.70% backend cycles idle (75.10%) - 11,364,876,035 instructions:u # 2.24 insn per cycle - # 0.12 stalled cycles per insn (75.15%) - 1.549168658 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2470) (512y: 0) (512z: 0) + 4,855,634,573 cycles # 2.737 GHz + 10,529,908,188 instructions # 2.17 insn per cycle + 1.774939787 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2170) (512y: 148) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340796844996675 -Relative difference = 9.755489429022839e-08 +Avg ME (F77/C++) = 3.2340796772295590 +Relative difference = 9.980286234148268e-08 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.779051e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.993953e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.993953e+05 ) sec^-1 +MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 +TOTAL : 2.883837 sec +INFO: No Floating Point Exceptions have been reported + 5,232,692,174 cycles # 1.812 GHz + 7,609,089,901 instructions # 1.45 insn per cycle + 2.889504238 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1633) (512y: 126) (512z: 1611) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 3.234080e+00 +Avg ME (F77/C++) = 3.2340796772295590 +Relative difference = 9.980286234148268e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt index 5f5fc8dfa6..62b069d661 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt @@ -1,175 +1,218 @@ -Building in /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx -BACKEND=cppavx2 (was cppauto) +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) OMPFLAGS=-fopenmp FPTYPE='d' HELINL='0' HRDCOD='0' -HASCURAND=hasNoCurand +HASCURAND=hasCurand 
HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR == 1) +Building in BUILDDIR=build.auto_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. -make USEBUILDDIR=1 BACKEND=hip -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep3/scratch/project_465001114/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-05-15_16:42:48 +DATE: 2024-05-16_15:17:56 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/gcheck.exe -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.856097e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.930499e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.983942e+07 ) sec^-1 -MeanMatrixElemValue = ( 3.295059e+00 +- 3.224567e-03 ) GeV^0 -TOTAL : 0.370270 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.743856e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.168884e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.279553e+08 ) sec^-1 +MeanMatrixElemValue = ( 
3.294877e+00 +- 3.227953e-03 ) GeV^0 +TOTAL : 0.531580 sec INFO: No Floating Point Exceptions have been reported - 893,302,871 cycles:u # 2.248 GHz (74.53%) - 2,064,846 stalled-cycles-frontend:u # 0.23% frontend cycles idle (75.73%) - 5,563,392 stalled-cycles-backend:u # 0.62% backend cycles idle (75.97%) - 1,398,072,822 instructions:u # 1.57 insn per cycle - # 0.00 stalled cycles per insn (76.03%) - 0.422543554 seconds time elapsed + 2,155,818,187 cycles # 2.818 GHz + 3,085,690,683 instructions # 1.43 insn per cycle + 0.823819066 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 3.234080e+00 -Avg ME (F77/GPU) = 3.2340795839181671 -Relative difference = 1.2865539287460837e-07 +Avg ME (F77/GPU) = 3.2340795839181666 +Relative difference = 1.2865539301192385e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': 
~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.427184e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.486616e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.486616e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.287639e+00 +- 3.218203e-03 ) GeV^0 -TOTAL : 4.501563 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.002464e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.060011e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.060011e+05 ) sec^-1 +MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 +TOTAL : 5.344548 sec INFO: No Floating Point Exceptions have been reported - 15,471,361,613 cycles:u # 3.428 GHz (75.01%) - 9,587,219 stalled-cycles-frontend:u # 0.06% frontend cycles idle (75.01%) - 53,435,497 stalled-cycles-backend:u # 0.35% backend cycles idle (75.01%) - 40,101,852,759 instructions:u # 2.59 insn per cycle - # 0.00 stalled cycles per insn (75.02%) - 4.518616180 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 631) (avx2: 0) (512y: 0) (512z: 0) + 15,331,700,326 cycles # 2.866 GHz + 40,369,778,421 instructions # 2.63 insn per cycle + 5.350011304 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 3.234080e+00 Avg ME (F77/C++) = 3.2340796721168488 Relative difference = 1.0138374786539113e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 4.491073e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.718650e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.718650e+05 ) 
sec^-1 -MeanMatrixElemValue = ( 3.287639e+00 +- 3.218203e-03 ) GeV^0 -TOTAL : 2.516438 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.555017e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.755921e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.755921e+05 ) sec^-1 +MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 +TOTAL : 3.059082 sec INFO: No Floating Point Exceptions have been reported - 8,513,144,488 cycles:u # 3.366 GHz (75.02%) - 9,669,053 stalled-cycles-frontend:u # 0.11% frontend cycles idle (75.01%) - 1,081,416,989 stalled-cycles-backend:u # 12.70% backend cycles idle (75.01%) - 23,544,538,216 instructions:u # 2.77 insn per cycle - # 0.05 stalled cycles per insn (75.01%) - 2.532731041 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1992) (avx2: 0) (512y: 0) (512z: 0) + 8,522,277,742 cycles # 2.782 GHz + 23,253,428,254 instructions # 2.73 insn per cycle + 3.064709896 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2090) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 3.234080e+00 Avg ME (F77/C++) = 3.2340796721168488 Relative difference = 1.0138374786539113e-07 OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 6.853773e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.323472e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.323472e+05 ) 
sec^-1 -MeanMatrixElemValue = ( 3.287639e+00 +- 3.218203e-03 ) GeV^0 -TOTAL : 1.712400 sec +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.699594e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.044812e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.044812e+05 ) sec^-1 +MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 +TOTAL : 2.340476 sec +INFO: No Floating Point Exceptions have been reported + 6,239,696,903 cycles # 2.661 GHz + 12,963,096,678 instructions # 2.08 insn per cycle + 2.346005075 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2668) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 3.234080e+00 +Avg ME (F77/C++) = 3.2340796772295590 +Relative difference = 9.980286234148268e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) 
[cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.032659e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.430530e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.430530e+05 ) sec^-1 +MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 +TOTAL : 2.191599 sec INFO: No Floating Point Exceptions have been reported - 5,695,677,441 cycles:u # 3.302 GHz (74.96%) - 11,580,613 stalled-cycles-frontend:u # 0.20% frontend cycles idle (74.95%) - 725,805,658 stalled-cycles-backend:u # 12.74% backend cycles idle (74.95%) - 13,041,698,391 instructions:u # 2.29 insn per cycle - # 0.06 stalled cycles per insn (74.97%) - 1.728510178 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2710) (512y: 0) (512z: 0) + 5,901,015,524 cycles # 2.687 GHz + 12,238,387,260 instructions # 2.07 insn per cycle + 2.197121947 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2208) (512y: 296) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340796844996675 -Relative difference = 9.755489429022839e-08 +Avg ME (F77/C++) = 3.2340796772295590 +Relative difference = 9.980286234148268e-08 OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.554826e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.745267e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.745267e+05 ) sec^-1 +MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 +TOTAL : 3.058656 sec +INFO: No Floating Point Exceptions have been reported + 5,596,491,041 cycles # 1.827 GHz + 8,743,545,379 instructions # 1.56 insn per cycle + 3.064278596 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1490) (512y: 183) (512z: 1908) ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest_cpp.exe +[ PASSED ] 3 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2024/madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 3.234080e+00 +Avg ME (F77/C++) = 3.2340796772295590 +Relative difference = 9.980286234148268e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/simdSym.sh b/epochX/cudacpp/tput/simdSym.sh index 36ce0a868c..057b218a33 100755 --- a/epochX/cudacpp/tput/simdSym.sh +++ b/epochX/cudacpp/tput/simdSym.sh @@ -2,6 +2,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. +# Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. 
#---------------------------------------------------------------------------------------------- # See http://sponce.web.cern.ch/sponce/CSC/slides/PracticalVectorization.booklet.pdf @@ -38,7 +39,7 @@ function countSyms() { printf "(%24s : %4d) " "'$sym'" $(cat $dump | awk '/^ +[[:xdigit:]]+:\t/' | cut -f3- | egrep "$sym" | wc -l) done; printf "\n" } -#dump=$1; shift; countSyms $* # for debugging (./simdSym.sh ./build.none/CPPProcess.o.objdump 'addsd.*xmm') +#dump=$1; shift; countSyms $* # for debugging (./simdSym.sh ./build.none/CPPProcess_cpp.o.objdump 'addsd.*xmm') function mainCountSyms() { # Command line arguments: select file @@ -108,7 +109,7 @@ mainListSyms $* function mainCompareSyms() { allFileSyms="" for avx in none sse4 avx2 512y 512z; do - file=./build.$avx/CPPProcess.o + file=./build.$avx/CPPProcess_cpp.o fileSymsRaw=$file.symlist.raw mainListSyms $file > $fileSymsRaw ls -l $fileSymsRaw @@ -117,7 +118,7 @@ function mainCompareSyms() { allSymList=all.symlist cat $allFileSyms | sort -u | awk -vf1= -vf2= -vf3=0 '{if ($1==f1 && $2==f2){f3+=$3} else {if(f1!=""){print f1,f2,f3};f1=$1;f2=$2;f3=$3}}END{print f1,f2,f3}' | sort -u -k 1,2 > $allSymList for avx in none sse4 avx2 512y 512z; do - file=./build.$avx/CPPProcess.o + file=./build.$avx/CPPProcess_cpp.o fileSymsRaw=$file.symlist.raw fileSyms=$file.symlist cat $fileSymsRaw | awk -vall=$allSymList -vfmt="%5s %15s %5d\n" -vtot=0 -veof=1 '{f1=$1; f2=$2; f3=$3; tot+=f3; while(f3>0){getline < all; if($1==f1 && $2==f2){printf fmt,$1,$2,f3; f3=0} else {printf fmt,$1,$2,0}}} END{while(getline < all){printf fmt,$1,$2,0};printf fmt,"TOTAL","",tot}' > $fileSyms diff --git a/epochX/cudacpp/tput/throughputX.sh b/epochX/cudacpp/tput/throughputX.sh index 71376e9b08..9cbfd9d74f 100755 --- a/epochX/cudacpp/tput/throughputX.sh +++ b/epochX/cudacpp/tput/throughputX.sh @@ -219,7 +219,7 @@ while [ "$1" != "" ]; do detailed=1 shift elif [ "$1" == "-gtest" ]; then - # For simplicity a gtest runTest.exe is executed for each build 
where check.exe or gcheck.exe is executed + # For simplicity a gtest runTest_xxx.exe is executed for each build where check_xxx.exe is executed gtest=1 shift ###elif [ "$1" == "-nofpe" ]; then @@ -393,9 +393,9 @@ for dir in $dirs; do for bbld in cuda hip none sse4 avx2 512y 512z; do if [ "${bblds}" == "${bbldsall}" ] || [ "${bblds/${bbld}}" != "${bblds}" ]; then if [ "${bbld}" == "cuda" ] || [ "${bbld}" == "hip" ]; then - exes="$exes $dir/build.${bbld}_${fptype}_inl${helinl}${hrdsuf}/gcheck.exe" + exes="$exes $dir/build.${bbld}_${fptype}_inl${helinl}${hrdsuf}/check_${bbld}.exe" else - exes="$exes $dir/build.${bbld}_${fptype}_inl${helinl}${hrdsuf}/check.exe" + exes="$exes $dir/build.${bbld}_${fptype}_inl${helinl}${hrdsuf}/check_cpp.exe" fi fi done @@ -479,7 +479,7 @@ function runExe() { if [ "${maketype}" == "-dryrun" ]; then return; fi pattern="Process|fptype_sv|OMP threads|EvtsPerSec\[MECalc|MeanMatrix|FP precision|TOTAL :" # Optionally add other patterns here for some specific configurations (e.g. 
clang) - if [ "${exe%%/gcheck*}" != "${exe}" ]; then pattern="${pattern}|EvtsPerSec\[Matrix"; fi + if [ "${exe%%/check_cuda*}" != "${exe}" ] || [ "${exe%%/check_hip*}" != "${exe}" ]; then pattern="${pattern}|EvtsPerSec\[Matrix"; fi pattern="${pattern}|Workflow" ###pattern="${pattern}|CUCOMPLEX" ###pattern="${pattern}|COMMON RANDOM|CURAND HOST \(CUDA" @@ -511,7 +511,6 @@ function runExe() { function cmpExe() { exe=$1 exef=${exe/\/check//fcheck} - exef=${exef/\/gcheck//fgcheck} argsf="2 64 2" args="--common -p ${argsf}" echo "cmpExe $exe $args" @@ -520,7 +519,7 @@ function cmpExe() { tmp=$(mktemp) me1=$(${exe} ${args} 2>${tmp} | grep MeanMatrix | awk '{print $4}'); cat ${tmp} me2=$(${exef} ${argsf} 2>${tmp} | grep Average | awk '{print $4}'); cat ${tmp} - if [ "${exe%%/gcheck*}" != "${exe}" ]; then tag="/GPU)"; else tag="/C++) "; fi + if [ "${exe%%/check_cuda*}" != "${exe}" ] || [ "${exe%%/check_hip*}" != "${exe}" ]; then tag="/GPU)"; else tag="/C++) "; fi echo -e "Avg ME (C++${tag} = ${me1}\nAvg ME (F77${tag} = ${me2}" if [ "${me2}" == "NaN" ]; then echo "ERROR! 
Fortran calculation (F77${tag} returned NaN" @@ -615,14 +614,17 @@ echo -e "On $HOSTNAME [CPU: $cpuTxt] [GPU: $gpuTxt]:" BMKEXEARGS="" # if BMKEXEARGS is set, exeArgs is set equal to BMKEXEARGS, while exeArgs2 is set to "" BMKMULTIPLIER=1 # the pre-defined numbers of iterations (including those in BMKEXEARGS) are multiplied by BMKMULTIPLIER -lastExe= +###lastExe= +lastExeDir= ###echo "exes=$exes" for exe in $exes; do ###echo EXE=$exe; continue exeArgs2="" - if [ "$(basename $exe)" != "$lastExe" ]; then + ###if [ "$(basename $exe)" != "$lastExe" ]; then + if [ "$(basename $(dirname $exe))" != "$lastExeDir" ]; then echo "=========================================================================" - lastExe=$(basename $exe) + ###lastExe=$(basename $exe) + lastExeDir=$(basename $(dirname $exe)) else echo "-------------------------------------------------------------------------" fi @@ -638,7 +640,7 @@ for exe in $exes; do if [ "${exe/build.512y}" != "${exe}" ]; then echo "$exe is not supported (no avx512vl in /proc/cpuinfo)"; continue; fi if [ "${exe/build.512z}" != "${exe}" ]; then echo "$exe is not supported (no avx512vl in /proc/cpuinfo)"; continue; fi fi - if [ "${exe%%/gcheck*}" != "${exe}" ] && [ "$gpuTxt" == "none" ]; then continue; fi + if [[ "${exe%%/check_cuda*}" != "${exe}" || "${exe%%/check_hip*}" != "${exe}" ]] && [ "$gpuTxt" == "none" ]; then pattern="${pattern}|EvtsPerSec\[Matrix"; fi if [ "${exe%%/heft_gg_bb*}" != "${exe}" ]; then # For heftggbb, use the same settings as for ggtt exeArgs="-p 2048 256 2" @@ -698,9 +700,9 @@ for exe in $exes; do cd $exeDir/.. 
# workaround for reading '../../Cards/param_card.dat' without setting MG5AMC_CARD_PATH unset OMP_NUM_THREADS runExe $exe "$exeArgs" - if [ "${exe%%/check*}" != "${exe}" ]; then + if [ "${exe%%/check_cpp*}" != "${exe}" ]; then if [ "${maketype}" != "-dryrun" ]; then - obj=${exe%%/check*}/CPPProcess.o; $scrdir/simdSymSummary.sh -stripdir ${obj} -dumptotmp # comment out -dumptotmp to keep full objdump + obj=${exe%%.exe}; obj=${obj/check/CPPProcess}.o; $scrdir/simdSymSummary.sh -stripdir ${obj} -dumptotmp # comment out -dumptotmp to keep full objdump fi if [ "${omp}" == "1" ]; then echo "-------------------------------------------------------------------------" @@ -708,7 +710,7 @@ for exe in $exes; do runExe $exe "$exeArgs" unset OMP_NUM_THREADS fi - elif [ "${exe%%/gcheck*}" != "${exe}" ] || [ "${exe%%/alpcheck*}" != "${exe}" ]; then + elif [[ "${exe%%/check_cuda*}" != "${exe}" || "${exe%%/check_hip*}" != "${exe}" ]] || [ "${exe%%/alpcheck*}" != "${exe}" ]; then runNcu $exe "$ncuArgs" if [ "${div}" == "1" ]; then runNcuDiv $exe; fi if [ "${req}" == "1" ]; then runNcuReq $exe "$ncuArgs"; fi @@ -716,8 +718,7 @@ for exe in $exes; do fi if [ "${gtest}" == "1" ]; then echo "-------------------------------------------------------------------------" - exe2=${exe/gcheck/runTest} # first try to replace gcheck.exe - exe2=${exe2/check/runTest} # then try to replace check.exe instead + exe2=${exe/check/runTest} # replace check_xxx.exe by runTest_xxx.exe echo "runExe $exe2" $exe2 2>&1 | tail -1 if [ ${PIPESTATUS[0]} -ne "0" ]; then exit 1; fi