From 45ea34b52d5d397e569fe2bc8d5c2f0ff15bef04 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Wed, 6 Jul 2022 19:32:49 -0400 Subject: [PATCH 01/57] Start rdf gpu kernel --- src/cuda_kernels/kernel_rdf.cu | 33 +++++++++++++++++++++++++++++++++ src/cuda_kernels/kernel_rdf.cuh | 6 ++++++ 2 files changed, 39 insertions(+) create mode 100644 src/cuda_kernels/kernel_rdf.cu create mode 100644 src/cuda_kernels/kernel_rdf.cuh diff --git a/src/cuda_kernels/kernel_rdf.cu b/src/cuda_kernels/kernel_rdf.cu new file mode 100644 index 0000000000..b765dc1078 --- /dev/null +++ b/src/cuda_kernels/kernel_rdf.cu @@ -0,0 +1,33 @@ +#include "kernel_rdf.cuh" +#if defined(__HIP_PLATFORM_HCC__) +#include +#include "../HipDefinitions.h" +#endif + +#define BLOCKDIM 512 + +/** Calculate distances between pairs of atoms and bin them into a 1D histogram. */ +void Cpptraj_GPU_RDF(int* bins, + const double* xyz1, int N1, + const double* xyz2, int N2, + ImageOption::Type imageType, + const double* box, const double* ucell, const double* recip) +{ + double* device_xyz1; + cudaMalloc(((void**)(&device_xyz1)), N1 * 3 * sizeof(double)); + + double* device_xyz2; + cudaMalloc(((void**)(&device_xyz2)), N2 * 3 * sizeof(double)); + + double *boxDev; + double *ucellDev, *recipDev; + if (imageType == ImageOption::ORTHO) { + cudaMalloc(((void**)(&boxDev)), 3 * sizeof(double)); + cudaMemcpy(boxDev,box, 3 * sizeof(double), cudaMemcpyHostToDevice); + } else if (imageType == ImageOption::NONORTHO) { + cudaMalloc(((void**)(&ucellDev)), 9 * sizeof(double)); + cudaMalloc(((void**)(&recipDev)), 9 * sizeof(double)); + cudaMemcpy(ucellDev,ucell, 9 * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(recipDev,recip, 9 * sizeof(double), cudaMemcpyHostToDevice); + } +} diff --git a/src/cuda_kernels/kernel_rdf.cuh b/src/cuda_kernels/kernel_rdf.cuh new file mode 100644 index 0000000000..4375eef027 --- /dev/null +++ b/src/cuda_kernels/kernel_rdf.cuh @@ -0,0 +1,6 @@ +#ifndef INC_KERNEL_RDF_CUH +#define INC_KERNEL_RDF_CUH +#include "../ImageOption.h" +void Cpptraj_GPU_RDF(int*, const double*, int, const double*, int, + ImageOption::Type, const double*, const double*, const double*); +#endif From a1030d675bb775ebf1409b18d37dc11419bf098a Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Thu, 7 Jul 2022 08:51:52 -0400 Subject: [PATCH 02/57] Add rdf entry function to Makefile --- src/cuda_kernels/Makefile | 2 +- src/cuda_kernels/cudadepend | 1 + src/cuda_kernels/kernel_rdf.cu | 8 ++++++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/cuda_kernels/Makefile b/src/cuda_kernels/Makefile index 68a05651d8..835f53ae01 100644 --- a/src/cuda_kernels/Makefile +++ b/src/cuda_kernels/Makefile @@ -13,7 +13,7 @@ TARGET = libcpptraj_cuda.a # Source files -CUDA_SOURCES=core_kernels.cu kernel_wrappers.cu GistCudaCalc.cu GistCudaSetup.cu +CUDA_SOURCES=core_kernels.cu kernel_wrappers.cu GistCudaCalc.cu GistCudaSetup.cu kernel_rdf.cu # Objects diff --git a/src/cuda_kernels/cudadepend b/src/cuda_kernels/cudadepend index b931699125..7866820ec6 100644 --- a/src/cuda_kernels/cudadepend +++ b/src/cuda_kernels/cudadepend @@ -1,4 +1,5 @@ GistCudaCalc.o : GistCudaCalc.cu GistCudaCalc.cuh GistCudaSetup.o : GistCudaSetup.cu ../HipDefinitions.h GistCudaCalc.cuh GistCudaSetup.cuh core_kernels.o : core_kernels.cu core_kernels.cuh +kernel_rdf.o : kernel_rdf.cu ../CpptrajStdio.h ../HipDefinitions.h ../ImageOption.h kernel_rdf.cuh kernel_wrappers.o : kernel_wrappers.cu ../HipDefinitions.h ../ImageOption.h core_kernels.cuh kernel_wrappers.cuh diff --git a/src/cuda_kernels/kernel_rdf.cu b/src/cuda_kernels/kernel_rdf.cu index b765dc1078..8623bec96e 100644 --- a/src/cuda_kernels/kernel_rdf.cu +++ b/src/cuda_kernels/kernel_rdf.cu @@ -1,4 +1,5 @@ #include "kernel_rdf.cuh" +#include "../CpptrajStdio.h" #if defined(__HIP_PLATFORM_HCC__) #include #include "../HipDefinitions.h" @@ -30,4 +31,11 @@ void Cpptraj_GPU_RDF(int* bins, cudaMemcpy(ucellDev,ucell, 9 * sizeof(double), cudaMemcpyHostToDevice); cudaMemcpy(recipDev,recip, 9 * sizeof(double), cudaMemcpyHostToDevice); } + + // Determine number of blocks + dim3 threadsPerBlock(BLOCKDIM, BLOCKDIM); + dim3 numBlocks(N1 / threadsPerBlock.x, N2 / threadsPerBlock.y); + mprintf("#Atoms = %i, %i; Threads per block = %i, %i; #Blocks = %i, %i\n", + N1, N2, threadsPerBlock.x, threadsPerBlock.y, numBlocks.x, numBlocks.y); + } From cd046c6d4ea95208be262e7bf600f0b644837d32 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Thu, 7 Jul 2022 15:48:52 -0400 Subject: [PATCH 03/57] Add test for RDF --- src/Action_Radial.cpp | 32 ++++++++++++++++++++++++++++++++ src/cpptrajdepend | 2 +- src/cuda_kernels/kernel_rdf.cu | 2 +- src/cuda_kernels/kernel_rdf.cuh | 2 +- 4 files changed, 35 insertions(+), 3 deletions(-) diff --git a/src/Action_Radial.cpp b/src/Action_Radial.cpp index 30cb36cf91..4f4ad7dc3a 100644 --- a/src/Action_Radial.cpp +++ b/src/Action_Radial.cpp @@ -7,6 +7,9 @@ #ifdef _OPENMP # include #endif +#ifdef CUDA +# include "cuda_kernels/kernel_rdf.cuh" +#endif // CONSTRUCTOR Action_Radial::Action_Radial() : @@ -439,6 +442,21 @@ Action::RetType Action_Radial::Setup(ActionSetup& setup) { return Action::OK; } +#ifdef CUDA +static inline std::vector mask_to_xyz(AtomMask const& Mask, Frame const& frm) +{ + std::vector outerxyz; + outerxyz.reserve( Mask.Nselected()*3 ); + for (AtomMask::const_iterator at = Mask.begin(); at != Mask.end(); ++at) { + const double* xyz = frm.XYZ( *at ); + outerxyz.push_back( xyz[0] ); + outerxyz.push_back( xyz[1] ); + outerxyz.push_back( xyz[2] ); + } + return outerxyz; +} +#endif + // Action_Radial::DoAction() /** Calculate distances from atoms in mask1 to atoms in mask 2 and * bin them. @@ -458,6 +476,19 @@ Action::RetType Action_Radial::DoAction(int frameNum, ActionFrame& frm) { if (useVolume_) volume_ += frm.Frm().BoxCrd().CellVolume(); // --------------------------------------------- +#ifdef CUDA + // Copy atoms FIXME need to do overlapping and non-overlapping case + std::vector outerxyz = mask_to_xyz(OuterMask_, frm.Frm()); + std::vector innerxyz = mask_to_xyz(InnerMask_, frm.Frm()); + Cpptraj_GPU_RDF( &RDF_[0], + &outerxyz[0], OuterMask_.Nselected(), + &innerxyz[0], InnerMask_.Nselected(), + imageOpt_.ImagingType(), + frm.Frm().BoxCrd().XyzPtr(), + frm.Frm().BoxCrd().UnitCell().Dptr(), + frm.Frm().BoxCrd().FracCell().Dptr() ); +#else /* CUDA */ + // --------------------------------------------- if ( rmode_ == NORMAL ) { // Calculation of all atoms in Mask1 to all atoms in Mask2 int outer_max = OuterMask_.Nselected(); @@ -602,6 +633,7 @@ Action::RetType Action_Radial::DoAction(int frameNum, ActionFrame& frm) { } // END pragma omp parallel # endif } +#endif /* CUDA */ ++numFrames_; return Action::OK; diff --git a/src/cpptrajdepend b/src/cpptrajdepend index 9b806a7d97..4bf312017a 100644 --- a/src/cpptrajdepend +++ b/src/cpptrajdepend @@ -64,7 +64,7 @@ Action_Principal.o : Action_Principal.cpp Action.h ActionState.h Action_Principa Action_Projection.o : Action_Projection.cpp Action.h ActionFrameCounter.h ActionState.h Action_Projection.h ArgList.h Array1D.h ArrayIterator.h AssociatedData.h Atom.h AtomMask.h AtomType.h BaseIOtype.h Box.h Constants.h CoordinateInfo.h CpptrajFile.h CpptrajStdio.h DataFile.h DataFileList.h DataSet.h DataSetList.h DataSet_1D.h DataSet_2D.h DataSet_Coords.h DataSet_Coords_REF.h DataSet_MatrixDbl.h DataSet_Modes.h Dimension.h DispatchObject.h FileIO.h FileName.h FileTypes.h Frame.h MaskToken.h Matrix.h Matrix_3x3.h MetaData.h Molecule.h NameType.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h Range.h ReferenceFrame.h ReplicaDimArray.h Residue.h Segment.h StringRoutines.h SymbolExporting.h TextFormat.h Timer.h Topology.h TypeNameHolder.h Unit.h Vec3.h Action_Pucker.o : Action_Pucker.cpp Action.h ActionState.h Action_Pucker.h ArgList.h AssociatedData.h Atom.h AtomMask.h AtomType.h BaseIOtype.h Box.h Constants.h CoordinateInfo.h CpptrajFile.h CpptrajStdio.h DataFile.h DataFileList.h DataSet.h DataSetList.h DataSet_Coords.h DataSet_Coords_REF.h Dimension.h DispatchObject.h FileIO.h FileName.h FileTypes.h Frame.h MaskToken.h Matrix_3x3.h MetaData.h Molecule.h NameType.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h Range.h ReferenceFrame.h ReplicaDimArray.h Residue.h Segment.h SymbolExporting.h TextFormat.h Timer.h Topology.h TorsionRoutines.h TypeNameHolder.h Unit.h Vec3.h Action_Radgyr.o : Action_Radgyr.cpp Action.h ActionState.h Action_Radgyr.h ArgList.h AssociatedData.h Atom.h AtomMask.h AtomType.h BaseIOtype.h Box.h Constants.h CoordinateInfo.h CpptrajFile.h CpptrajStdio.h DataFile.h DataFileList.h DataSet.h DataSetList.h DataSet_Coords.h DataSet_Coords_REF.h Dimension.h DispatchObject.h FileIO.h FileName.h FileTypes.h Frame.h MaskToken.h Matrix_3x3.h MetaData.h Molecule.h NameType.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h Range.h ReferenceFrame.h ReplicaDimArray.h Residue.h Segment.h SymbolExporting.h TextFormat.h Timer.h Topology.h TypeNameHolder.h Unit.h Vec3.h -Action_Radial.o : Action_Radial.cpp Action.h ActionState.h Action_Radial.h ArgList.h AssociatedData.h Atom.h AtomMask.h AtomType.h BaseIOtype.h Box.h Constants.h CoordinateInfo.h CpptrajFile.h CpptrajStdio.h DataFile.h DataFileList.h DataSet.h DataSetList.h DataSet_Coords.h DataSet_Coords_REF.h Dimension.h DispatchObject.h DistRoutines.h FileIO.h FileName.h FileTypes.h Frame.h ImageOption.h MaskToken.h Matrix_3x3.h MetaData.h Molecule.h NameType.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h Range.h ReferenceFrame.h ReplicaDimArray.h Residue.h Segment.h SymbolExporting.h TextFormat.h Timer.h Topology.h TypeNameHolder.h Unit.h Vec3.h +Action_Radial.o : Action_Radial.cpp Action.h ActionState.h Action_Radial.h ArgList.h AssociatedData.h Atom.h AtomMask.h AtomType.h BaseIOtype.h Box.h Constants.h CoordinateInfo.h CpptrajFile.h CpptrajStdio.h DataFile.h DataFileList.h DataSet.h DataSetList.h DataSet_Coords.h DataSet_Coords_REF.h Dimension.h DispatchObject.h DistRoutines.h FileIO.h FileName.h FileTypes.h Frame.h ImageOption.h MaskToken.h Matrix_3x3.h MetaData.h Molecule.h NameType.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h Range.h ReferenceFrame.h ReplicaDimArray.h Residue.h Segment.h SymbolExporting.h TextFormat.h Timer.h Topology.h TypeNameHolder.h Unit.h Vec3.h cuda_kernels/kernel_rdf.cuh Action_RandomizeIons.o : Action_RandomizeIons.cpp Action.h ActionState.h Action_RandomizeIons.h ArgList.h AssociatedData.h Atom.h AtomMask.h AtomType.h BaseIOtype.h Box.h Constants.h CoordinateInfo.h CpptrajFile.h CpptrajStdio.h DataFile.h DataFileList.h DataSet.h DataSetList.h DataSet_Coords.h DataSet_Coords_REF.h Dimension.h DispatchObject.h DistRoutines.h FileIO.h FileName.h FileTypes.h Frame.h ImageOption.h MaskToken.h Matrix_3x3.h MetaData.h Molecule.h NameType.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h Random.h Range.h ReferenceFrame.h ReplicaDimArray.h Residue.h Segment.h SymbolExporting.h TextFormat.h Timer.h Topology.h TypeNameHolder.h Unit.h Vec3.h Action_Remap.o : Action_Remap.cpp Action.h ActionState.h ActionTopWriter.h Action_Remap.h ArgList.h AssociatedData.h Atom.h AtomMask.h AtomType.h BaseIOtype.h Box.h Constants.h CoordinateInfo.h CpptrajFile.h CpptrajStdio.h DataFile.h DataFileList.h DataSet.h DataSetList.h DataSet_1D.h DataSet_Coords.h DataSet_Coords_REF.h Dimension.h DispatchObject.h FileIO.h FileName.h FileTypes.h Frame.h MaskToken.h Matrix_3x3.h MetaData.h Molecule.h NameType.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h Range.h ReferenceFrame.h ReplicaDimArray.h Residue.h Segment.h SymbolExporting.h TextFormat.h Timer.h Topology.h TypeNameHolder.h Unit.h Vec3.h Action_ReplicateCell.o : Action_ReplicateCell.cpp Action.h ActionFrameCounter.h ActionState.h ActionTopWriter.h Action_ReplicateCell.h ArgList.h AssociatedData.h Atom.h AtomMask.h AtomType.h BaseIOtype.h Box.h Constants.h CoordinateInfo.h CpptrajFile.h CpptrajStdio.h DataFile.h DataFileList.h DataSet.h DataSetList.h DataSet_Coords.h DataSet_Coords_REF.h Dimension.h DispatchObject.h FileIO.h FileName.h FileTypes.h Frame.h MaskToken.h Matrix_3x3.h MetaData.h Molecule.h NameType.h OutputTrajCommon.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h Range.h ReferenceFrame.h ReplicaDimArray.h Residue.h Segment.h SymbolExporting.h TextFormat.h Timer.h Topology.h TrajectoryFile.h Trajout_Single.h TypeNameHolder.h Unit.h Vec3.h diff --git a/src/cuda_kernels/kernel_rdf.cu b/src/cuda_kernels/kernel_rdf.cu index 8623bec96e..7ed889dcba 100644 --- a/src/cuda_kernels/kernel_rdf.cu +++ b/src/cuda_kernels/kernel_rdf.cu @@ -8,7 +8,7 @@ #define BLOCKDIM 512 /** Calculate distances between pairs of atoms and bin them into a 1D histogram. */ -void Cpptraj_GPU_RDF(int* bins, +void Cpptraj_GPU_RDF(unsigned long* bins, const double* xyz1, int N1, const double* xyz2, int N2, ImageOption::Type imageType, diff --git a/src/cuda_kernels/kernel_rdf.cuh b/src/cuda_kernels/kernel_rdf.cuh index 4375eef027..c1c98ae71b 100644 --- a/src/cuda_kernels/kernel_rdf.cuh +++ b/src/cuda_kernels/kernel_rdf.cuh @@ -1,6 +1,6 @@ #ifndef INC_KERNEL_RDF_CUH #define INC_KERNEL_RDF_CUH #include "../ImageOption.h" -void Cpptraj_GPU_RDF(int*, const double*, int, const double*, int, +void Cpptraj_GPU_RDF(unsigned long*, const double*, int, const double*, int, ImageOption::Type, const double*, const double*, const double*); #endif From d849cf58c888ee6b88235b54c922801f09aed136 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Thu, 7 Jul 2022 16:15:45 -0400 Subject: [PATCH 04/57] Fix num blocks calc --- src/cuda_kernels/kernel_rdf.cu | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/cuda_kernels/kernel_rdf.cu b/src/cuda_kernels/kernel_rdf.cu index 7ed889dcba..5e36e2f027 100644 --- a/src/cuda_kernels/kernel_rdf.cu +++ b/src/cuda_kernels/kernel_rdf.cu @@ -7,6 +7,14 @@ #define BLOCKDIM 512 +static inline int calc_nblocks(int ntotal, int nthreadsPerBlock) +{ + int nblocks = ntotal / nthreadsPerBlock; + if ( (ntotal % nthreadsPerBlock) != 0 ) + nblocks++; + return nblocks; +} + /** Calculate distances between pairs of atoms and bin them into a 1D histogram. */ void Cpptraj_GPU_RDF(unsigned long* bins, const double* xyz1, int N1, @@ -34,7 +42,7 @@ void Cpptraj_GPU_RDF(unsigned long* bins, // Determine number of blocks dim3 threadsPerBlock(BLOCKDIM, BLOCKDIM); - dim3 numBlocks(N1 / threadsPerBlock.x, N2 / threadsPerBlock.y); + dim3 numBlocks(calc_nblocks(N1, threadsPerBlock.x), calc_nblocks(N2, threadsPerBlock.y)); mprintf("#Atoms = %i, %i; Threads per block = %i, %i; #Blocks = %i, %i\n", N1, N2, threadsPerBlock.x, threadsPerBlock.y, numBlocks.x, numBlocks.y); From 39a1b3a220799e903c4b723b6dfd9d7bf6dda743 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Thu, 7 Jul 2022 19:28:14 -0400 Subject: [PATCH 05/57] Change block dim to 32 to avoid creating too many threads per block. Start the rdf bin kernel --- src/cuda_kernels/core_kernels.cu | 28 +++++++++++++++++++++++++++- src/cuda_kernels/kernel_rdf.cu | 2 +- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/src/cuda_kernels/core_kernels.cu b/src/cuda_kernels/core_kernels.cu index 9ddb51794d..70c3cb2f1d 100644 --- a/src/cuda_kernels/core_kernels.cu +++ b/src/cuda_kernels/core_kernels.cu @@ -771,4 +771,30 @@ __global__ void kClosestDistsToAtoms_Nonortho(double*D_, } } - +// ----------------------------------------------------------------------------- +__global__ void kBinDistances_nonOverlap_nonOrtho(const double* xyz1, int N1, const double* xyz2, int N2, + const double* frac, const double* ucell) +{ + int a1 = blockIdx.x * blockDim.x + threadIdx.x; + int a2 = blockIdx.y * blockDim.y + threadIdx.y; + + if (a1 < N1 && a2 < N2) { + int idx1 = a1 * 3; + double a1x = xyz1[idx1 ]; + double a1y = xyz1[idx1+1]; + double a1z = xyz1[idx1+2]; + double f1x = frac[0]*a1x + frac[1]*a1y + frac[2]*a1z; + double f1y = frac[3]*a1x + frac[4]*a1y + frac[5]*a1z; + double f1z = frac[6]*a1x + frac[7]*a1y + frac[8]*a1z; + + int idx2 = a2 * 3; + double a2x = xyz2[idx2 ]; + double a2y = xyz2[idx2+1]; + double a2z = xyz2[idx2+2]; + double f2x = frac[0]*a2x + frac[1]*a2y + frac[2]*a2z; + double f2y = frac[3]*a2x + frac[4]*a2y + frac[5]*a2z; + double f2z = frac[6]*a2x + frac[7]*a2y + frac[8]*a2z; + + double dist2 = NonOrtho_dist(f2x, f2y, f2z, f1x ,f1y, f1z, ucell); + } +} diff --git a/src/cuda_kernels/kernel_rdf.cu b/src/cuda_kernels/kernel_rdf.cu index 5e36e2f027..9600e3e50a 100644 --- a/src/cuda_kernels/kernel_rdf.cu +++ b/src/cuda_kernels/kernel_rdf.cu @@ -5,7 +5,7 @@ #include "../HipDefinitions.h" #endif -#define BLOCKDIM 512 +#define BLOCKDIM 32 static inline int calc_nblocks(int ntotal, int nthreadsPerBlock) { From ff9fae491e8703e1024d0ed6e678bf63f743611d Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Thu, 7 Jul 2022 19:31:23 -0400 Subject: [PATCH 06/57] Add maximum2 and one_over_spacing --- src/cuda_kernels/core_kernels.cu | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/cuda_kernels/core_kernels.cu b/src/cuda_kernels/core_kernels.cu index 70c3cb2f1d..ae04045f76 100644 --- a/src/cuda_kernels/core_kernels.cu +++ b/src/cuda_kernels/core_kernels.cu @@ -773,7 +773,8 @@ __global__ void kClosestDistsToAtoms_Nonortho(double*D_, // ----------------------------------------------------------------------------- __global__ void kBinDistances_nonOverlap_nonOrtho(const double* xyz1, int N1, const double* xyz2, int N2, - const double* frac, const double* ucell) + const double* frac, const double* ucell, + double maximum2, double one_over_spacing) { int a1 = blockIdx.x * blockDim.x + threadIdx.x; int a2 = blockIdx.y * blockDim.y + threadIdx.y; @@ -796,5 +797,9 @@ __global__ void kBinDistances_nonOverlap_nonOrtho(const double* xyz1, int N1, co double f2z = frac[6]*a2x + frac[7]*a2y + frac[8]*a2z; double dist2 = NonOrtho_dist(f2x, f2y, f2z, f1x ,f1y, f1z, ucell); + if (dist2 <= maximum2) { + double dist = sqrt(dist2); + int histIdx = (int) (dist * one_over_spacing); + } } } From f54d9cfd02ad30daa82b4208a0bd08716ee7bff1 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Thu, 7 Jul 2022 20:26:33 -0400 Subject: [PATCH 07/57] Add atomicAdd to the RDF histogram --- src/cuda_kernels/core_kernels.cu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/cuda_kernels/core_kernels.cu b/src/cuda_kernels/core_kernels.cu index ae04045f76..bb5e04dc36 100644 --- a/src/cuda_kernels/core_kernels.cu +++ b/src/cuda_kernels/core_kernels.cu @@ -772,7 +772,8 @@ __global__ void kClosestDistsToAtoms_Nonortho(double*D_, } // ----------------------------------------------------------------------------- -__global__ void kBinDistances_nonOverlap_nonOrtho(const double* xyz1, int N1, const double* xyz2, int N2, +__global__ void kBinDistances_nonOverlap_nonOrtho(int* RDF, + const double* xyz1, int N1, const double* xyz2, int N2, const double* frac, const double* ucell, double maximum2, double one_over_spacing) { @@ -800,6 +801,7 @@ __global__ void kBinDistances_nonOverlap_nonOrtho(const double* xyz1, int N1, co if (dist2 <= maximum2) { double dist = sqrt(dist2); int histIdx = (int) (dist * one_over_spacing); + atomicAdd( RDF + histIdx, 1 ); } } } From a729aba17bebe419b0e1641c6ba45f5bc7bff5c8 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Fri, 8 Jul 2022 08:01:52 -0400 Subject: [PATCH 08/57] Add kernel launch. Ensure device histogram memory is zeroed --- src/cuda_kernels/core_kernels.cuh | 3 +++ src/cuda_kernels/kernel_rdf.cu | 18 +++++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/src/cuda_kernels/core_kernels.cuh b/src/cuda_kernels/core_kernels.cuh index 3563c926d3..2a0bdc127f 100644 --- a/src/cuda_kernels/core_kernels.cuh +++ b/src/cuda_kernels/core_kernels.cuh @@ -13,4 +13,7 @@ __global__ void kClosestDistsToAtoms_Ortho(double*,const double*,const double*,d // Non-orthorhombic imaging __global__ void kClosestDistsToPt_Nonortho(double*,const double*,const double*,double,const double*,const double*,int,int,int); __global__ void kClosestDistsToAtoms_Nonortho(double*,const double*,const double*,double,const double*,const double*,int,int,int,int); +// RDF nonortho imaging +__global__ void kBinDistances_nonOverlap_nonOrtho(int*, const double*, int, const double*, int, + const double*, const double*, double, double); #endif diff --git a/src/cuda_kernels/kernel_rdf.cu b/src/cuda_kernels/kernel_rdf.cu index 9600e3e50a..b022fbdf78 100644 --- a/src/cuda_kernels/kernel_rdf.cu +++ b/src/cuda_kernels/kernel_rdf.cu @@ -1,4 +1,5 @@ #include "kernel_rdf.cuh" +#include "core_kernels.cuh" #include "../CpptrajStdio.h" #if defined(__HIP_PLATFORM_HCC__) #include @@ -16,12 +17,16 @@ static inline int calc_nblocks(int ntotal, int nthreadsPerBlock) } /** Calculate distances between pairs of atoms and bin them into a 1D histogram. */ -void Cpptraj_GPU_RDF(unsigned long* bins, +int Cpptraj_GPU_RDF(unsigned long* bins, int nbins, double maximum2, double one_over_spacing, const double* xyz1, int N1, const double* xyz2, int N2, ImageOption::Type imageType, const double* box, const double* ucell, const double* recip) { + int* device_rdf; + cudaMalloc(((void**)(&device_rdf)), nbins * sizeof(int)); + cudaMemset( &device_rdf, 0, nbins*sizeof(int) ); + double* device_xyz1; cudaMalloc(((void**)(&device_xyz1)), N1 * 3 * sizeof(double)); @@ -46,4 +51,15 @@ void Cpptraj_GPU_RDF(unsigned long* bins, mprintf("#Atoms = %i, %i; Threads per block = %i, %i; #Blocks = %i, %i\n", N1, N2, threadsPerBlock.x, threadsPerBlock.y, numBlocks.x, numBlocks.y); + // Launch kernel + switch (imageType) { + case ImageOption::NONORTHO: + kBinDistances_nonOverlap_nonOrtho<<>>( + device_rdf, device_xyz1, N1, device_xyz2, N2, recipDev, ucellDev, maximum2, one_over_spacing); + break; + default: + mprinterr("Internal Error: kernel_rdf: Unhandled image type.\n"); + return 1; + } + return 0; } From 75c39135c974872b027035bed8780bb7a139ff49 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Fri, 8 Jul 2022 08:10:03 -0400 Subject: [PATCH 09/57] Fix fxn definition and call --- src/Action_Radial.cpp | 2 +- src/cuda_kernels/kernel_rdf.cuh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Action_Radial.cpp b/src/Action_Radial.cpp index 4f4ad7dc3a..9fc1e99082 100644 --- a/src/Action_Radial.cpp +++ b/src/Action_Radial.cpp @@ -480,7 +480,7 @@ Action::RetType Action_Radial::DoAction(int frameNum, ActionFrame& frm) { // Copy atoms FIXME need to do overlapping and non-overlapping case std::vector outerxyz = mask_to_xyz(OuterMask_, frm.Frm()); std::vector innerxyz = mask_to_xyz(InnerMask_, frm.Frm()); - Cpptraj_GPU_RDF( &RDF_[0], + Cpptraj_GPU_RDF( &RDF_[0], RDF_.size(), maximum2_, one_over_spacing_, &outerxyz[0], OuterMask_.Nselected(), &innerxyz[0], InnerMask_.Nselected(), imageOpt_.ImagingType(), diff --git a/src/cuda_kernels/kernel_rdf.cuh b/src/cuda_kernels/kernel_rdf.cuh index c1c98ae71b..8f70904057 100644 --- a/src/cuda_kernels/kernel_rdf.cuh +++ b/src/cuda_kernels/kernel_rdf.cuh @@ -1,6 +1,6 @@ #ifndef INC_KERNEL_RDF_CUH #define INC_KERNEL_RDF_CUH #include "../ImageOption.h" -void Cpptraj_GPU_RDF(unsigned long*, const double*, int, const double*, int, +int Cpptraj_GPU_RDF(unsigned long*, int, double, double, const double*, int, const double*, int, ImageOption::Type, const double*, const double*, const double*); #endif From ea3c65810395f914c54051347a68e6c7bd715c61 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Fri, 8 Jul 2022 08:55:45 -0400 Subject: [PATCH 10/57] Ensure results are copied back. Make sure coords are uploaded to device. --- src/cuda_kernels/core_kernels.cu | 5 ++++ src/cuda_kernels/kernel_rdf.cu | 22 +++++++++++++++++ test/Test_Radial/watO-protein.agr.save | 28 ++++++++++++++++++++++ test/Test_Radial/watO-protein.raw.agr.save | 28 ++++++++++++++++++++++ 4 files changed, 83 insertions(+) create mode 100644 test/Test_Radial/watO-protein.agr.save create mode 100644 test/Test_Radial/watO-protein.raw.agr.save diff --git a/src/cuda_kernels/core_kernels.cu b/src/cuda_kernels/core_kernels.cu index bb5e04dc36..f6eb8ffe5c 100644 --- a/src/cuda_kernels/core_kernels.cu +++ b/src/cuda_kernels/core_kernels.cu @@ -1,4 +1,5 @@ #include "core_kernels.cuh" +//#incl ude // DEBUG #define BLOCKDIM 512 #define RSIZE 512 @@ -801,6 +802,10 @@ __global__ void kBinDistances_nonOverlap_nonOrtho(int* RDF, if (dist2 <= maximum2) { double dist = sqrt(dist2); int histIdx = (int) (dist * one_over_spacing); + //printf("DEBUG: a1= %i a2= %i dist= %f bin=%i\n", a1+1, a2+1, dist, histIdx); + //printf("DEBUG: xyz1= %f %f %f\n", a1x, a1y, a1z); + //printf("DEBUG: a1= %i a2= %i dist= %f bin=%i xyz1=%f %f %f xyz2=%f %f %f\n", a1+1, a2+1, dist, histIdx, + // a1x, a1y, a1z, a2x, a2y, a2z); atomicAdd( RDF + histIdx, 1 ); } } diff --git a/src/cuda_kernels/kernel_rdf.cu b/src/cuda_kernels/kernel_rdf.cu index b022fbdf78..7bbd103043 100644 --- a/src/cuda_kernels/kernel_rdf.cu +++ b/src/cuda_kernels/kernel_rdf.cu @@ -29,9 +29,11 @@ int Cpptraj_GPU_RDF(unsigned long* bins, int nbins, double maximum2, double one_ double* device_xyz1; cudaMalloc(((void**)(&device_xyz1)), N1 * 3 * sizeof(double)); + cudaMemcpy(device_xyz1, xyz1, N1 * 3 * sizeof(double), cudaMemcpyHostToDevice); double* device_xyz2; cudaMalloc(((void**)(&device_xyz2)), N2 * 3 * sizeof(double)); + cudaMemcpy(device_xyz2, xyz2, N2 * 3 * sizeof(double), cudaMemcpyHostToDevice); double *boxDev; double *ucellDev, *recipDev; @@ -61,5 +63,25 @@ int Cpptraj_GPU_RDF(unsigned long* bins, int nbins, double maximum2, double one_ mprinterr("Internal Error: kernel_rdf: Unhandled image type.\n"); return 1; } + + // Copy the result back + int* local_bins = new int[ nbins ]; + cudaMemcpy(local_bins, device_rdf, nbins*sizeof(int), cudaMemcpyDeviceToHost); + for (int ibin = 0; ibin != nbins; ibin++) { + //mprintf("DEBUG:\tBin %i = %i (%i)\n", ibin, local_bins[ibin], device_rdf[ibin]); + //mprintf("DEBUG:\tBin %i = %i\n", ibin, local_bins[ibin]); + bins[ibin] += local_bins[ibin]; + } + delete[] local_bins; + // Free device memory + cudaFree(device_rdf); + cudaFree(device_xyz1); + cudaFree(device_xyz2); + if (imageType == ImageOption::ORTHO) + cudaFree(boxDev); + else if (imageType == ImageOption::NONORTHO) { + cudaFree(ucellDev); + cudaFree(recipDev); + } return 0; } diff --git a/test/Test_Radial/watO-protein.agr.save b/test/Test_Radial/watO-protein.agr.save new file mode 100644 index 0000000000..4cd421db81 --- /dev/null +++ b/test/Test_Radial/watO-protein.agr.save @@ -0,0 +1,28 @@ +@with g0 +@ xaxis label "Distance (Ang)" +@ yaxis label "" +@ legend 0.2, 0.995 +@ legend char size 0.60 +@ s0 legend ":WAT@O => ^1" +@target G0.S0 +@type xy + 0.250 0.000000 + 0.750 0.000000 + 1.250 0.000000 + 1.750 0.049091 + 2.250 0.089329 + 2.750 0.322212 + 3.250 0.414760 + 3.750 0.512819 + 4.250 0.514178 + 4.750 0.507471 + 5.250 0.597354 + 5.750 0.578439 + 6.250 0.629614 + 6.750 0.672183 + 7.250 0.697020 + 7.750 0.704304 + 8.250 0.779077 + 8.750 0.807241 + 9.250 0.825183 + 9.750 0.839616 diff --git a/test/Test_Radial/watO-protein.raw.agr.save b/test/Test_Radial/watO-protein.raw.agr.save new file mode 100644 index 0000000000..8f568345cb --- /dev/null +++ b/test/Test_Radial/watO-protein.raw.agr.save @@ -0,0 +1,28 @@ +@with g0 +@ xaxis label "Distance (Ang)" +@ yaxis label "" +@ legend 0.2, 0.995 +@ legend char size 0.60 +@ s0 legend "Raw[:WAT@O => ^1]" +@target G0.S0 +@type xy + 0.250 0.000000 + 0.750 0.000000 + 1.250 0.000000 + 1.750 7.000000 + 2.250 21.000000 + 2.750 113.000000 + 3.250 203.000000 + 3.750 334.000000 + 4.250 430.000000 + 4.750 530.000000 + 5.250 762.000000 + 5.750 885.000000 + 6.250 1138.000000 + 6.750 1417.000000 + 7.250 1695.000000 + 7.750 1957.000000 + 8.250 2453.000000 + 8.750 2859.000000 + 9.250 3266.000000 + 9.750 3692.000000 From e00293ffd260c80c4810f0faeb856426fe234e62 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Fri, 8 Jul 2022 09:27:15 -0400 Subject: [PATCH 11/57] Add check that spacing is not greater than the max --- src/Action_Radial.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/Action_Radial.cpp b/src/Action_Radial.cpp index 9fc1e99082..7be8d941c3 100644 --- a/src/Action_Radial.cpp +++ b/src/Action_Radial.cpp @@ -122,6 +122,10 @@ Action::RetType Action_Radial::Init(ArgList& actionArgs, ActionInit& init, int d Help(); return Action::ERR; } + if (spacing_ > maximum) { + mprinterr("Error: Bin spacing %g is larger than the maximum %g\n", spacing_, maximum); + return Action::ERR; + } // Store max^2, distances^2 greater than max^2 do not need to be // binned and therefore do not need a sqrt calc. maximum2_ = maximum * maximum; From 6fcd2830d5af1cd19e6085f3f11ea760f7a0f0a5 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Fri, 8 Jul 2022 10:02:17 -0400 Subject: [PATCH 12/57] Add separate test --- test/Test_Radial/Test.sh | 30 +++++++++++++++++++ test/Test_Radial/watO-protein.agr.save | 34 +++++++++++----------- test/Test_Radial/watO-protein.raw.agr.save | 34 +++++++++++----------- 3 files changed, 64 insertions(+), 34 deletions(-) create mode 100755 test/Test_Radial/Test.sh diff --git a/test/Test_Radial/Test.sh b/test/Test_Radial/Test.sh new file mode 100755 index 0000000000..5e8d2edaf8 --- /dev/null +++ b/test/Test_Radial/Test.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +. ../MasterTest.sh + +CleanFiles radial.in Radial.agr cRadial.agr WatO-Trp4.agr WatO-Trp4.raw.agr \ + WatO-Trp4.byres.agr WatO-Trp.agr WatO-Trp.volume.agr \ + WatO-Glu5CD.agr noimage.WatO-Glu5CD.agr point.dat \ + point?.agr wat.origin.agr \ + watO-protein.agr watO-protein.raw.agr + +TESTNAME='Radial tests' +Requires netcdf maxthreads 10 + +INPUT="-i radial.in" + +UNITNAME='Radial test, non-orthogonal imaging' +cat > radial.in < Date: Fri, 8 Jul 2022 10:08:22 -0400 Subject: [PATCH 13/57] Enable some debug info. Do kernel error checking --- src/cuda_kernels/core_kernels.cu | 6 +++--- src/cuda_kernels/kernel_rdf.cu | 6 ++++++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/cuda_kernels/core_kernels.cu b/src/cuda_kernels/core_kernels.cu index f6eb8ffe5c..7b76f97617 100644 --- a/src/cuda_kernels/core_kernels.cu +++ b/src/cuda_kernels/core_kernels.cu @@ -1,5 +1,5 @@ #include "core_kernels.cuh" -//#incl ude // DEBUG +#include // DEBUG #define BLOCKDIM 512 #define RSIZE 512 @@ -802,8 +802,8 @@ __global__ void kBinDistances_nonOverlap_nonOrtho(int* RDF, if (dist2 <= maximum2) { double dist = sqrt(dist2); int histIdx = (int) (dist * one_over_spacing); - //printf("DEBUG: a1= %i a2= %i dist= %f bin=%i\n", a1+1, a2+1, dist, histIdx); - //printf("DEBUG: xyz1= %f %f %f\n", a1x, a1y, a1z); + printf("DEBUG: a1= %i a2= %i dist= %f bin=%i\n", a1+1, a2+1, dist, histIdx); + printf("DEBUG: xyz1= %f %f %f\n", a1x, a1y, a1z); //printf("DEBUG: a1= %i a2= %i dist= %f bin=%i xyz1=%f %f %f xyz2=%f %f %f\n", a1+1, a2+1, dist, histIdx, // a1x, a1y, a1z, a2x, a2y, a2z); atomicAdd( RDF + histIdx, 1 ); diff --git a/src/cuda_kernels/kernel_rdf.cu b/src/cuda_kernels/kernel_rdf.cu index 7bbd103043..c7b5d403ce 100644 --- a/src/cuda_kernels/kernel_rdf.cu +++ b/src/cuda_kernels/kernel_rdf.cu @@ -63,6 +63,12 @@ int Cpptraj_GPU_RDF(unsigned long* bins, int nbins, double maximum2, double one_ mprinterr("Internal Error: kernel_rdf: Unhandled image type.\n"); return 1; } + // Error check + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + mprinterr("CUDA Error: %s\n", cudaGetErrorString(err)); + return 1; + } // Copy the result back int* local_bins = new int[ nbins ]; From d560184ea78ebbc42658f573a2ee3877ed421403 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Fri, 8 Jul 2022 10:18:43 -0400 Subject: [PATCH 14/57] Report max threads per block --- src/Cpptraj.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Cpptraj.cpp b/src/Cpptraj.cpp index 5d94b8dacb..c5ada3e06f 100644 --- a/src/Cpptraj.cpp +++ b/src/Cpptraj.cpp @@ -122,6 +122,7 @@ void Cpptraj::Intro() const { mprintf("| CUDA device: %s\n", deviceProp.name); mprintf("| Available GPU memory: %s\n", ByteString(deviceProp.totalGlobalMem, BYTE_DECIMAL).c_str()); + mprintf("| Max threads/block: %i\n", deviceProp.maxThreadsPerBlock); } } # endif From 50dd4eb5ec44d6da7d6300a2bfa5204434484824 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Fri, 8 Jul 2022 10:36:30 -0400 Subject: [PATCH 15/57] Hide some debug info. Make cuda errors non-fatal while debugging --- src/cuda_kernels/core_kernels.cu | 6 +++--- src/cuda_kernels/kernel_rdf.cu | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/cuda_kernels/core_kernels.cu b/src/cuda_kernels/core_kernels.cu index 7b76f97617..69e293840d 100644 --- a/src/cuda_kernels/core_kernels.cu +++ b/src/cuda_kernels/core_kernels.cu @@ -1,5 +1,5 @@ #include "core_kernels.cuh" -#include // DEBUG +//#include // DEBUG #define BLOCKDIM 512 #define RSIZE 512 @@ -802,8 +802,8 @@ __global__ void kBinDistances_nonOverlap_nonOrtho(int* RDF, if (dist2 <= maximum2) { double dist = sqrt(dist2); int histIdx = (int) (dist * one_over_spacing); - printf("DEBUG: a1= %i a2= %i dist= %f bin=%i\n", a1+1, a2+1, dist, histIdx); - printf("DEBUG: xyz1= %f %f %f\n", a1x, a1y, a1z); + //printf("DEBUG: a1= %i a2= %i dist= %f bin=%i\n", a1+1, a2+1, dist, histIdx); + //printf("DEBUG: xyz1= %f %f %f\n", a1x, a1y, a1z); //printf("DEBUG: a1= %i a2= %i dist= %f bin=%i xyz1=%f %f %f xyz2=%f %f %f\n", a1+1, a2+1, dist, histIdx, // a1x, a1y, a1z, a2x, a2y, a2z); atomicAdd( RDF + histIdx, 1 ); diff --git a/src/cuda_kernels/kernel_rdf.cu b/src/cuda_kernels/kernel_rdf.cu index c7b5d403ce..7e94eb42c4 100644 --- a/src/cuda_kernels/kernel_rdf.cu +++ b/src/cuda_kernels/kernel_rdf.cu @@ -66,8 +66,9 @@ int Cpptraj_GPU_RDF(unsigned long* bins, int nbins, double maximum2, double one_ // Error check cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { + mprintf("CUDA Error: %s\n", cudaGetErrorString(err)); mprinterr("CUDA Error: %s\n", cudaGetErrorString(err)); - return 1; + //return 1; } // Copy the result back From 84aa280776358c88fb13335d50e615ed60bd605d Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Fri, 8 Jul 2022 11:08:04 -0400 Subject: [PATCH 16/57] Better labeling --- src/cuda_kernels/kernel_rdf.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cuda_kernels/kernel_rdf.cu b/src/cuda_kernels/kernel_rdf.cu index 7e94eb42c4..86b99569f7 100644 --- a/src/cuda_kernels/kernel_rdf.cu +++ b/src/cuda_kernels/kernel_rdf.cu @@ -66,8 +66,8 @@ int Cpptraj_GPU_RDF(unsigned long* bins, int nbins, double maximum2, double one_ // Error check cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { - mprintf("CUDA Error: %s\n", cudaGetErrorString(err)); - mprinterr("CUDA Error: %s\n", cudaGetErrorString(err)); + mprintf("Warning: CUDA Error: %s\n", cudaGetErrorString(err)); + mprinterr("Error: CUDA Error: %s\n", cudaGetErrorString(err)); //return 1; } From 957af26ff510e682473b3826733c29bcfb7a790c Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Fri, 8 Jul 2022 11:23:33 -0400 Subject: [PATCH 17/57] Improve error checking for memory allocation and setting. Correct invalid arg to cudaMemset --- src/cuda_kernels/kernel_rdf.cu | 42 +++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/src/cuda_kernels/kernel_rdf.cu b/src/cuda_kernels/kernel_rdf.cu index 86b99569f7..112476e657 100644 --- a/src/cuda_kernels/kernel_rdf.cu +++ b/src/cuda_kernels/kernel_rdf.cu @@ -16,6 +16,17 @@ static inline int calc_nblocks(int ntotal, int nthreadsPerBlock) return nblocks; } +/** Report any cuda errors. */ +static inline int Cuda_check(cudaError_t err, const char* desc) { + //cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + mprintf("Warning: CUDA Error %s: %s\n", desc, cudaGetErrorString(err)); + mprinterr("Error: CUDA Error %s: %s\n", desc, cudaGetErrorString(err)); + //return 1; + } + return 0; +} + /** Calculate distances between pairs of atoms and bin them into a 1D histogram. */ int Cpptraj_GPU_RDF(unsigned long* bins, int nbins, double maximum2, double one_over_spacing, const double* xyz1, int N1, @@ -24,27 +35,27 @@ int Cpptraj_GPU_RDF(unsigned long* bins, int nbins, double maximum2, double one_ const double* box, const double* ucell, const double* recip) { int* device_rdf; - cudaMalloc(((void**)(&device_rdf)), nbins * sizeof(int)); - cudaMemset( &device_rdf, 0, nbins*sizeof(int) ); + Cuda_check(cudaMalloc(((void**)(&device_rdf)), nbins * sizeof(int)), "Allocating rdf bins"); + Cuda_check(cudaMemset( device_rdf, 0, nbins*sizeof(int) ), "Setting rdf bins to 0"); double* device_xyz1; - cudaMalloc(((void**)(&device_xyz1)), N1 * 3 * sizeof(double)); - cudaMemcpy(device_xyz1, xyz1, N1 * 3 * sizeof(double), cudaMemcpyHostToDevice); + Cuda_check(cudaMalloc(((void**)(&device_xyz1)), N1 * 3 * sizeof(double)), "Allocating xyz1"); + Cuda_check(cudaMemcpy(device_xyz1, xyz1, N1 * 3 * sizeof(double), cudaMemcpyHostToDevice), "Copying xyz1"); double* device_xyz2; - cudaMalloc(((void**)(&device_xyz2)), N2 * 3 * sizeof(double)); - cudaMemcpy(device_xyz2, xyz2, N2 * 3 * sizeof(double), cudaMemcpyHostToDevice); + Cuda_check(cudaMalloc(((void**)(&device_xyz2)), N2 * 3 * sizeof(double)), "Allocating xyz2"); + Cuda_check(cudaMemcpy(device_xyz2, xyz2, N2 * 3 * sizeof(double), cudaMemcpyHostToDevice), "Copying xyz2"); double *boxDev; double *ucellDev, *recipDev; if (imageType == ImageOption::ORTHO) { - cudaMalloc(((void**)(&boxDev)), 3 * sizeof(double)); - cudaMemcpy(boxDev,box, 3 * sizeof(double), cudaMemcpyHostToDevice); + Cuda_check(cudaMalloc(((void**)(&boxDev)), 3 * sizeof(double)), "Allocating box"); + Cuda_check(cudaMemcpy(boxDev,box, 3 * sizeof(double), cudaMemcpyHostToDevice), "Copying box"); } else if (imageType == ImageOption::NONORTHO) { - cudaMalloc(((void**)(&ucellDev)), 9 * sizeof(double)); - cudaMalloc(((void**)(&recipDev)), 9 * sizeof(double)); - cudaMemcpy(ucellDev,ucell, 9 * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(recipDev,recip, 9 * sizeof(double), cudaMemcpyHostToDevice); + Cuda_check(cudaMalloc(((void**)(&ucellDev)), 9 * sizeof(double)), "Allocating ucell"); + Cuda_check(cudaMalloc(((void**)(&recipDev)), 9 * sizeof(double)), "Allocating frac"); + Cuda_check(cudaMemcpy(ucellDev,ucell, 9 * sizeof(double), cudaMemcpyHostToDevice), "Copying ucell"); + Cuda_check(cudaMemcpy(recipDev,recip, 9 * sizeof(double), cudaMemcpyHostToDevice), "Copying frac"); } // Determine number of blocks @@ -64,12 +75,7 @@ int Cpptraj_GPU_RDF(unsigned long* bins, int nbins, double maximum2, double one_ return 1; } // Error check - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - mprintf("Warning: CUDA Error: %s\n", cudaGetErrorString(err)); - mprinterr("Error: CUDA Error: %s\n", cudaGetErrorString(err)); - //return 1; - } + Cuda_check(cudaGetLastError(), "kernel launch"); // Copy the result back int* local_bins = new int[ nbins ]; From 99ec3c48b8fa2ad014fae9babc98b191a411d521 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Fri, 8 Jul 2022 12:04:26 -0400 Subject: [PATCH 18/57] Report max threads in each dim --- src/Cpptraj.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/Cpptraj.cpp b/src/Cpptraj.cpp index c5ada3e06f..65448b67c5 100644 --- a/src/Cpptraj.cpp +++ b/src/Cpptraj.cpp @@ -123,6 +123,10 @@ void Cpptraj::Intro() const { mprintf("| Available GPU memory: %s\n", ByteString(deviceProp.totalGlobalMem, BYTE_DECIMAL).c_str()); mprintf("| Max threads/block: %i\n", deviceProp.maxThreadsPerBlock); + mprintf("| Max threads dim: %i %i %i\n", + deviceProp.maxThreadsDim[0], + deviceProp.maxThreadsDim[1], + deviceProp.maxThreadsDim[2]); } } # endif From 1c34100075364ba09223d8d8689527d49d765c0e Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Fri, 8 Jul 2022 12:06:28 -0400 Subject: [PATCH 19/57] Report max threads per multiprocessor --- src/Cpptraj.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Cpptraj.cpp b/src/Cpptraj.cpp index 65448b67c5..cf8590dc45 100644 --- a/src/Cpptraj.cpp +++ b/src/Cpptraj.cpp @@ -127,6 +127,7 @@ void Cpptraj::Intro() const { deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]); + mprintf("| Max threads/multiprocessor: %i\n", deviceProp.maxThreadsPerMultiProcessor); } } # endif From e69eb394b14e055d39c59cedeb7605ce72143a9f Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Fri, 8 Jul 2022 12:12:06 -0400 Subject: [PATCH 20/57] Report major compute capability. --- src/Cpptraj.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Cpptraj.cpp b/src/Cpptraj.cpp index cf8590dc45..f0c388d21b 100644 --- a/src/Cpptraj.cpp +++ b/src/Cpptraj.cpp @@ -122,6 +122,7 @@ void Cpptraj::Intro() const { mprintf("| CUDA device: %s\n", deviceProp.name); mprintf("| Available GPU memory: %s\n", ByteString(deviceProp.totalGlobalMem, BYTE_DECIMAL).c_str()); + mprintf("| Major compute capability: %i\n", deviceProp.major); mprintf("| Max threads/block: %i\n", deviceProp.maxThreadsPerBlock); mprintf("| Max threads dim: %i %i %i\n", deviceProp.maxThreadsDim[0], From 77ed890044b3e6cae0fbf627e0dec70f313eae30 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Fri, 8 Jul 2022 12:22:52 -0400 Subject: [PATCH 21/57] Add Gpu namespace to store Gpu related info --- src/Cpptraj.cpp | 2 ++ src/Gpu.cpp | 10 ++++++++++ src/Gpu.h | 8 ++++++++ src/cpptrajdepend | 3 ++- src/cpptrajfiles | 1 + 5 files changed, 23 insertions(+), 1 deletion(-) create mode 100644 src/Gpu.cpp create mode 100644 src/Gpu.h diff --git a/src/Cpptraj.cpp b/src/Cpptraj.cpp index f0c388d21b..830f696e6a 100644 --- a/src/Cpptraj.cpp +++ b/src/Cpptraj.cpp @@ -8,6 +8,7 @@ #include "Cpptraj.h" #include "CpptrajStdio.h" #include "Command.h" +#include "Gpu.h" #include "ReadLine.h" #include "Version.h" #include "ParmFile.h" // ProcessMask @@ -123,6 +124,7 @@ void Cpptraj::Intro() const { mprintf("| Available GPU memory: %s\n", ByteString(deviceProp.totalGlobalMem, BYTE_DECIMAL).c_str()); mprintf("| Major compute capability: %i\n", deviceProp.major); + Gpu::SetComputeVersion( deviceProp.major ); mprintf("| Max threads/block: %i\n", deviceProp.maxThreadsPerBlock); mprintf("| Max threads dim: %i %i %i\n", deviceProp.maxThreadsDim[0], diff --git a/src/Gpu.cpp b/src/Gpu.cpp new file mode 100644 index 0000000000..82aafda9bd --- /dev/null +++ b/src/Gpu.cpp @@ -0,0 +1,10 @@ +#ifdef CUDA +#include "Gpu.h" + +static unsigned int cpptraj_gpu_major_ = 0; + +void Gpu::SetComputeVersion(int major) { + if (major > -1) + cpptraj_gpu_major_ = major; +} +#endif /* CUDA */ diff --git a/src/Gpu.h b/src/Gpu.h new file mode 100644 index 0000000000..299b9a84db --- /dev/null +++ b/src/Gpu.h @@ -0,0 +1,8 @@ +#ifndef INC_CPPTRAJ_GPU_H +#define INC_CPPTRAJ_GPU_H +namespace Gpu { +#ifdef CUDA +void SetComputeVersion(int); +#endif +} +#endif diff --git a/src/cpptrajdepend b/src/cpptrajdepend index 4bf312017a..08bf715d8c 100644 --- a/src/cpptrajdepend +++ b/src/cpptrajdepend @@ -188,7 +188,7 @@ ControlBlock_For.o : ControlBlock_For.cpp Action.h ActionList.h ActionState.h An CoordinateInfo.o : CoordinateInfo.cpp Box.h CoordinateInfo.h CpptrajStdio.h Matrix_3x3.h Parallel.h ReplicaDimArray.h Vec3.h Corr.o : Corr.cpp ArrayIterator.h ComplexArray.h Corr.h PubFFT.h Cph.o : Cph.cpp Cph.h NameType.h -Cpptraj.o : Cpptraj.cpp Action.h ActionList.h ActionState.h Analysis.h AnalysisList.h AnalysisState.h ArgList.h AssociatedData.h Atom.h AtomMask.h AtomType.h BaseIOtype.h Box.h Cmd.h CmdInput.h CmdList.h Command.h Constants.h CoordinateInfo.h Cpptraj.h CpptrajFile.h CpptrajState.h CpptrajStdio.h DataFile.h DataFileList.h DataSet.h DataSetList.h DataSet_Coords.h DataSet_Coords_REF.h Dimension.h DispatchObject.h EnsembleIn.h EnsembleOutList.h FileIO.h FileName.h FileTypes.h Frame.h FramePtrArray.h HipDefinitions.h InputTrajCommon.h MaskToken.h Matrix_3x3.h MetaData.h Molecule.h NameType.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h ParmFile.h Range.h ReadLine.h ReferenceFrame.h ReplicaDimArray.h ReplicaInfo.h Residue.h Segment.h StringRoutines.h SymbolExporting.h TextFormat.h Timer.h TopInfo.h Topology.h TrajFrameCounter.h TrajectoryFile.h Trajin.h TrajinList.h TrajoutList.h TypeNameHolder.h Unit.h Vec3.h Version.h +Cpptraj.o : Cpptraj.cpp Action.h ActionList.h ActionState.h Analysis.h AnalysisList.h AnalysisState.h ArgList.h AssociatedData.h Atom.h AtomMask.h AtomType.h BaseIOtype.h Box.h Cmd.h CmdInput.h CmdList.h Command.h Constants.h CoordinateInfo.h Cpptraj.h CpptrajFile.h CpptrajState.h CpptrajStdio.h DataFile.h DataFileList.h DataSet.h DataSetList.h DataSet_Coords.h DataSet_Coords_REF.h Dimension.h DispatchObject.h EnsembleIn.h EnsembleOutList.h FileIO.h FileName.h FileTypes.h Frame.h FramePtrArray.h Gpu.h HipDefinitions.h InputTrajCommon.h MaskToken.h Matrix_3x3.h MetaData.h Molecule.h NameType.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h ParmFile.h Range.h ReadLine.h ReferenceFrame.h ReplicaDimArray.h ReplicaInfo.h Residue.h Segment.h StringRoutines.h SymbolExporting.h TextFormat.h Timer.h TopInfo.h Topology.h TrajFrameCounter.h TrajectoryFile.h Trajin.h TrajinList.h TrajoutList.h TypeNameHolder.h Unit.h Vec3.h Version.h CpptrajFile.o : CpptrajFile.cpp CpptrajFile.h CpptrajStdio.h FileIO.h FileIO_Bzip2.h FileIO_Gzip.h FileIO_Mpi.h FileIO_MpiShared.h FileIO_Std.h FileName.h Parallel.h StringRoutines.h CpptrajState.o : CpptrajState.cpp Action.h ActionList.h ActionState.h Action_CreateCrd.h Analysis.h AnalysisList.h AnalysisState.h ArgList.h AssociatedData.h Atom.h AtomMask.h AtomType.h BaseIOtype.h Box.h CompactFrameArray.h Constants.h CoordinateInfo.h CpptrajFile.h CpptrajState.h CpptrajStdio.h DataFile.h DataFileList.h DataSet.h DataSetList.h DataSet_Coords.h DataSet_Coords_CRD.h DataSet_Coords_REF.h DataSet_Coords_TRJ.h DataSet_Topology.h Dimension.h DispatchObject.h EnsembleIn.h EnsembleNavigator.h EnsembleOutList.h FileIO.h FileName.h FileTypes.h Frame.h FrameArray.h FramePtrArray.h InputTrajCommon.h MaskToken.h Matrix_3x3.h MetaData.h Molecule.h NameType.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h ProgressBar.h Random.h Range.h ReferenceFrame.h ReplicaDimArray.h ReplicaInfo.h Residue.h Segment.h SymbolExporting.h TextFormat.h Timer.h Topology.h TrajFrameCounter.h TrajFrameIndex.h Trajin.h TrajinList.h TrajoutList.h TypeNameHolder.h Unit.h Vec3.h CpptrajStdio.o : CpptrajStdio.cpp Parallel.h @@ -338,6 +338,7 @@ ForLoop_overSets.o : ForLoop_overSets.cpp Action.h ActionList.h ActionState.h An Frame.o : Frame.cpp Atom.h AtomMask.h Box.h Constants.h CoordinateInfo.h CpptrajStdio.h Frame.h MaskToken.h Matrix_3x3.h Molecule.h NameType.h Parallel.h ReplicaDimArray.h Residue.h Segment.h SymbolExporting.h Unit.h Vec3.h GIST_PME.o : GIST_PME.cpp Atom.h AtomMask.h AtomType.h Box.h Constants.h CoordinateInfo.h CpptrajStdio.h EnergyKernel_Adjust.h EnergyKernel_Nonbond.h Ewald.h Ewald_ParticleMesh.h ExclusionArray.h FileName.h Frame.h GIST_PME.h MaskToken.h Matrix_3x3.h Molecule.h NameType.h PairList.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h Range.h ReplicaDimArray.h Residue.h Segment.h SplineFxnTable.h SymbolExporting.h Timer.h Topology.h TypeNameHolder.h Unit.h Vec3.h helpme_standalone.h GistEntropyUtils.o : GistEntropyUtils.cpp GistEntropyUtils.h Vec3.h +Gpu.o : Gpu.cpp Gpu.h GridAction.o : GridAction.cpp ArgList.h ArrayIterator.h AssociatedData.h Atom.h AtomMask.h AtomType.h Box.h Constants.h CoordinateInfo.h CpptrajFile.h CpptrajStdio.h DataSet.h DataSetList.h DataSet_3D.h DataSet_Coords.h DataSet_Coords_REF.h DataSet_GridFlt.h Dimension.h FileIO.h FileName.h Frame.h Grid.h GridAction.h GridBin.h MaskToken.h Matrix_3x3.h MetaData.h Molecule.h NameType.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h Range.h ReferenceFrame.h ReplicaDimArray.h Residue.h Segment.h StringRoutines.h SymbolExporting.h TextFormat.h Timer.h Topology.h TypeNameHolder.h Unit.h Vec3.h GridBin.o : GridBin.cpp Box.h CpptrajStdio.h GridBin.h Matrix_3x3.h Parallel.h Vec3.h HistBin.o : HistBin.cpp Constants.h CpptrajStdio.h Dimension.h HistBin.h diff --git a/src/cpptrajfiles b/src/cpptrajfiles index 20c15abef5..2ce3ad56b7 100644 --- a/src/cpptrajfiles +++ b/src/cpptrajfiles @@ -309,6 +309,7 @@ COMMON_SOURCES= \ Frame.cpp \ GistEntropyUtils.cpp \ GIST_PME.cpp \ + Gpu.cpp \ GridAction.cpp \ GridBin.cpp \ HistBin.cpp \ From 87ec305c49041b078685baf7f18625afee9c831f Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Fri, 8 Jul 2022 12:32:38 -0400 Subject: [PATCH 22/57] Try to set block size based on compute capability --- src/Cpptraj.cpp | 2 +- src/Gpu.cpp | 11 ++++++++--- src/Gpu.h | 7 ++++--- src/cuda_kernels/cudadepend | 2 +- src/cuda_kernels/kernel_rdf.cu | 5 +++-- 5 files changed, 17 insertions(+), 10 deletions(-) diff --git a/src/Cpptraj.cpp b/src/Cpptraj.cpp index 830f696e6a..961c28c0e3 100644 --- a/src/Cpptraj.cpp +++ b/src/Cpptraj.cpp @@ -124,7 +124,7 @@ void Cpptraj::Intro() const { mprintf("| Available GPU memory: %s\n", ByteString(deviceProp.totalGlobalMem, BYTE_DECIMAL).c_str()); mprintf("| Major compute capability: %i\n", deviceProp.major); - Gpu::SetComputeVersion( deviceProp.major ); + CpptrajGpu::SetComputeVersion( deviceProp.major ); mprintf("| Max threads/block: %i\n", deviceProp.maxThreadsPerBlock); mprintf("| Max threads dim: %i %i %i\n", deviceProp.maxThreadsDim[0], diff --git a/src/Gpu.cpp b/src/Gpu.cpp index 82aafda9bd..f76bce9e10 100644 --- a/src/Gpu.cpp +++ b/src/Gpu.cpp @@ -1,10 +1,15 @@ -#ifdef CUDA #include "Gpu.h" static unsigned int cpptraj_gpu_major_ = 0; -void Gpu::SetComputeVersion(int major) { +void CpptrajGpu::SetComputeVersion(int major) { if (major > -1) cpptraj_gpu_major_ = major; } -#endif /* CUDA */ + +unsigned int CpptrajGpu::MaxBlockDim_2D() { + if (cpptraj_gpu_major_ < 4) + return 16; + else + return 32; +} diff --git a/src/Gpu.h b/src/Gpu.h index 299b9a84db..30e15212c1 100644 --- a/src/Gpu.h +++ b/src/Gpu.h @@ -1,8 +1,9 @@ #ifndef INC_CPPTRAJ_GPU_H #define INC_CPPTRAJ_GPU_H -namespace Gpu { -#ifdef CUDA +namespace CpptrajGpu { +/// Set the major CUDA compute version number void SetComputeVersion(int); -#endif +/// \return Max block dimensions if using 2D blocks +unsigned int MaxBlockDim_2D(); } #endif diff --git a/src/cuda_kernels/cudadepend b/src/cuda_kernels/cudadepend index 7866820ec6..9c8586d128 100644 --- a/src/cuda_kernels/cudadepend +++ b/src/cuda_kernels/cudadepend @@ -1,5 +1,5 @@ GistCudaCalc.o : GistCudaCalc.cu GistCudaCalc.cuh GistCudaSetup.o : GistCudaSetup.cu ../HipDefinitions.h GistCudaCalc.cuh GistCudaSetup.cuh core_kernels.o : core_kernels.cu core_kernels.cuh -kernel_rdf.o : kernel_rdf.cu ../CpptrajStdio.h ../HipDefinitions.h ../ImageOption.h kernel_rdf.cuh +kernel_rdf.o : kernel_rdf.cu ../CpptrajStdio.h ../Gpu.h ../HipDefinitions.h ../ImageOption.h core_kernels.cuh kernel_rdf.cuh kernel_wrappers.o : kernel_wrappers.cu ../HipDefinitions.h ../ImageOption.h core_kernels.cuh kernel_wrappers.cuh diff --git a/src/cuda_kernels/kernel_rdf.cu b/src/cuda_kernels/kernel_rdf.cu index 112476e657..dce568a270 100644 --- a/src/cuda_kernels/kernel_rdf.cu +++ b/src/cuda_kernels/kernel_rdf.cu @@ -1,13 +1,12 @@ #include "kernel_rdf.cuh" #include "core_kernels.cuh" #include "../CpptrajStdio.h" +#include "../Gpu.h" #if defined(__HIP_PLATFORM_HCC__) #include #include "../HipDefinitions.h" #endif -#define BLOCKDIM 32 - static inline int calc_nblocks(int ntotal, int nthreadsPerBlock) { int nblocks = ntotal / nthreadsPerBlock; @@ -59,6 +58,8 @@ int Cpptraj_GPU_RDF(unsigned long* bins, int nbins, double maximum2, double one_ } // Determine number of blocks + unsigned int BLOCKDIM = CpptrajGpu::MaxBlockDim_2D(); + dim3 threadsPerBlock(BLOCKDIM, BLOCKDIM); dim3 numBlocks(calc_nblocks(N1, threadsPerBlock.x), calc_nblocks(N2, threadsPerBlock.y)); mprintf("#Atoms = %i, %i; Threads per block = %i, %i; #Blocks = %i, %i\n", From 04ef6179244ebb1d5ee4987d55312c3c3c166f47 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Fri, 8 Jul 2022 12:41:10 -0400 Subject: [PATCH 23/57] Change recip to frac --- src/cuda_kernels/kernel_rdf.cu | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/cuda_kernels/kernel_rdf.cu b/src/cuda_kernels/kernel_rdf.cu index dce568a270..5d2f6559ec 100644 --- a/src/cuda_kernels/kernel_rdf.cu +++ b/src/cuda_kernels/kernel_rdf.cu @@ -46,15 +46,15 @@ int Cpptraj_GPU_RDF(unsigned long* bins, int nbins, double maximum2, double one_ Cuda_check(cudaMemcpy(device_xyz2, xyz2, N2 * 3 * sizeof(double), cudaMemcpyHostToDevice), "Copying xyz2"); double *boxDev; - double *ucellDev, *recipDev; + double *ucellDev, *fracDev; if (imageType == ImageOption::ORTHO) { Cuda_check(cudaMalloc(((void**)(&boxDev)), 3 * sizeof(double)), "Allocating box"); Cuda_check(cudaMemcpy(boxDev,box, 3 * sizeof(double), cudaMemcpyHostToDevice), "Copying box"); } else if (imageType == ImageOption::NONORTHO) { Cuda_check(cudaMalloc(((void**)(&ucellDev)), 9 * sizeof(double)), "Allocating ucell"); - Cuda_check(cudaMalloc(((void**)(&recipDev)), 9 * sizeof(double)), "Allocating frac"); + Cuda_check(cudaMalloc(((void**)(&fracDev)), 9 * sizeof(double)), "Allocating frac"); Cuda_check(cudaMemcpy(ucellDev,ucell, 9 * sizeof(double), cudaMemcpyHostToDevice), "Copying ucell"); - Cuda_check(cudaMemcpy(recipDev,recip, 9 * sizeof(double), cudaMemcpyHostToDevice), "Copying frac"); + Cuda_check(cudaMemcpy(fracDev,recip, 9 * sizeof(double), cudaMemcpyHostToDevice), "Copying frac"); } // Determine number of blocks @@ -69,7 +69,7 @@ int Cpptraj_GPU_RDF(unsigned long* bins, int nbins, double maximum2, double one_ switch (imageType) { case ImageOption::NONORTHO: kBinDistances_nonOverlap_nonOrtho<<>>( - device_rdf, device_xyz1, N1, device_xyz2, N2, recipDev, ucellDev, maximum2, one_over_spacing); + device_rdf, device_xyz1, N1, device_xyz2, N2, fracDev, ucellDev, maximum2, one_over_spacing); break; default: mprinterr("Internal Error: kernel_rdf: Unhandled image type.\n"); @@ -95,7 +95,7 @@ int Cpptraj_GPU_RDF(unsigned long* bins, int nbins, double maximum2, double one_ cudaFree(boxDev); else if (imageType == ImageOption::NONORTHO) { cudaFree(ucellDev); - cudaFree(recipDev); + cudaFree(fracDev); } return 0; } From c5ec2e532a8d251170e212c4a0c8f7b36533c398 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Fri, 8 Jul 2022 14:32:23 -0400 Subject: [PATCH 24/57] Add rdf with ortho imaging --- src/cuda_kernels/core_kernels.cu | 71 ++++++++++++++++++++++++++++--- src/cuda_kernels/core_kernels.cuh | 3 ++ 2 files changed, 69 insertions(+), 5 deletions(-) diff --git a/src/cuda_kernels/core_kernels.cu b/src/cuda_kernels/core_kernels.cu index 69e293840d..2bd794fa7e 100644 --- a/src/cuda_kernels/core_kernels.cu +++ b/src/cuda_kernels/core_kernels.cu @@ -4,10 +4,37 @@ #define RSIZE 512 // ----------------------------------------------------------------------------- -/** \return Shortest imaged distance between given coordinates in fractional space. +/** \return Shortest imaged distance^2 between given coordinates in an orthorhombic box. */ +__device__ double ortho_dist2(double a1x, double a1y, double a1z, + double a2x, double a2y, double a2z, + const double* box) +{ + double x = a1x - a2x; + double y = a1y - a2y; + double z = a1z - a2z; + // Get rid of sign info + if (x<0) x=-x; + if (y<0) y=-y; + if (z<0) z=-z; + // Get rid of multiples of box lengths + while (x > box[0]) x = x - box[0]; + while (y > box[1]) y = y - box[1]; + while (z > box[2]) z = z - box[2]; + // Find shortest distance in periodic reference + double D2 = box[0] - x; + if (D2 < x) x = D2; + D2 = box[1] - y; + if (D2 < y) y = D2; + D2 = box[2] - z; + if (D2 < z) z = D2; + + return (x*x + y*y + z*z); +} + +/** \return Shortest imaged distance^2 between given coordinates in fractional space. * NOTE: This function is complicated hence we will put into a __device__ only function. */ -__device__ double NonOrtho_dist(double a0, double a1, double a2, +__device__ double NonOrtho_dist2(double a0, double a1, double a2, double b0, double b1, double b2, const double *ucell) { @@ -652,7 +679,7 @@ __global__ void kClosestDistsToPt_Nonortho(double* D_, const double* maskCenter, double x = recip[0]*SolventMols_[sIndex + offset + 0] + recip[1]*SolventMols_[sIndex + offset + 1] + recip[2]*SolventMols_[sIndex + offset + 2]; double y = recip[3]*SolventMols_[sIndex + offset + 0] + recip[4]*SolventMols_[sIndex + offset + 1] + recip[5]*SolventMols_[sIndex + offset + 2]; double z = recip[6]*SolventMols_[sIndex + offset + 0] + recip[7]*SolventMols_[sIndex + offset + 1] + recip[8]*SolventMols_[sIndex + offset + 2]; - double dist = NonOrtho_dist(x,y,z,a0,a1,a2,ucell); + double dist = NonOrtho_dist2(x,y,z,a0,a1,a2,ucell); // if (mol == 0) // printf("dist = %f\n",dist); @@ -734,7 +761,7 @@ __global__ void kClosestDistsToAtoms_Nonortho(double*D_, double y = recip[3]*Solute_atoms[j + 0] + recip[4]*Solute_atoms[j + 1] + recip[5]*Solute_atoms[j + 2] ; double z = recip[6]*Solute_atoms[j + 0] + recip[7]*Solute_atoms[j + 1] + recip[8]*Solute_atoms[j + 2] ; - dist = NonOrtho_dist(x,y,z,a0,a1,a2,ucell); + dist = NonOrtho_dist2(x,y,z,a0,a1,a2,ucell); //if (mol == 11) // printf("min = %f\n",min_val); min_val = min(min_val,dist); @@ -773,6 +800,40 @@ __global__ void kClosestDistsToAtoms_Nonortho(double*D_, } // ----------------------------------------------------------------------------- +/** Bin distances from two non-overlapping sets of coords. */ +__global__ void kBinDistances_nonOverlap_Ortho(int* RDF, + const double* xyz1, int N1, const double* xyz2, int N2, + const double* box, + double maximum2, double one_over_spacing) +{ + int a1 = blockIdx.x * blockDim.x + threadIdx.x; + int a2 = blockIdx.y * blockDim.y + threadIdx.y; + + if (a1 < N1 && a2 < N2) { + int idx1 = a1 * 3; + double a1x = xyz1[idx1 ]; + double a1y = xyz1[idx1+1]; + double a1z = xyz1[idx1+2]; + + int idx2 = a2 * 3; + double a2x = xyz2[idx2 ]; + double a2y = xyz2[idx2+1]; + double a2z = xyz2[idx2+2]; + + double dist2 = ortho_dist2(a1x, a1y, a1z, a2x, a2y, a2z, box); + if (dist2 <= maximum2) { + double dist = sqrt(dist2); + int histIdx = (int) (dist * one_over_spacing); + //printf("DEBUG: a1= %i a2= %i dist= %f bin=%i\n", a1+1, a2+1, dist, histIdx); + //printf("DEBUG: xyz1= %f %f %f\n", a1x, a1y, a1z); + //printf("DEBUG: a1= %i a2= %i dist= %f bin=%i xyz1=%f %f %f xyz2=%f %f %f\n", a1+1, a2+1, dist, histIdx, + // a1x, a1y, a1z, a2x, a2y, a2z); + atomicAdd( RDF + histIdx, 1 ); + } + } +} + +/** Bin distances from two non-overlapping sets of coords. */ __global__ void kBinDistances_nonOverlap_nonOrtho(int* RDF, const double* xyz1, int N1, const double* xyz2, int N2, const double* frac, const double* ucell, @@ -798,7 +859,7 @@ __global__ void kBinDistances_nonOverlap_nonOrtho(int* RDF, double f2y = frac[3]*a2x + frac[4]*a2y + frac[5]*a2z; double f2z = frac[6]*a2x + frac[7]*a2y + frac[8]*a2z; - double dist2 = NonOrtho_dist(f2x, f2y, f2z, f1x ,f1y, f1z, ucell); + double dist2 = NonOrtho_dist2(f2x, f2y, f2z, f1x ,f1y, f1z, ucell); if (dist2 <= maximum2) { double dist = sqrt(dist2); int histIdx = (int) (dist * one_over_spacing); diff --git a/src/cuda_kernels/core_kernels.cuh b/src/cuda_kernels/core_kernels.cuh index 2a0bdc127f..a9ed0e3b64 100644 --- a/src/cuda_kernels/core_kernels.cuh +++ b/src/cuda_kernels/core_kernels.cuh @@ -13,6 +13,9 @@ __global__ void kClosestDistsToAtoms_Ortho(double*,const double*,const double*,d // Non-orthorhombic imaging __global__ void kClosestDistsToPt_Nonortho(double*,const double*,const double*,double,const double*,const double*,int,int,int); __global__ void kClosestDistsToAtoms_Nonortho(double*,const double*,const double*,double,const double*,const double*,int,int,int,int); +// RDF ortho imaging +__global__ void kBinDistances_nonOverlap_Ortho(int*, const double*, int, const double*, int, + const double*, double, double); // RDF nonortho imaging __global__ void kBinDistances_nonOverlap_nonOrtho(int*, const double*, int, const double*, int, const double*, const double*, double, double); From 8edb99b09c16d271b07fe3b18871814fc2157a00 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Fri, 8 Jul 2022 14:34:45 -0400 Subject: [PATCH 25/57] Add version with no imaging --- src/cuda_kernels/core_kernels.cu | 32 +++++++++++++++++++++++++++++++ src/cuda_kernels/core_kernels.cuh | 3 +++ 2 files changed, 35 insertions(+) diff --git a/src/cuda_kernels/core_kernels.cu b/src/cuda_kernels/core_kernels.cu index 2bd794fa7e..238c254de4 100644 --- a/src/cuda_kernels/core_kernels.cu +++ b/src/cuda_kernels/core_kernels.cu @@ -800,6 +800,38 @@ __global__ void kClosestDistsToAtoms_Nonortho(double*D_, } // ----------------------------------------------------------------------------- +/** Bin distances from two non-overlapping sets of coords. */ +__global__ void kBinDistances_nonOverlap_NoImage(int* RDF, + const double* xyz1, int N1, const double* xyz2, int N2, + double maximum2, double one_over_spacing) +{ + int a1 = blockIdx.x * blockDim.x + threadIdx.x; + int a2 = blockIdx.y * blockDim.y + threadIdx.y; + + if (a1 < N1 && a2 < N2) { + int idx1 = a1 * 3; + double a1x = xyz1[idx1 ]; + double a1y = xyz1[idx1+1]; + double a1z = xyz1[idx1+2]; + + int idx2 = a2 * 3; + double x = a1x - xyz2[idx2 ]; + double y = a1y - xyz2[idx2+1]; + double z = a1z - xyz2[idx2+2]; + + double dist2 = (x*x) + (y*y) + (z*z); + if (dist2 <= maximum2) { + double dist = sqrt(dist2); + int histIdx = (int) (dist * one_over_spacing); + //printf("DEBUG: a1= %i a2= %i dist= %f bin=%i\n", a1+1, a2+1, dist, histIdx); + //printf("DEBUG: xyz1= %f %f %f\n", a1x, a1y, a1z); + //printf("DEBUG: a1= %i a2= %i dist= %f bin=%i xyz1=%f %f %f xyz2=%f %f %f\n", a1+1, a2+1, dist, histIdx, + // a1x, a1y, a1z, a2x, a2y, a2z); + atomicAdd( RDF + histIdx, 1 ); + } + } +} + /** Bin distances from two non-overlapping sets of coords. */ __global__ void kBinDistances_nonOverlap_Ortho(int* RDF, const double* xyz1, int N1, const double* xyz2, int N2, diff --git a/src/cuda_kernels/core_kernels.cuh b/src/cuda_kernels/core_kernels.cuh index a9ed0e3b64..81df63b7af 100644 --- a/src/cuda_kernels/core_kernels.cuh +++ b/src/cuda_kernels/core_kernels.cuh @@ -13,6 +13,9 @@ __global__ void kClosestDistsToAtoms_Ortho(double*,const double*,const double*,d // Non-orthorhombic imaging __global__ void kClosestDistsToPt_Nonortho(double*,const double*,const double*,double,const double*,const double*,int,int,int); __global__ void kClosestDistsToAtoms_Nonortho(double*,const double*,const double*,double,const double*,const double*,int,int,int,int); +// RDF no imaging +__global__ void kBinDistances_nonOverlap_Ortho(int*, const double*, int, const double*, int, + double, double); // RDF ortho imaging __global__ void kBinDistances_nonOverlap_Ortho(int*, const double*, int, const double*, int, const double*, double, double); From 0bca1c57f6496f5b2c110275bf8e37cbaf83d889 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Fri, 8 Jul 2022 14:39:18 -0400 Subject: [PATCH 26/57] Enable ortho and no image cases --- src/cuda_kernels/core_kernels.cuh | 2 +- src/cuda_kernels/kernel_rdf.cu | 28 +++++++++++++++++++++------- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/src/cuda_kernels/core_kernels.cuh b/src/cuda_kernels/core_kernels.cuh index 81df63b7af..b166d2536c 100644 --- a/src/cuda_kernels/core_kernels.cuh +++ b/src/cuda_kernels/core_kernels.cuh @@ -14,7 +14,7 @@ __global__ void kClosestDistsToAtoms_Ortho(double*,const double*,const double*,d __global__ void kClosestDistsToPt_Nonortho(double*,const double*,const double*,double,const double*,const double*,int,int,int); __global__ void kClosestDistsToAtoms_Nonortho(double*,const double*,const double*,double,const double*,const double*,int,int,int,int); // RDF no imaging -__global__ void kBinDistances_nonOverlap_Ortho(int*, const double*, int, const double*, int, +__global__ void kBinDistances_nonOverlap_NoImage(int*, const double*, int, const double*, int, double, double); // RDF ortho imaging __global__ void kBinDistances_nonOverlap_Ortho(int*, const double*, int, const double*, int, diff --git a/src/cuda_kernels/kernel_rdf.cu b/src/cuda_kernels/kernel_rdf.cu index 5d2f6559ec..a451f912cd 100644 --- a/src/cuda_kernels/kernel_rdf.cu +++ b/src/cuda_kernels/kernel_rdf.cu @@ -66,14 +66,28 @@ int Cpptraj_GPU_RDF(unsigned long* bins, int nbins, double maximum2, double one_ N1, N2, threadsPerBlock.x, threadsPerBlock.y, numBlocks.x, numBlocks.y); // Launch kernel - switch (imageType) { - case ImageOption::NONORTHO: - kBinDistances_nonOverlap_nonOrtho<<>>( - device_rdf, device_xyz1, N1, device_xyz2, N2, fracDev, ucellDev, maximum2, one_over_spacing); + if (xyz2 != 0) { + // Non-overlapping coords + switch (imageType) { + case ImageOption::NONORTHO: + kBinDistances_nonOverlap_nonOrtho<<>>( + device_rdf, device_xyz1, N1, device_xyz2, N2, fracDev, ucellDev, maximum2, one_over_spacing); break; - default: - mprinterr("Internal Error: kernel_rdf: Unhandled image type.\n"); - return 1; + case ImageOption::ORTHO: + kBinDistances_nonOverlap_Ortho<<>>( + device_rdf, device_xyz1, N1, device_xyz2, N2, boxDev, maximum2, one_over_spacing); + break; + case ImageOption::NO_IMAGE: + kBinDistances_nonOverlap_NoImage<<>>( + device_rdf, device_xyz1, N1, device_xyz2, N2, maximum2, one_over_spacing); + break; + //default: + // mprinterr("Internal Error: kernel_rdf: Unhandled image type.\n"); + // return 1; + } + } else { + // Overlapping coords + return 1;// FIXME } // Error check Cuda_check(cudaGetLastError(), "kernel launch"); From a0688926e700ced9dd342d37f8c29490e8f5cc4a Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Fri, 8 Jul 2022 14:57:44 -0400 Subject: [PATCH 27/57] Attempt to make functions usable with overlapping coords --- src/cuda_kernels/core_kernels.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/cuda_kernels/core_kernels.cu b/src/cuda_kernels/core_kernels.cu index 238c254de4..d71fdfc662 100644 --- a/src/cuda_kernels/core_kernels.cu +++ b/src/cuda_kernels/core_kernels.cu @@ -820,7 +820,7 @@ __global__ void kBinDistances_nonOverlap_NoImage(int* RDF, double z = a1z - xyz2[idx2+2]; double dist2 = (x*x) + (y*y) + (z*z); - if (dist2 <= maximum2) { + if (dist2 > 0 && dist2 <= maximum2) { double dist = sqrt(dist2); int histIdx = (int) (dist * one_over_spacing); //printf("DEBUG: a1= %i a2= %i dist= %f bin=%i\n", a1+1, a2+1, dist, histIdx); @@ -853,7 +853,7 @@ __global__ void kBinDistances_nonOverlap_Ortho(int* RDF, double a2z = xyz2[idx2+2]; double dist2 = ortho_dist2(a1x, a1y, a1z, a2x, a2y, a2z, box); - if (dist2 <= maximum2) { + if (dist2 > 0 && dist2 <= maximum2) { double dist = sqrt(dist2); int histIdx = (int) (dist * one_over_spacing); //printf("DEBUG: a1= %i a2= %i dist= %f bin=%i\n", a1+1, a2+1, dist, histIdx); @@ -892,7 +892,7 @@ __global__ void kBinDistances_nonOverlap_nonOrtho(int* RDF, double f2z = frac[6]*a2x + frac[7]*a2y + frac[8]*a2z; double dist2 = NonOrtho_dist2(f2x, f2y, f2z, f1x ,f1y, f1z, ucell); - if (dist2 <= maximum2) { + if (dist2 > 0 && dist2 <= maximum2) { double dist = sqrt(dist2); int histIdx = (int) (dist * one_over_spacing); //printf("DEBUG: a1= %i a2= %i dist= %f bin=%i\n", a1+1, a2+1, dist, histIdx); From 9a2b476e338301d3e2aab6ee1cdfc497b7172c64 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Fri, 8 Jul 2022 15:14:08 -0400 Subject: [PATCH 28/57] Add overlapping test --- test/Test_Radial/Test.sh | 31 ++++++++++++++++++++++++------ test/Test_Radial/watO.agr.save | 28 +++++++++++++++++++++++++++ test/Test_Radial/watO.raw.agr.save | 28 +++++++++++++++++++++++++++ 3 files changed, 81 insertions(+), 6 deletions(-) create mode 100644 test/Test_Radial/watO.agr.save create mode 100644 test/Test_Radial/watO.raw.agr.save diff --git a/test/Test_Radial/Test.sh b/test/Test_Radial/Test.sh index 5e8d2edaf8..69c01eeaba 100755 --- a/test/Test_Radial/Test.sh +++ b/test/Test_Radial/Test.sh @@ -6,24 +6,43 @@ CleanFiles radial.in Radial.agr cRadial.agr WatO-Trp4.agr WatO-Trp4.raw.agr \ WatO-Trp4.byres.agr WatO-Trp.agr WatO-Trp.volume.agr \ WatO-Glu5CD.agr noimage.WatO-Glu5CD.agr point.dat \ point?.agr wat.origin.agr \ - watO-protein.agr watO-protein.raw.agr + watO-protein.agr watO-protein.raw.agr \ + watO.agr watO.raw.agr TESTNAME='Radial tests' Requires netcdf maxthreads 10 INPUT="-i radial.in" -UNITNAME='Radial test, non-orthogonal imaging' -cat > radial.in < radial.in < radial.in < :WAT@O" +@target G0.S0 +@type xy + 0.250 0.000000 + 0.750 0.000000 + 1.250 0.000000 + 1.750 0.000000 + 2.250 0.004509 + 2.750 1.774147 + 3.250 1.080181 + 3.750 0.920772 + 4.250 0.945101 + 4.750 0.958743 + 5.250 0.961664 + 5.750 0.934807 + 6.250 0.963383 + 6.750 0.978025 + 7.250 0.964305 + 7.750 0.958883 + 8.250 0.947129 + 8.750 0.952076 + 9.250 0.957114 + 9.750 0.961629 diff --git a/test/Test_Radial/watO.raw.agr.save b/test/Test_Radial/watO.raw.agr.save new file mode 100644 index 0000000000..afcb24585d --- /dev/null +++ b/test/Test_Radial/watO.raw.agr.save @@ -0,0 +1,28 @@ +@with g0 +@ xaxis label "Distance (Ang)" +@ yaxis label "" +@ legend 0.2, 0.995 +@ legend char size 0.60 +@ s0 legend "Raw[:WAT@O => :WAT@O]" +@target G0.S0 +@type xy + 0.250 0.000000 + 0.750 0.000000 + 1.250 0.000000 + 1.750 0.000000 + 2.250 18.000000 + 2.750 10566.000000 + 3.250 8978.000000 + 3.750 10184.000000 + 4.250 13422.000000 + 4.750 17004.000000 + 5.250 20832.000000 + 5.750 24288.000000 + 6.250 29570.000000 + 6.750 35012.000000 + 7.250 39822.000000 + 7.750 45246.000000 + 8.250 50642.000000 + 8.750 57262.000000 + 9.250 64330.000000 + 9.750 71808.000000 From d271e89aa7b026908e76df63cbc5cb7ea3d77a83 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Fri, 8 Jul 2022 15:16:52 -0400 Subject: [PATCH 29/57] Enable for overlapping coords. --- src/Action_Radial.cpp | 15 ++++++++++++--- src/Action_Radial.h | 1 + src/cuda_kernels/kernel_rdf.cu | 20 ++++++++++++-------- 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/src/Action_Radial.cpp b/src/Action_Radial.cpp index 7be8d941c3..91bb8c3f15 100644 --- a/src/Action_Radial.cpp +++ b/src/Action_Radial.cpp @@ -20,6 +20,7 @@ Action_Radial::Action_Radial() : currentParm_(0), intramol_distances_(0), useVolume_(false), + mask2_is_mask1_(false), volume_(0), maximum2_(0), spacing_(-1), @@ -139,12 +140,14 @@ Action::RetType Action_Radial::Init(ArgList& actionArgs, ActionInit& init, int d if (Mask1_.SetMaskString(mask1)) return Action::ERR; // Check for second mask - if none specified use first mask + mask2_is_mask1_ = false; if (needMask2) { std::string mask2 = actionArgs.GetMaskNext(); if (!mask2.empty()) { if (Mask2_.SetMaskString(mask2)) return Action::ERR; } else { if (Mask2_.SetMaskString(mask1)) return Action::ERR; + mask2_is_mask1_ = true; } } // If filename not yet specified check for backwards compat. @@ -483,10 +486,16 @@ Action::RetType Action_Radial::DoAction(int frameNum, ActionFrame& frm) { #ifdef CUDA // Copy atoms FIXME need to do overlapping and non-overlapping case std::vector outerxyz = mask_to_xyz(OuterMask_, frm.Frm()); - std::vector innerxyz = mask_to_xyz(InnerMask_, frm.Frm()); + const double* outerxyzPtr = &outerxyz[0]; + std::vector innerxyz; + const double* innerxyzPtr = 0; + if (!mask2_is_mask1_) { + innerxyz = mask_to_xyz(InnerMask_, frm.Frm()); + innerxyzPtr = &innerxyz[0]; + } Cpptraj_GPU_RDF( &RDF_[0], RDF_.size(), maximum2_, one_over_spacing_, - &outerxyz[0], OuterMask_.Nselected(), - &innerxyz[0], InnerMask_.Nselected(), + outerxyzPtr, OuterMask_.Nselected(), + innerxyzPtr, InnerMask_.Nselected(), imageOpt_.ImagingType(), frm.Frm().BoxCrd().XyzPtr(), frm.Frm().BoxCrd().UnitCell().Dptr(), diff --git a/src/Action_Radial.h b/src/Action_Radial.h index 7982d7c636..24928fe903 100644 --- a/src/Action_Radial.h +++ b/src/Action_Radial.h @@ -43,6 +43,7 @@ class Action_Radial: public Action { Topology* currentParm_; ///< Current topology, needed for NO_INTERMOL int intramol_distances_; ///< # of intra-molecular distances for NO_INTERMOL. bool useVolume_; ///< If true normalize based on input volume. + bool mask2_is_mask1_; ///< True is mask1 and mask2 are the same. double volume_; ///< Hold sum of volume for averaging. double maximum2_; ///< Largest distance squared that can be binned. double spacing_; ///< Bin spacing. diff --git a/src/cuda_kernels/kernel_rdf.cu b/src/cuda_kernels/kernel_rdf.cu index a451f912cd..4f826c3482 100644 --- a/src/cuda_kernels/kernel_rdf.cu +++ b/src/cuda_kernels/kernel_rdf.cu @@ -42,8 +42,11 @@ int Cpptraj_GPU_RDF(unsigned long* bins, int nbins, double maximum2, double one_ Cuda_check(cudaMemcpy(device_xyz1, xyz1, N1 * 3 * sizeof(double), cudaMemcpyHostToDevice), "Copying xyz1"); double* device_xyz2; - Cuda_check(cudaMalloc(((void**)(&device_xyz2)), N2 * 3 * sizeof(double)), "Allocating xyz2"); - Cuda_check(cudaMemcpy(device_xyz2, xyz2, N2 * 3 * sizeof(double), cudaMemcpyHostToDevice), "Copying xyz2"); + if (xyz2 != 0) { + Cuda_check(cudaMalloc(((void**)(&device_xyz2)), N2 * 3 * sizeof(double)), "Allocating xyz2"); + Cuda_check(cudaMemcpy(device_xyz2, xyz2, N2 * 3 * sizeof(double), cudaMemcpyHostToDevice), "Copying xyz2"); + } else + device_xyz2 = device_xyz1; double *boxDev; double *ucellDev, *fracDev; @@ -66,7 +69,7 @@ int Cpptraj_GPU_RDF(unsigned long* bins, int nbins, double maximum2, double one_ N1, N2, threadsPerBlock.x, threadsPerBlock.y, numBlocks.x, numBlocks.y); // Launch kernel - if (xyz2 != 0) { + //if (xyz2 != 0) { // Non-overlapping coords switch (imageType) { case ImageOption::NONORTHO: @@ -85,10 +88,11 @@ int Cpptraj_GPU_RDF(unsigned long* bins, int nbins, double maximum2, double one_ // mprinterr("Internal Error: kernel_rdf: Unhandled image type.\n"); // return 1; } - } else { - // Overlapping coords - return 1;// FIXME - } + //} else { + // // Overlapping coords +// +// return 1;// FIXME +// } // Error check Cuda_check(cudaGetLastError(), "kernel launch"); @@ -104,7 +108,7 @@ int Cpptraj_GPU_RDF(unsigned long* bins, int nbins, double maximum2, double one_ // Free device memory cudaFree(device_rdf); cudaFree(device_xyz1); - cudaFree(device_xyz2); + if (xyz2 != 0) cudaFree(device_xyz2); if (imageType == ImageOption::ORTHO) cudaFree(boxDev); else if (imageType == ImageOption::NONORTHO) { From 96a1d2b2a08d1ee56061d46f8a98275d2777d4cf Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Fri, 8 Jul 2022 15:19:56 -0400 Subject: [PATCH 30/57] Only works for NORMAL mode right now --- src/Action_Radial.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/Action_Radial.cpp b/src/Action_Radial.cpp index 91bb8c3f15..6068854f49 100644 --- a/src/Action_Radial.cpp +++ b/src/Action_Radial.cpp @@ -267,6 +267,12 @@ Action::RetType Action_Radial::Init(ArgList& actionArgs, ActionInit& init, int d mprintf("\tImaging disabled.\n"); if (numthreads_ > 1) mprintf("\tParallelizing RDF calculation with %i threads.\n",numthreads_); +#ifdef CUDA + if (rmode_ != NORMAL) { + mprinterr("Error: RDF calculation on GPU only enabled for regular RDF with 1 or 2 masks.\n"); + return Action::ERR; + } +#endif return Action::OK; } From dc32bb48b474932179d2f955f00f61f594c3d5de Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Fri, 8 Jul 2022 15:20:58 -0400 Subject: [PATCH 31/57] Enable WatProt test --- test/Test_Radial/Test.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/test/Test_Radial/Test.sh b/test/Test_Radial/Test.sh index 69c01eeaba..092ffc489d 100755 --- a/test/Test_Radial/Test.sh +++ b/test/Test_Radial/Test.sh @@ -42,6 +42,7 @@ EOF DoTest watO.raw.agr.save watO.raw.agr } +Radial_WatProt_Nonortho Radial_Wat_Nonortho EndTest From 53c0b9b05aa1d615fce797d41361171f15737bbe Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Fri, 8 Jul 2022 19:28:18 -0400 Subject: [PATCH 32/57] Ensure CUDA only used for NORMAL, others use CPU calc --- src/Action_Radial.cpp | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/Action_Radial.cpp b/src/Action_Radial.cpp index 6068854f49..7e1dc57cb0 100644 --- a/src/Action_Radial.cpp +++ b/src/Action_Radial.cpp @@ -268,9 +268,8 @@ Action::RetType Action_Radial::Init(ArgList& actionArgs, ActionInit& init, int d if (numthreads_ > 1) mprintf("\tParallelizing RDF calculation with %i threads.\n",numthreads_); #ifdef CUDA - if (rmode_ != NORMAL) { - mprinterr("Error: RDF calculation on GPU only enabled for regular RDF with 1 or 2 masks.\n"); - return Action::ERR; + if (rmode_ == NORMAL) { + mprintf("\tRDF calculation will be accelerated with CUDA.\n"); } #endif @@ -489,8 +488,9 @@ Action::RetType Action_Radial::DoAction(int frameNum, ActionFrame& frm) { if (useVolume_) volume_ += frm.Frm().BoxCrd().CellVolume(); // --------------------------------------------- + if ( rmode_ == NORMAL ) { #ifdef CUDA - // Copy atoms FIXME need to do overlapping and non-overlapping case + // Copy atoms for GPU std::vector outerxyz = mask_to_xyz(OuterMask_, frm.Frm()); const double* outerxyzPtr = &outerxyz[0]; std::vector innerxyz; @@ -506,9 +506,7 @@ Action::RetType Action_Radial::DoAction(int frameNum, ActionFrame& frm) { frm.Frm().BoxCrd().XyzPtr(), frm.Frm().BoxCrd().UnitCell().Dptr(), frm.Frm().BoxCrd().FracCell().Dptr() ); -#else /* CUDA */ - // --------------------------------------------- - if ( rmode_ == NORMAL ) { +#else /* CUDA */ // Calculation of all atoms in Mask1 to all atoms in Mask2 int outer_max = OuterMask_.Nselected(); int inner_max = InnerMask_.Nselected(); @@ -543,6 +541,7 @@ Action::RetType Action_Radial::DoAction(int frameNum, ActionFrame& frm) { # ifdef _OPENMP } // END pragma omp parallel # endif +#endif /* CUDA */ // --------------------------------------------- } else if ( rmode_ == NO_INTRAMOL ) { // Calculation of all atoms in Mask1 to all atoms in Mask2, ignoring @@ -652,7 +651,6 @@ Action::RetType Action_Radial::DoAction(int frameNum, ActionFrame& frm) { } // END pragma omp parallel # endif } -#endif /* CUDA */ ++numFrames_; return Action::OK; From b550b43bac961e1743d56ebdef97d787aecaddd6 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Fri, 8 Jul 2022 19:30:41 -0400 Subject: [PATCH 33/57] Hide some debug info --- src/cuda_kernels/kernel_rdf.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cuda_kernels/kernel_rdf.cu b/src/cuda_kernels/kernel_rdf.cu index 4f826c3482..62395cbeb1 100644 --- a/src/cuda_kernels/kernel_rdf.cu +++ b/src/cuda_kernels/kernel_rdf.cu @@ -65,8 +65,8 @@ int Cpptraj_GPU_RDF(unsigned long* bins, int nbins, double maximum2, double one_ dim3 threadsPerBlock(BLOCKDIM, BLOCKDIM); dim3 numBlocks(calc_nblocks(N1, threadsPerBlock.x), calc_nblocks(N2, threadsPerBlock.y)); - mprintf("#Atoms = %i, %i; Threads per block = %i, %i; #Blocks = %i, %i\n", - N1, N2, threadsPerBlock.x, threadsPerBlock.y, numBlocks.x, numBlocks.y); + //mprintf("#Atoms = %i, %i; Threads per block = %i, %i; #Blocks = %i, %i\n", + // N1, N2, threadsPerBlock.x, threadsPerBlock.y, numBlocks.x, numBlocks.y); // Launch kernel //if (xyz2 != 0) { From 4265b9a534cd3e6724a94e2da8b365c29f94f13c Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Sat, 9 Jul 2022 07:59:11 -0400 Subject: [PATCH 34/57] Add more realistic test for water-protein rdf --- test/Test_Radial/RunTest.sh | 11 +++- test/Test_Radial/tz2.WatO-Prot.agr.save | 68 +++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 1 deletion(-) create mode 100644 test/Test_Radial/tz2.WatO-Prot.agr.save diff --git a/test/Test_Radial/RunTest.sh b/test/Test_Radial/RunTest.sh index 9d2b00ef9b..804c156f42 100755 --- a/test/Test_Radial/RunTest.sh +++ b/test/Test_Radial/RunTest.sh @@ -5,7 +5,7 @@ CleanFiles radial.in Radial.agr cRadial.agr WatO-Trp4.agr WatO-Trp4.raw.agr \ WatO-Trp4.byres.agr WatO-Trp.agr WatO-Trp.volume.agr \ WatO-Glu5CD.agr noimage.WatO-Glu5CD.agr point.dat \ - point?.agr wat.origin.agr + point?.agr wat.origin.agr tz2.WatO-Prot.agr TESTNAME='Radial tests' Requires netcdf maxthreads 10 @@ -71,6 +71,15 @@ EOF RunCpptraj "$UNITNAME" DoTest wat.origin.agr.save wat.origin.agr +UNITNAME='Radial test, water -> protein' +cat > radial.in < ^1" +@target G0.S0 +@type xy + 0.125 0.000000 + 0.375 0.000000 + 0.625 0.000000 + 0.875 0.000000 + 1.125 0.000000 + 1.375 0.000000 + 1.625 0.009807 + 1.875 0.090895 + 2.125 0.066963 + 2.375 0.083493 + 2.625 0.219500 + 2.875 0.313730 + 3.125 0.349662 + 3.375 0.378358 + 3.625 0.455962 + 3.875 0.466993 + 4.125 0.460903 + 4.375 0.482029 + 4.625 0.476009 + 4.875 0.527967 + 5.125 0.514599 + 5.375 0.537444 + 5.625 0.541715 + 5.875 0.582414 + 6.125 0.578839 + 6.375 0.613603 + 6.625 0.619699 + 6.875 0.641508 + 7.125 0.657762 + 7.375 0.685561 + 7.625 0.698543 + 7.875 0.699525 + 8.125 0.734832 + 8.375 0.744579 + 8.625 0.762673 + 8.875 0.786471 + 9.125 0.800059 + 9.375 0.818037 + 9.625 0.820301 + 9.875 0.832422 + 10.125 0.848308 + 10.375 0.859424 + 10.625 0.859642 + 10.875 0.868914 + 11.125 0.881561 + 11.375 0.898519 + 11.625 0.896258 + 11.875 0.907679 + 12.125 0.905259 + 12.375 0.919516 + 12.625 0.926709 + 12.875 0.922429 + 13.125 0.932410 + 13.375 0.935351 + 13.625 0.940967 + 13.875 0.944092 + 14.125 0.950874 + 14.375 0.950799 + 14.625 0.958463 + 14.875 0.962108 From 7ba3b45ec3161527100a55196bcbc96619d55076 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Sat, 9 Jul 2022 08:03:54 -0400 Subject: [PATCH 35/57] Add more realistic water rdf test --- test/Test_Radial/RunTest.sh | 14 +++++- test/Test_Radial/tz2.WatO.agr.save | 68 ++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+), 1 deletion(-) create mode 100644 test/Test_Radial/tz2.WatO.agr.save diff --git a/test/Test_Radial/RunTest.sh b/test/Test_Radial/RunTest.sh index 804c156f42..5bc69e5f22 100755 --- a/test/Test_Radial/RunTest.sh +++ b/test/Test_Radial/RunTest.sh @@ -5,7 +5,7 @@ CleanFiles radial.in Radial.agr cRadial.agr WatO-Trp4.agr WatO-Trp4.raw.agr \ WatO-Trp4.byres.agr WatO-Trp.agr WatO-Trp.volume.agr \ WatO-Glu5CD.agr noimage.WatO-Glu5CD.agr point.dat \ - point?.agr wat.origin.agr tz2.WatO-Prot.agr + point?.agr wat.origin.agr tz2.WatO-Prot.agr tz2.WatO.agr TESTNAME='Radial tests' Requires netcdf maxthreads 10 @@ -80,6 +80,18 @@ EOF RunCpptraj "$UNITNAME" DoTest tz2.WatO-Prot.agr.save tz2.WatO-Prot.agr +UNITNAME='Radial test, water -> water' +CheckFor maxthreads 2 +if [ $? -eq 0 ] ; then + cat > radial.in < :WAT@O" +@target G0.S0 +@type xy + 0.125 0.000000 + 0.375 0.000000 + 0.625 0.000000 + 0.875 0.000000 + 1.125 0.000000 + 1.375 0.000000 + 1.625 0.000000 + 1.875 0.000000 + 2.125 0.000000 + 2.375 0.008119 + 2.625 1.302900 + 2.875 2.167052 + 3.125 1.206232 + 3.375 0.972103 + 3.625 0.908561 + 3.875 0.931459 + 4.125 0.954274 + 4.375 0.936945 + 4.625 0.934350 + 4.875 0.980699 + 5.125 0.973067 + 5.375 0.951298 + 5.625 0.926571 + 5.875 0.942356 + 6.125 0.945199 + 6.375 0.980169 + 6.625 0.973968 + 6.875 0.981792 + 7.125 0.970289 + 7.375 0.958719 + 7.625 0.969242 + 7.875 0.949171 + 8.125 0.941500 + 8.375 0.952427 + 8.625 0.952719 + 8.875 0.951468 + 9.125 0.955649 + 9.375 0.958501 + 9.625 0.955208 + 9.875 0.967729 + 10.125 0.956302 + 10.375 0.950518 + 10.625 0.947371 + 10.875 0.941653 + 11.125 0.958531 + 11.375 0.951502 + 11.625 0.948216 + 11.875 0.958844 + 12.125 0.960211 + 12.375 0.948780 + 12.625 0.946982 + 12.875 0.948940 + 13.125 0.948910 + 13.375 0.950781 + 13.625 0.951658 + 13.875 0.946405 + 14.125 0.953586 + 14.375 0.953365 + 14.625 0.956983 + 14.875 0.947460 From 107f6fe9c69e66c9aea5c956a7683197a1253737 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Sat, 9 Jul 2022 08:09:55 -0400 Subject: [PATCH 36/57] Add orthogonal tests --- test/Test_Radial/RunTest.sh | 27 +++++++- test/Test_Radial/tz2ortho.WatO-Prot.agr.save | 68 ++++++++++++++++++++ test/Test_Radial/tz2ortho.WatO.agr.save | 68 ++++++++++++++++++++ 3 files changed, 160 insertions(+), 3 deletions(-) create mode 100644 test/Test_Radial/tz2ortho.WatO-Prot.agr.save create mode 100644 test/Test_Radial/tz2ortho.WatO.agr.save diff --git a/test/Test_Radial/RunTest.sh b/test/Test_Radial/RunTest.sh index 5bc69e5f22..1cefb6cc4f 100755 --- a/test/Test_Radial/RunTest.sh +++ b/test/Test_Radial/RunTest.sh @@ -45,9 +45,9 @@ RunCpptraj "$UNITNAME" DoTest WatO-Glu5CD.agr.save WatO-Glu5CD.agr DoTest noimage.WatO-Glu5CD.agr.save noimage.WatO-Glu5CD.agr +UNITNAME='Radial test, specified point' CheckFor maxthreads 1 if [ $? -eq 0 ] ; then - UNITNAME='Radial test, specified point' cat > radial.in < radial.in < radial.in < radial.in < radial.in < ^1" +@target G0.S0 +@type xy + 0.125 0.000000 + 0.375 0.000000 + 0.625 0.000000 + 0.875 0.000000 + 1.125 0.000000 + 1.375 0.000000 + 1.625 0.008173 + 1.875 0.108091 + 2.125 0.087052 + 2.375 0.086557 + 2.625 0.233297 + 2.875 0.319481 + 3.125 0.377547 + 3.375 0.443251 + 3.625 0.490833 + 3.875 0.482252 + 4.125 0.484533 + 4.375 0.506425 + 4.625 0.503902 + 4.875 0.552527 + 5.125 0.563820 + 5.375 0.570221 + 5.625 0.601981 + 5.875 0.593438 + 6.125 0.615376 + 6.375 0.635202 + 6.625 0.642753 + 6.875 0.662093 + 7.125 0.683146 + 7.375 0.702575 + 7.625 0.734690 + 7.875 0.730136 + 8.125 0.748326 + 8.375 0.770658 + 8.625 0.788658 + 8.875 0.813318 + 9.125 0.814238 + 9.375 0.829501 + 9.625 0.843828 + 9.875 0.849628 + 10.125 0.864379 + 10.375 0.870834 + 10.625 0.883929 + 10.875 0.889172 + 11.125 0.892078 + 11.375 0.907108 + 11.625 0.915906 + 11.875 0.919180 + 12.125 0.922349 + 12.375 0.921380 + 12.625 0.924647 + 12.875 0.938499 + 13.125 0.934368 + 13.375 0.940258 + 13.625 0.948398 + 13.875 0.949439 + 14.125 0.951156 + 14.375 0.959693 + 14.625 0.962870 + 14.875 0.956127 diff --git a/test/Test_Radial/tz2ortho.WatO.agr.save b/test/Test_Radial/tz2ortho.WatO.agr.save new file mode 100644 index 0000000000..0e71d30234 --- /dev/null +++ b/test/Test_Radial/tz2ortho.WatO.agr.save @@ -0,0 +1,68 @@ +@with g0 +@ xaxis label "Distance (Ang)" +@ yaxis label "" +@ legend 0.2, 0.995 +@ legend char size 0.60 +@ s0 legend ":WAT@O => :WAT@O" +@target G0.S0 +@type xy + 0.125 0.000000 + 0.375 0.000000 + 0.625 0.000000 + 0.875 0.000000 + 1.125 0.000000 + 1.375 0.000000 + 1.625 0.000000 + 1.875 0.000000 + 2.125 0.000000 + 2.375 0.002991 + 2.625 1.330731 + 2.875 2.176120 + 3.125 1.187506 + 3.375 0.923321 + 3.625 0.916463 + 3.875 0.899886 + 4.125 0.962833 + 4.375 0.969469 + 4.625 0.979084 + 4.875 0.961074 + 5.125 0.959835 + 5.375 0.937324 + 5.625 0.933079 + 5.875 0.938541 + 6.125 0.950825 + 6.375 0.952239 + 6.625 0.982928 + 6.875 0.983850 + 7.125 0.992426 + 7.375 0.948645 + 7.625 0.957272 + 7.875 0.941759 + 8.125 0.940810 + 8.375 0.950494 + 8.625 0.948031 + 8.875 0.977353 + 9.125 0.960639 + 9.375 0.942116 + 9.625 0.952574 + 9.875 0.941151 + 10.125 0.951702 + 10.375 0.949590 + 10.625 0.953104 + 10.875 0.951344 + 11.125 0.956692 + 11.375 0.947691 + 11.625 0.954567 + 11.875 0.942745 + 12.125 0.943635 + 12.375 0.953684 + 12.625 0.944898 + 12.875 0.947955 + 13.125 0.948854 + 13.375 0.942730 + 13.625 0.953365 + 13.875 0.944907 + 14.125 0.950638 + 14.375 0.949681 + 14.625 0.947970 + 14.875 0.945455 From 3d911d39725a7d36abd2d76dffaded00756bc546 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Sat, 9 Jul 2022 08:14:10 -0400 Subject: [PATCH 37/57] Add more realistic tests with no imaging --- test/Test_Radial/RunTest.sh | 21 ++++++ .../Test_Radial/tz2noimage.WatO-Prot.agr.save | 68 +++++++++++++++++++ test/Test_Radial/tz2noimage.WatO.agr.save | 68 +++++++++++++++++++ 3 files changed, 157 insertions(+) create mode 100644 test/Test_Radial/tz2noimage.WatO-Prot.agr.save create mode 100644 test/Test_Radial/tz2noimage.WatO.agr.save diff --git a/test/Test_Radial/RunTest.sh b/test/Test_Radial/RunTest.sh index 1cefb6cc4f..24ded92264 100755 --- a/test/Test_Radial/RunTest.sh +++ b/test/Test_Radial/RunTest.sh @@ -113,6 +113,27 @@ EOF DoTest tz2ortho.WatO.agr.save tz2ortho.WatO.agr fi +UNITNAME='Radial test, water -> protein, no imaging' +cat > radial.in < radial.in < ^1" +@target G0.S0 +@type xy + 0.125 0.000000 + 0.375 0.000000 + 0.625 0.000000 + 0.875 0.000000 + 1.125 0.000000 + 1.375 0.000000 + 1.625 0.008173 + 1.875 0.108091 + 2.125 0.087052 + 2.375 0.086557 + 2.625 0.233297 + 2.875 0.319481 + 3.125 0.377547 + 3.375 0.443251 + 3.625 0.490833 + 3.875 0.482252 + 4.125 0.484533 + 4.375 0.506425 + 4.625 0.503902 + 4.875 0.552527 + 5.125 0.563820 + 5.375 0.570221 + 5.625 0.601981 + 5.875 0.593438 + 6.125 0.615376 + 6.375 0.635202 + 6.625 0.642753 + 6.875 0.662093 + 7.125 0.683146 + 7.375 0.702575 + 7.625 0.734615 + 7.875 0.730067 + 8.125 0.748326 + 8.375 0.770473 + 8.625 0.788309 + 8.875 0.812769 + 9.125 0.813718 + 9.375 0.828320 + 9.625 0.842101 + 9.875 0.848165 + 10.125 0.862270 + 10.375 0.867660 + 10.625 0.880404 + 10.875 0.885259 + 11.125 0.886767 + 11.375 0.901493 + 11.625 0.909090 + 11.875 0.909520 + 12.125 0.912789 + 12.375 0.909322 + 12.625 0.909426 + 12.875 0.920812 + 13.125 0.913432 + 13.375 0.916059 + 13.625 0.920234 + 13.875 0.916597 + 14.125 0.913657 + 14.375 0.914007 + 14.625 0.910080 + 14.875 0.896066 diff --git a/test/Test_Radial/tz2noimage.WatO.agr.save b/test/Test_Radial/tz2noimage.WatO.agr.save new file mode 100644 index 0000000000..7e90ef826f --- /dev/null +++ b/test/Test_Radial/tz2noimage.WatO.agr.save @@ -0,0 +1,68 @@ +@with g0 +@ xaxis label "Distance (Ang)" +@ yaxis label "" +@ legend 0.2, 0.995 +@ legend char size 0.60 +@ s0 legend ":WAT@O => :WAT@O" +@target G0.S0 +@type xy + 0.125 0.000000 + 0.375 0.000000 + 0.625 0.000000 + 0.875 0.000000 + 1.125 0.000000 + 1.375 0.000000 + 1.625 0.000000 + 1.875 0.000000 + 2.125 0.000000 + 2.375 0.002991 + 2.625 1.183779 + 2.875 1.914740 + 3.125 1.027328 + 3.375 0.808214 + 3.625 0.783276 + 3.875 0.748094 + 4.125 0.800100 + 4.375 0.794806 + 4.625 0.789635 + 4.875 0.781317 + 5.125 0.768040 + 5.375 0.732558 + 5.625 0.721379 + 5.875 0.719195 + 6.125 0.717358 + 6.375 0.708327 + 6.625 0.722703 + 6.875 0.715408 + 7.125 0.717762 + 7.375 0.670551 + 7.625 0.679104 + 7.875 0.649474 + 8.125 0.638264 + 8.375 0.645514 + 8.625 0.635123 + 8.875 0.634940 + 9.125 0.617597 + 9.375 0.606620 + 9.625 0.601648 + 9.875 0.588718 + 10.125 0.584387 + 10.375 0.571313 + 10.625 0.563295 + 10.875 0.559449 + 11.125 0.554193 + 11.375 0.542936 + 11.625 0.533000 + 11.875 0.521494 + 12.125 0.518285 + 12.375 0.511268 + 12.625 0.503336 + 12.875 0.496952 + 13.125 0.489966 + 13.375 0.483528 + 13.625 0.474893 + 13.875 0.467818 + 14.125 0.463736 + 14.375 0.453005 + 14.625 0.445415 + 14.875 0.437057 From c4d42c8165341ef067a084d456d27c6a67cd7db9 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Sat, 9 Jul 2022 08:47:30 -0400 Subject: [PATCH 38/57] Improve CPU radial function when single mask is being used --- src/Action_Radial.cpp | 75 +++++++++++++++++++++++++++++++++++++++++-- src/Action_Radial.h | 4 +++ 2 files changed, 77 insertions(+), 2 deletions(-) diff --git a/src/Action_Radial.cpp b/src/Action_Radial.cpp index 7e1dc57cb0..51a3fb8948 100644 --- a/src/Action_Radial.cpp +++ b/src/Action_Radial.cpp @@ -469,6 +469,73 @@ static inline std::vector mask_to_xyz(AtomMask const& Mask, Frame const& } #endif +void Action_Radial::calcRDF_singleMask(Frame const& frmIn) { + int outer_max = OuterMask_.Nselected(); + int idx1; + + long unsigned* myRDF = &RDF_[0]; +# ifdef _OPENMP +# pragma omp parallel private(idx1, myRDF) + { + //mprintf("OPENMP: %i threads\n",omp_get_num_threads()); + myRDF = &(rdf_thread_[omp_get_thread_num()][0]); +# pragma omp for +# endif + for (idx1 = 0; idx1 < outer_max; idx1++) { + const double* xyz1 = frmIn.XYZ( OuterMask_[idx1] ); + for (int idx2 = idx1 + 1; idx2 < outer_max; idx2++) { + const double* xyz2 = frmIn.XYZ( OuterMask_[idx2] ); + double D2 = DIST2( imageOpt_.ImagingType(), xyz1, xyz2, frmIn.BoxCrd() ); + if (D2 <= maximum2_) { + // NOTE: Can we modify the histogram to store D^2? + double dist = sqrt(D2); + //mprintf("MASKLOOP: %10i %10i %10.4f\n",atom1,atom2,D); + int idx = (int) (dist * one_over_spacing_); + //if (idx > -1 && idx < numBins_) { + myRDF[idx] += 2; + //} + } + } // END inner loop + } // END outer loop +# ifdef _OPENMP + } // END pragma omp parallel +# endif +} + +void Action_Radial::calcRDF_twoMask(Frame const& frmIn) { + int outer_max = OuterMask_.Nselected(); + int inner_max = InnerMask_.Nselected(); + int idx1; + + long unsigned* myRDF = &RDF_[0]; +# ifdef _OPENMP +# pragma omp parallel private(idx1, myRDF) + { + //mprintf("OPENMP: %i threads\n",omp_get_num_threads()); + myRDF = &(rdf_thread_[omp_get_thread_num()][0]); +# pragma omp for +# endif + for (idx1 = 0; idx1 < outer_max; idx1++) { + const double* xyz1 = frmIn.XYZ( OuterMask_[idx1] ); + for (int idx2 = 0; idx2 < inner_max; idx2++) { + const double* xyz2 = frmIn.XYZ( InnerMask_[idx2] ); + double D2 = DIST2( imageOpt_.ImagingType(), xyz1, xyz2, frmIn.BoxCrd() ); + if (D2 <= maximum2_) { + // NOTE: Can we modify the histogram to store D^2? + double dist = sqrt(D2); + //mprintf("MASKLOOP: %10i %10i %10.4f\n",atom1,atom2,D); + int idx = (int) (dist * one_over_spacing_); + //if (idx > -1 && idx < numBins_) { + myRDF[idx]++; + //} + } + } // END inner loop + } // END outer loop +# ifdef _OPENMP + } // END pragma omp parallel +# endif +} + // Action_Radial::DoAction() /** Calculate distances from atoms in mask1 to atoms in mask 2 and * bin them. @@ -508,7 +575,11 @@ Action::RetType Action_Radial::DoAction(int frameNum, ActionFrame& frm) { frm.Frm().BoxCrd().FracCell().Dptr() ); #else /* CUDA */ // Calculation of all atoms in Mask1 to all atoms in Mask2 - int outer_max = OuterMask_.Nselected(); + if (mask2_is_mask1_) + calcRDF_singleMask( frm.Frm() ); + else + calcRDF_twoMask( frm.Frm() ); +/* int outer_max = OuterMask_.Nselected(); int inner_max = InnerMask_.Nselected(); # ifdef _OPENMP # pragma omp parallel private(nmask1,nmask2,atom1,atom2,D,idx,mythread) @@ -540,7 +611,7 @@ Action::RetType Action_Radial::DoAction(int frameNum, ActionFrame& frm) { } // END loop over 1st mask # ifdef _OPENMP } // END pragma omp parallel -# endif +# endif*/ #endif /* CUDA */ // --------------------------------------------- } else if ( rmode_ == NO_INTRAMOL ) { diff --git a/src/Action_Radial.h b/src/Action_Radial.h index 24928fe903..dd58b6eaa2 100644 --- a/src/Action_Radial.h +++ b/src/Action_Radial.h @@ -22,6 +22,10 @@ class Action_Radial: public Action { # endif typedef std::vector Iarray; + void calcRDF_singleMask(Frame const&); + + void calcRDF_twoMask(Frame const&); + # ifdef _OPENMP bool threadsCombined_; ///< True if CombineRdfThreads() has been called. # endif From eb12ee76a5b9c1291e802e325a5a8359f5a040e7 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Sat, 9 Jul 2022 13:54:36 -0400 Subject: [PATCH 39/57] Always try to build libcpptraj.cuda --- src/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Makefile b/src/Makefile index b5cd4fb95a..d5384d97ca 100644 --- a/src/Makefile +++ b/src/Makefile @@ -127,7 +127,7 @@ noarpack: @echo "Skipping bundled ARPACK build" @echo "" -cuda_kernels/libcpptraj_cuda.a: +cuda_kernels/libcpptraj_cuda.a:: cd cuda_kernels && $(MAKE) all # Dependency targets From f314034d75a6240c2a29f40b84ec0d845ac8b7b4 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Sat, 9 Jul 2022 13:59:57 -0400 Subject: [PATCH 40/57] Start code to store indices for upper triangle matrix --- src/cuda_kernels/kernel_rdf.cu | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/src/cuda_kernels/kernel_rdf.cu b/src/cuda_kernels/kernel_rdf.cu index 62395cbeb1..d2344787fe 100644 --- a/src/cuda_kernels/kernel_rdf.cu +++ b/src/cuda_kernels/kernel_rdf.cu @@ -42,11 +42,30 @@ int Cpptraj_GPU_RDF(unsigned long* bins, int nbins, double maximum2, double one_ Cuda_check(cudaMemcpy(device_xyz1, xyz1, N1 * 3 * sizeof(double), cudaMemcpyHostToDevice), "Copying xyz1"); double* device_xyz2; + int* device_rows1; if (xyz2 != 0) { Cuda_check(cudaMalloc(((void**)(&device_xyz2)), N2 * 3 * sizeof(double)), "Allocating xyz2"); Cuda_check(cudaMemcpy(device_xyz2, xyz2, N2 * 3 * sizeof(double), cudaMemcpyHostToDevice), "Copying xyz2"); - } else - device_xyz2 = device_xyz1; + } else { + //device_xyz2 = device_xyz1; + // Create array of upper triangle row start indices TODO long ints? + int* rows1 = new int[ N1 ]; + int i1 = 0; + for (unsigned int row = 0; row < N1; row++) { + for (unsigned int col = row+1; col < N1; col++) { + long int idx = calcTriIndex( N1, col, row ); + if (col == row + 1) { + printf("ROW START: %li\n", idx); + //if (row > 0) Rows1.push_back( idx ); + rows1[i1++] = idx; + //Rows1.push_back( idx ); + } + printf("%8zu %8zu = %8li\n", col, row, idx); + } + } + delete[] rows1; + } + double *boxDev; double *ucellDev, *fracDev; From 315119af6a6a0c7e1b9e37d6b5eee30c7e6fb353 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Sat, 9 Jul 2022 14:53:12 -0400 Subject: [PATCH 41/57] Doesnt seem like flattening works so well. --- src/cuda_kernels/kernel_rdf.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/cuda_kernels/kernel_rdf.cu b/src/cuda_kernels/kernel_rdf.cu index d2344787fe..3310c7de65 100644 --- a/src/cuda_kernels/kernel_rdf.cu +++ b/src/cuda_kernels/kernel_rdf.cu @@ -42,14 +42,14 @@ int Cpptraj_GPU_RDF(unsigned long* bins, int nbins, double maximum2, double one_ Cuda_check(cudaMemcpy(device_xyz1, xyz1, N1 * 3 * sizeof(double), cudaMemcpyHostToDevice), "Copying xyz1"); double* device_xyz2; - int* device_rows1; + //int* device_rows1; if (xyz2 != 0) { Cuda_check(cudaMalloc(((void**)(&device_xyz2)), N2 * 3 * sizeof(double)), "Allocating xyz2"); Cuda_check(cudaMemcpy(device_xyz2, xyz2, N2 * 3 * sizeof(double), cudaMemcpyHostToDevice), "Copying xyz2"); } else { - //device_xyz2 = device_xyz1; + device_xyz2 = device_xyz1; // Create array of upper triangle row start indices TODO long ints? - int* rows1 = new int[ N1 ]; +/* int* rows1 = new int[ N1 ]; int i1 = 0; for (unsigned int row = 0; row < N1; row++) { for (unsigned int col = row+1; col < N1; col++) { @@ -63,7 +63,7 @@ int Cpptraj_GPU_RDF(unsigned long* bins, int nbins, double maximum2, double one_ printf("%8zu %8zu = %8li\n", col, row, idx); } } - delete[] rows1; + delete[] rows1;*/ } From 6c969270b8bd564305e253e694036f077a7ae6ea Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Mon, 11 Jul 2022 11:06:40 -0400 Subject: [PATCH 42/57] Test out using device function template --- src/cuda_kernels/core_kernels.cu | 7 ++++--- src/cuda_kernels/ortho_dist2.cuh | 30 ++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 3 deletions(-) create mode 100644 src/cuda_kernels/ortho_dist2.cuh diff --git a/src/cuda_kernels/core_kernels.cu b/src/cuda_kernels/core_kernels.cu index d71fdfc662..ce1225c4ce 100644 --- a/src/cuda_kernels/core_kernels.cu +++ b/src/cuda_kernels/core_kernels.cu @@ -1,11 +1,12 @@ #include "core_kernels.cuh" +#include "ortho_dist2.cuh" //#include // DEBUG #define BLOCKDIM 512 #define RSIZE 512 // ----------------------------------------------------------------------------- /** \return Shortest imaged distance^2 between given coordinates in an orthorhombic box. */ -__device__ double ortho_dist2(double a1x, double a1y, double a1z, +/*__device__ double ortho_dist2(double a1x, double a1y, double a1z, double a2x, double a2y, double a2z, const double* box) { @@ -29,7 +30,7 @@ __device__ double ortho_dist2(double a1x, double a1y, double a1z, if (D2 < z) z = D2; return (x*x + y*y + z*z); -} +}*/ /** \return Shortest imaged distance^2 between given coordinates in fractional space. * NOTE: This function is complicated hence we will put into a __device__ only function. @@ -852,7 +853,7 @@ __global__ void kBinDistances_nonOverlap_Ortho(int* RDF, double a2y = xyz2[idx2+1]; double a2z = xyz2[idx2+2]; - double dist2 = ortho_dist2(a1x, a1y, a1z, a2x, a2y, a2z, box); + double dist2 = ortho_dist2(a1x, a1y, a1z, a2x, a2y, a2z, box); if (dist2 > 0 && dist2 <= maximum2) { double dist = sqrt(dist2); int histIdx = (int) (dist * one_over_spacing); diff --git a/src/cuda_kernels/ortho_dist2.cuh b/src/cuda_kernels/ortho_dist2.cuh new file mode 100644 index 0000000000..3b13716121 --- /dev/null +++ b/src/cuda_kernels/ortho_dist2.cuh @@ -0,0 +1,30 @@ +#ifndef INC_ORTHO_DIST2_CUH +#define INC_ORTHO_DIST2_CUH +/** \return Shortest imaged distance^2 between given coordinates in an orthorhombic box. */ +template +__device__ T ortho_dist2(T a1x, T a1y, T a1z, + T a2x, T a2y, T a2z, + const T* box) +{ + T x = a1x - a2x; + T y = a1y - a2y; + T z = a1z - a2z; + // Get rid of sign info + if (x<0) x=-x; + if (y<0) y=-y; + if (z<0) z=-z; + // Get rid of multiples of box lengths + while (x > box[0]) x = x - box[0]; + while (y > box[1]) y = y - box[1]; + while (z > box[2]) z = z - box[2]; + // Find shortest distance in periodic reference + T D2 = box[0] - x; + if (D2 < x) x = D2; + D2 = box[1] - y; + if (D2 < y) y = D2; + D2 = box[2] - z; + if (D2 < z) z = D2; + + return (x*x + y*y + z*z); +} +#endif From 6ca1e9661be8928cb374863ceeca93bea9c3a209 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Mon, 11 Jul 2022 11:12:18 -0400 Subject: [PATCH 43/57] Add template for non-orthogonal distance calc --- src/cuda_kernels/NonOrtho_dist2.cuh | 277 ++++++++++++++++++++++++++++ src/cuda_kernels/core_kernels.cu | 10 +- 2 files changed, 283 insertions(+), 4 deletions(-) create mode 100644 src/cuda_kernels/NonOrtho_dist2.cuh diff --git a/src/cuda_kernels/NonOrtho_dist2.cuh b/src/cuda_kernels/NonOrtho_dist2.cuh new file mode 100644 index 0000000000..a1826cae5f --- /dev/null +++ b/src/cuda_kernels/NonOrtho_dist2.cuh @@ -0,0 +1,277 @@ +#ifndef INC_NONORTHO_DIST2_CUH +#define INC_NONORTHO_DIST2_CUH +/** \return Shortest imaged distance^2 between given coordinates in fractional space. */ +template +__device__ T NonOrtho_dist2(T a0, T a1, T a2, + T b0, T b1, T b2, + const T *ucell) +{ + //int ixyz[3]; + T minIn = -1.0; + + //double closest2 + // The floor() calls serve to bring each point back in the main unit cell. + T fx = a0 - floor(a0); + T fy = a1 - floor(a1); + T fz = a2 - floor(a2); + T f2x = b0 - floor(b0); + T f2y = b1 - floor(b1); + T f2z = b2 - floor(b2); + // f2 back in Cartesian space + T X_factor = (f2x*ucell[0] + f2y*ucell[3] + f2z*ucell[6]); + T Y_factor = (f2x*ucell[1] + f2y*ucell[4] + f2z*ucell[7]); + T Z_factor = (f2x*ucell[2] + f2y*ucell[5] + f2z*ucell[8]); + // Precompute some factors + T fxm1 = fx - 1.0; + T fxp1 = fx + 1.0; + T fym1 = fy - 1.0; + T fyp1 = fy + 1.0; + T fzm1 = fz - 1.0; + T fzp1 = fz + 1.0; + + T fxm1u0 = fxm1 * ucell[0]; + T fxu0 = fx * ucell[0]; + T fxp1u0 = fxp1 * ucell[0]; + T fxm1u1 = fxm1 * ucell[1]; + T fxu1 = fx * ucell[1]; + T fxp1u1 = fxp1 * ucell[1]; + T fxm1u2 = fxm1 * ucell[2]; + T fxu2 = fx * ucell[2]; + T fxp1u2 = fxp1 * ucell[2]; + + T fym1u3 = fym1 * ucell[3]; + T fyu3 = fy * ucell[3]; + T fyp1u3 = fyp1 * ucell[3]; + T fym1u4 = fym1 * ucell[4]; + T fyu4 = fy * ucell[4]; + T fyp1u4 = fyp1 * ucell[4]; + T fym1u5 = fym1 * ucell[5]; + T fyu5 = fy * ucell[5]; + T fyp1u5 = fyp1 * ucell[5]; + + T fzm1u6 = fzm1 * ucell[6]; + T fzu6 = fz * ucell[6]; + T fzp1u6 = fzp1 * ucell[6]; + T fzm1u7 = fzm1 * ucell[7]; + T fzu7 = fz * ucell[7]; + T fzp1u7 = fzp1 * ucell[7]; + T fzm1u8 = fzm1 * ucell[8]; + T fzu8 = fz * ucell[8]; + T fzp1u8 = fzp1 * ucell[8]; + + // Calc ix iy iz = 0 case + T x = (fxu0 + fyu3 + fzu6) - X_factor; + T y = (fxu1 + fyu4 + fzu7) - Y_factor; + T z = (fxu2 + fyu5 + fzu8) - Z_factor; + // DEBUG + //mprintf("DEBUG: a2: %g %g %g\n",(fxu0 + fyu3 + fzu6), (fxu1 + fyu4 + fzu7), (fxu2 + fyu5 + fzu8)); + //mprintf("DEBUG: a1: %g %g %g\n", X_factor, Y_factor, Z_factor); + T min = (x*x) + (y*y) + (z*z); + + if (minIn > 0.0 && minIn < min) min = minIn; + + //ixyz[0] = 0; + //ixyz[1] = 0; + //ixyz[2] = 0; + + // -1 -1 -1 + x = (fxm1u0 + fym1u3 + fzm1u6) - X_factor; + y = (fxm1u1 + fym1u4 + fzm1u7) - Y_factor; + z = (fxm1u2 + fym1u5 + fzm1u8) - Z_factor; + T D = (x*x) + (y*y) + (z*z); + //if (D < min) { min = D; ixyz[0] = -1; ixyz[1] = -1; ixyz[2] = -1; } + if (D < min) min = D; + // -1 -1 0 + x = (fxm1u0 + fym1u3 + fzu6 ) - X_factor; + y = (fxm1u1 + fym1u4 + fzu7 ) - Y_factor; + z = (fxm1u2 + fym1u5 + fzu8 ) - Z_factor; + D = (x*x) + (y*y) + (z*z); + //if (D < min) { min = D; ixyz[0] = -1; ixyz[1] = -1; ixyz[2] = 0; } + if (D < min) min = D; + // -1 -1 +1 + x = (fxm1u0 + fym1u3 + fzp1u6) - X_factor; + y = (fxm1u1 + fym1u4 + fzp1u7) - Y_factor; + z = (fxm1u2 + fym1u5 + fzp1u8) - Z_factor; + D = (x*x) + (y*y) + (z*z); + //if (D < min) { min = D; ixyz[0] = -1; ixyz[1] = -1; ixyz[2] = 1; } + if (D < min) min = D; + // -1 0 -1 + x = (fxm1u0 + fyu3 + fzm1u6) - X_factor; + y = (fxm1u1 + fyu4 + fzm1u7) - Y_factor; + z = (fxm1u2 + fyu5 + fzm1u8) - Z_factor; + D = (x*x) + (y*y) + (z*z); + //if (D < min) { min = D; ixyz[0] = -1; ixyz[1] = 0; ixyz[2] = -1; } + if (D < min) min = D; + // -1 0 0 + x = (fxm1u0 + fyu3 + fzu6 ) - X_factor; + y = (fxm1u1 + fyu4 + fzu7 ) - Y_factor; + z = (fxm1u2 + fyu5 + fzu8 ) - Z_factor; + D = (x*x) + (y*y) + (z*z); + //if (D < min) { min = D; ixyz[0] = -1; ixyz[1] = 0; ixyz[2] = 0; } + if (D < min) min = D; + // -1 0 +1 + x = (fxm1u0 + fyu3 + fzp1u6) - X_factor; + y = (fxm1u1 + fyu4 + fzp1u7) - Y_factor; + z = (fxm1u2 + fyu5 + fzp1u8) - Z_factor; + D = (x*x) + (y*y) + (z*z); + //if (D < min) { min = D; ixyz[0] = -1; ixyz[1] = 0; ixyz[2] = 1; } + if (D < min) min = D; + // -1 +1 -1 + x = (fxm1u0 + fyp1u3 + fzm1u6) - X_factor; + y = (fxm1u1 + fyp1u4 + fzm1u7) - Y_factor; + z = (fxm1u2 + fyp1u5 + fzm1u8) - Z_factor; + D = (x*x) + (y*y) + (z*z); + //if (D < min) { min = D; ixyz[0] = -1; ixyz[1] = 1; ixyz[2] = -1; } + if (D < min) min = D; + // -1 +1 0 + x = (fxm1u0 + fyp1u3 + fzu6 ) - X_factor; + y = (fxm1u1 + fyp1u4 + fzu7 ) - Y_factor; + z = (fxm1u2 + fyp1u5 + fzu8 ) - Z_factor; + D = (x*x) + (y*y) + (z*z); + //if (D < min) { min = D; ixyz[0] = -1; ixyz[1] = 1; ixyz[2] = 0; } + if (D < min) min = D; + // -1 +1 +1 + x = (fxm1u0 + fyp1u3 + fzp1u6) - X_factor; + y = (fxm1u1 + fyp1u4 + fzp1u7) - Y_factor; + z = (fxm1u2 + fyp1u5 + fzp1u8) - Z_factor; + D = (x*x) + (y*y) + (z*z); + //if (D < min) { min = D; ixyz[0] = -1; ixyz[1] = 1; ixyz[2] = 1; } + if (D < min) min = D; + + // 0 -1 -1 + x = (fxu0 + fym1u3 + fzm1u6) - X_factor; + y = (fxu1 + fym1u4 + fzm1u7) - Y_factor; + z = (fxu2 + fym1u5 + fzm1u8) - Z_factor; + D = (x*x) + (y*y) + (z*z); + //if (D < min) { min = D; ixyz[0] = 0; ixyz[1] = -1; ixyz[2] = -1; } + if (D < min) min = D; + // 0 -1 0 + x = (fxu0 + fym1u3 + fzu6 ) - X_factor; + y = (fxu1 + fym1u4 + fzu7 ) - Y_factor; + z = (fxu2 + fym1u5 + fzu8 ) - Z_factor; + D = (x*x) + (y*y) + (z*z); + //if (D < min) { min = D; ixyz[0] = 0; ixyz[1] = -1; ixyz[2] = 0; } + if (D < min) min = D; + // 0 -1 +1 + x = (fxu0 + fym1u3 + fzp1u6) - X_factor; + y = (fxu1 + fym1u4 + fzp1u7) - Y_factor; + z = (fxu2 + fym1u5 + fzp1u8) - Z_factor; + D = (x*x) + (y*y) + (z*z); + //if (D < min) { min = D; ixyz[0] = 0; ixyz[1] = -1; ixyz[2] = 1; } + if (D < min) min = D; + // 0 0 -1 + x = (fxu0 + fyu3 + fzm1u6) - X_factor; + y = (fxu1 + fyu4 + fzm1u7) - Y_factor; + z = (fxu2 + fyu5 + fzm1u8) - Z_factor; + D = (x*x) + (y*y) + (z*z); + //if (D < min) { min = D; ixyz[0] = 0; ixyz[1] = 0; ixyz[2] = -1; } + if (D < min) min = D; + // 0 0 0 + // 0 0 +1 + x = (fxu0 + fyu3 + fzp1u6) - X_factor; + y = (fxu1 + fyu4 + fzp1u7) - Y_factor; + z = (fxu2 + fyu5 + fzp1u8) - Z_factor; + D = (x*x) + (y*y) + (z*z); + //if (D < min) { min = D; ixyz[0] = 0; ixyz[1] = 0; ixyz[2] = 1; } + if (D < min) min = D; + // 0 +1 -1 + x = (fxu0 + fyp1u3 + fzm1u6) - X_factor; + y = (fxu1 + fyp1u4 + fzm1u7) - Y_factor; + z = (fxu2 + fyp1u5 + fzm1u8) - Z_factor; + D = (x*x) + (y*y) + (z*z); + //if (D < min) { min = D; ixyz[0] = 0; ixyz[1] = 1; ixyz[2] = -1; } + if (D < min) min = D; + // 0 +1 0 + x = (fxu0 + fyp1u3 + fzu6 ) - X_factor; + y = (fxu1 + fyp1u4 + fzu7 ) - Y_factor; + z = (fxu2 + fyp1u5 + fzu8 ) - Z_factor; + D = (x*x) + (y*y) + (z*z); + //if (D < min) { min = D; ixyz[0] = 0; ixyz[1] = 1; ixyz[2] = 0; } + if (D < min) min = D; + // 0 +1 +1 + x = (fxu0 + fyp1u3 + fzp1u6) - X_factor; + y = (fxu1 + fyp1u4 + fzp1u7) - Y_factor; + z = (fxu2 + fyp1u5 + fzp1u8) - Z_factor; + D = (x*x) + (y*y) + (z*z); + //if (D < min) { min = D; ixyz[0] = 0; ixyz[1] = 1; ixyz[2] = 1; } + if (D < min) min = D; + + // +1 -1 -1 + x = (fxp1u0 + fym1u3 + fzm1u6) - X_factor; + y = (fxp1u1 + fym1u4 + fzm1u7) - Y_factor; + z = (fxp1u2 + fym1u5 + fzm1u8) - Z_factor; + D = (x*x) + (y*y) + (z*z); + //if (D < min) { min = D; ixyz[0] = 1; ixyz[1] = -1; ixyz[2] = -1; } + if (D < min) min = D; + // +1 -1 0 + x = (fxp1u0 + fym1u3 + fzu6 ) - X_factor; + y = (fxp1u1 + fym1u4 + fzu7 ) - Y_factor; + z = (fxp1u2 + fym1u5 + fzu8 ) - Z_factor; + D = (x*x) + (y*y) + (z*z); + //if (D < min) { min = D; ixyz[0] = 1; ixyz[1] = -1; ixyz[2] = 0; } + if (D < min) min = D; + // +1 -1 +1 + x = (fxp1u0 + fym1u3 + fzp1u6) - X_factor; + y = (fxp1u1 + fym1u4 + fzp1u7) - Y_factor; + z = (fxp1u2 + fym1u5 + fzp1u8) - Z_factor; + D = (x*x) + (y*y) + (z*z); + //if (D < min) { min = D; ixyz[0] = 1; ixyz[1] = -1; ixyz[2] = 1; } + if (D < min) min = D; + // +1 0 -1 + x = (fxp1u0 + fyu3 + fzm1u6) - X_factor; + y = (fxp1u1 + fyu4 + fzm1u7) - Y_factor; + z = (fxp1u2 + fyu5 + fzm1u8) - Z_factor; + D = (x*x) + (y*y) + (z*z); + //if (D < min) { min = D; ixyz[0] = 1; ixyz[1] = 0; ixyz[2] = -1; } + if (D < min) min = D; + // +1 0 0 + x = (fxp1u0 + fyu3 + fzu6 ) - X_factor; + y = (fxp1u1 + fyu4 + fzu7 ) - Y_factor; + z = (fxp1u2 + fyu5 + fzu8 ) - Z_factor; + D = (x*x) + (y*y) + (z*z); + //if (D < min) { min = D; ixyz[0] = 1; ixyz[1] = 0; ixyz[2] = 0; } + if (D < min) min = D; + // +1 0 +1 + x = (fxp1u0 + fyu3 + fzp1u6) - X_factor; + y = (fxp1u1 + fyu4 + fzp1u7) - Y_factor; + z = (fxp1u2 + fyu5 + fzp1u8) - Z_factor; + D = (x*x) + (y*y) + (z*z); + //if (D < min) { min = D; ixyz[0] = 1; ixyz[1] = 0; ixyz[2] = 1; } + if (D < min) min = D; + // +1 +1 -1 + x = (fxp1u0 + fyp1u3 + fzm1u6) - X_factor; + y = (fxp1u1 + fyp1u4 + fzm1u7) - Y_factor; + z = (fxp1u2 + fyp1u5 + fzm1u8) - Z_factor; + D = (x*x) + (y*y) + (z*z); + //if (D < min) { min = D; ixyz[0] = 1; ixyz[1] = 1; ixyz[2] = -1; } + if (D < min) min = D; + // +1 +1 0 + x = (fxp1u0 + fyp1u3 + fzu6 ) - X_factor; + y = (fxp1u1 + fyp1u4 + fzu7 ) - Y_factor; + z = (fxp1u2 + fyp1u5 + fzu8 ) - Z_factor; + D = (x*x) + (y*y) + (z*z); + //if (D < min) { min = D; ixyz[0] = 1; ixyz[1] = 1; ixyz[2] = 0; } + if (D < min) min = D; + // +1 +1 +1 + x = (fxp1u0 + fyp1u3 + fzp1u6) - X_factor; + y = (fxp1u1 + fyp1u4 + fzp1u7) - Y_factor; + z = (fxp1u2 + fyp1u5 + fzp1u8) - Z_factor; + D = (x*x) + (y*y) + (z*z); + //if (D < min) { min = D; ixyz[0] = 1; ixyz[1] = 1; ixyz[2] = 1; } + if (D < min) min = D; + + //if (closest2 != 0.0 && min < closest2) return (min); +// this->ClosestImage(a1, a2, ixyz); +// fprintf(stdout,"DEBUG: Predict = %2i %2i %2i\n",ixyz[0],ixyz[1],ixyz[2]); + +// ix = ixyz[0]; +// iy = ixyz[1]; +// iz = ixyz[2]; + +//D = sqrt(min); +// fprintf(stdout,"DEBUG: MinDist = %2i %2i %2i = %8.3f\n", ixmin, iymin, izmin, D); +// printf("---------------------------------------------------------------\n"); + return(min); + +} +#endif diff --git a/src/cuda_kernels/core_kernels.cu b/src/cuda_kernels/core_kernels.cu index ce1225c4ce..4607ff62c3 100644 --- a/src/cuda_kernels/core_kernels.cu +++ b/src/cuda_kernels/core_kernels.cu @@ -1,4 +1,5 @@ #include "core_kernels.cuh" +#include "NonOrtho_dist2.cuh" #include "ortho_dist2.cuh" //#include // DEBUG #define BLOCKDIM 512 @@ -35,6 +36,7 @@ /** \return Shortest imaged distance^2 between given coordinates in fractional space. * NOTE: This function is complicated hence we will put into a __device__ only function. */ +/* __device__ double NonOrtho_dist2(double a0, double a1, double a2, double b0, double b1, double b2, const double *ucell) @@ -307,7 +309,7 @@ __device__ double NonOrtho_dist2(double a0, double a1, double a2, return(min); } - +*/ // ----------------------------------------------------------------------------- //try thread coarsening /** Calculate the closest distances of atoms of solvent molecules to a point. */ @@ -680,7 +682,7 @@ __global__ void kClosestDistsToPt_Nonortho(double* D_, const double* maskCenter, double x = recip[0]*SolventMols_[sIndex + offset + 0] + recip[1]*SolventMols_[sIndex + offset + 1] + recip[2]*SolventMols_[sIndex + offset + 2]; double y = recip[3]*SolventMols_[sIndex + offset + 0] + recip[4]*SolventMols_[sIndex + offset + 1] + recip[5]*SolventMols_[sIndex + offset + 2]; double z = recip[6]*SolventMols_[sIndex + offset + 0] + recip[7]*SolventMols_[sIndex + offset + 1] + recip[8]*SolventMols_[sIndex + offset + 2]; - double dist = NonOrtho_dist2(x,y,z,a0,a1,a2,ucell); + double dist = NonOrtho_dist2(x,y,z,a0,a1,a2,ucell); // if (mol == 0) // printf("dist = %f\n",dist); @@ -762,7 +764,7 @@ __global__ void kClosestDistsToAtoms_Nonortho(double*D_, double y = recip[3]*Solute_atoms[j + 0] + recip[4]*Solute_atoms[j + 1] + recip[5]*Solute_atoms[j + 2] ; double z = recip[6]*Solute_atoms[j + 0] + recip[7]*Solute_atoms[j + 1] + recip[8]*Solute_atoms[j + 2] ; - dist = NonOrtho_dist2(x,y,z,a0,a1,a2,ucell); + dist = NonOrtho_dist2(x,y,z,a0,a1,a2,ucell); //if (mol == 11) // printf("min = %f\n",min_val); min_val = min(min_val,dist); @@ -892,7 +894,7 @@ __global__ void kBinDistances_nonOverlap_nonOrtho(int* RDF, double f2y = frac[3]*a2x + frac[4]*a2y + frac[5]*a2z; double f2z = frac[6]*a2x + frac[7]*a2y + frac[8]*a2z; - double dist2 = NonOrtho_dist2(f2x, f2y, f2z, f1x ,f1y, f1z, ucell); + double dist2 = NonOrtho_dist2(f2x, f2y, f2z, f1x ,f1y, f1z, ucell); if (dist2 > 0 && dist2 <= maximum2) { double dist = sqrt(dist2); int histIdx = (int) (dist * one_over_spacing); From 54240139d60280d72847ee00b7cee5ac2e86ec2c Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Mon, 11 Jul 2022 11:21:00 -0400 Subject: [PATCH 44/57] Use the ortho dist template in the closest kernels --- src/cuda_kernels/core_kernels.cu | 318 ++----------------------------- 1 file changed, 11 insertions(+), 307 deletions(-) diff --git a/src/cuda_kernels/core_kernels.cu b/src/cuda_kernels/core_kernels.cu index 4607ff62c3..a48f03a7a2 100644 --- a/src/cuda_kernels/core_kernels.cu +++ b/src/cuda_kernels/core_kernels.cu @@ -5,311 +5,6 @@ #define BLOCKDIM 512 #define RSIZE 512 -// ----------------------------------------------------------------------------- -/** \return Shortest imaged distance^2 between given coordinates in an orthorhombic box. */ -/*__device__ double ortho_dist2(double a1x, double a1y, double a1z, - double a2x, double a2y, double a2z, - const double* box) -{ - double x = a1x - a2x; - double y = a1y - a2y; - double z = a1z - a2z; - // Get rid of sign info - if (x<0) x=-x; - if (y<0) y=-y; - if (z<0) z=-z; - // Get rid of multiples of box lengths - while (x > box[0]) x = x - box[0]; - while (y > box[1]) y = y - box[1]; - while (z > box[2]) z = z - box[2]; - // Find shortest distance in periodic reference - double D2 = box[0] - x; - if (D2 < x) x = D2; - D2 = box[1] - y; - if (D2 < y) y = D2; - D2 = box[2] - z; - if (D2 < z) z = D2; - - return (x*x + y*y + z*z); -}*/ - -/** \return Shortest imaged distance^2 between given coordinates in fractional space. - * NOTE: This function is complicated hence we will put into a __device__ only function. - */ -/* -__device__ double NonOrtho_dist2(double a0, double a1, double a2, - double b0, double b1, double b2, - const double *ucell) -{ - //int ixyz[3]; - double minIn = -1.0; - - //double closest2 - // The floor() calls serve to bring each point back in the main unit cell. - double fx = a0 - floor(a0); - double fy = a1 - floor(a1); - double fz = a2 - floor(a2); - double f2x = b0 - floor(b0); - double f2y = b1 - floor(b1); - double f2z = b2 - floor(b2); - // f2 back in Cartesian space - double X_factor = (f2x*ucell[0] + f2y*ucell[3] + f2z*ucell[6]); - double Y_factor = (f2x*ucell[1] + f2y*ucell[4] + f2z*ucell[7]); - double Z_factor = (f2x*ucell[2] + f2y*ucell[5] + f2z*ucell[8]); - // Precompute some factors - double fxm1 = fx - 1.0; - double fxp1 = fx + 1.0; - double fym1 = fy - 1.0; - double fyp1 = fy + 1.0; - double fzm1 = fz - 1.0; - double fzp1 = fz + 1.0; - - double fxm1u0 = fxm1 * ucell[0]; - double fxu0 = fx * ucell[0]; - double fxp1u0 = fxp1 * ucell[0]; - double fxm1u1 = fxm1 * ucell[1]; - double fxu1 = fx * ucell[1]; - double fxp1u1 = fxp1 * ucell[1]; - double fxm1u2 = fxm1 * ucell[2]; - double fxu2 = fx * ucell[2]; - double fxp1u2 = fxp1 * ucell[2]; - - double fym1u3 = fym1 * ucell[3]; - double fyu3 = fy * ucell[3]; - double fyp1u3 = fyp1 * ucell[3]; - double fym1u4 = fym1 * ucell[4]; - double fyu4 = fy * ucell[4]; - double fyp1u4 = fyp1 * ucell[4]; - double fym1u5 = fym1 * ucell[5]; - double fyu5 = fy * ucell[5]; - double fyp1u5 = fyp1 * ucell[5]; - - double fzm1u6 = fzm1 * ucell[6]; - double fzu6 = fz * ucell[6]; - double fzp1u6 = fzp1 * ucell[6]; - double fzm1u7 = fzm1 * ucell[7]; - double fzu7 = fz * ucell[7]; - double fzp1u7 = fzp1 * ucell[7]; - double fzm1u8 = fzm1 * ucell[8]; - double fzu8 = fz * ucell[8]; - double fzp1u8 = fzp1 * ucell[8]; - - // Calc ix iy iz = 0 case - double x = (fxu0 + fyu3 + fzu6) - X_factor; - double y = (fxu1 + fyu4 + fzu7) - Y_factor; - double z = (fxu2 + fyu5 + fzu8) - Z_factor; - // DEBUG - //mprintf("DEBUG: a2: %g %g %g\n",(fxu0 + fyu3 + fzu6), (fxu1 + fyu4 + fzu7), (fxu2 + fyu5 + fzu8)); - //mprintf("DEBUG: a1: %g %g %g\n", X_factor, Y_factor, Z_factor); - double min = (x*x) + (y*y) + (z*z); - - if (minIn > 0.0 && minIn < min) min = minIn; - - //ixyz[0] = 0; - //ixyz[1] = 0; - //ixyz[2] = 0; - - // -1 -1 -1 - x = (fxm1u0 + fym1u3 + fzm1u6) - X_factor; - y = (fxm1u1 + fym1u4 + fzm1u7) - Y_factor; - z = (fxm1u2 + fym1u5 + fzm1u8) - Z_factor; - double D = (x*x) + (y*y) + (z*z); - //if (D < min) { min = D; ixyz[0] = -1; ixyz[1] = -1; ixyz[2] = -1; } - if (D < min) min = D; - // -1 -1 0 - x = (fxm1u0 + fym1u3 + fzu6 ) - X_factor; - y = (fxm1u1 + fym1u4 + fzu7 ) - Y_factor; - z = (fxm1u2 + fym1u5 + fzu8 ) - Z_factor; - D = (x*x) + (y*y) + (z*z); - //if (D < min) { min = D; ixyz[0] = -1; ixyz[1] = -1; ixyz[2] = 0; } - if (D < min) min = D; - // -1 -1 +1 - x = (fxm1u0 + fym1u3 + fzp1u6) - X_factor; - y = (fxm1u1 + fym1u4 + fzp1u7) - Y_factor; - z = (fxm1u2 + fym1u5 + fzp1u8) - Z_factor; - D = (x*x) + (y*y) + (z*z); - //if (D < min) { min = D; ixyz[0] = -1; ixyz[1] = -1; ixyz[2] = 1; } - if (D < min) min = D; - // -1 0 -1 - x = (fxm1u0 + fyu3 + fzm1u6) - X_factor; - y = (fxm1u1 + fyu4 + fzm1u7) - Y_factor; - z = (fxm1u2 + fyu5 + fzm1u8) - Z_factor; - D = (x*x) + (y*y) + (z*z); - //if (D < min) { min = D; ixyz[0] = -1; ixyz[1] = 0; ixyz[2] = -1; } - if (D < min) min = D; - // -1 0 0 - x = (fxm1u0 + fyu3 + fzu6 ) - X_factor; - y = (fxm1u1 + fyu4 + fzu7 ) - Y_factor; - z = (fxm1u2 + fyu5 + fzu8 ) - Z_factor; - D = (x*x) + (y*y) + (z*z); - //if (D < min) { min = D; ixyz[0] = -1; ixyz[1] = 0; ixyz[2] = 0; } - if (D < min) min = D; - // -1 0 +1 - x = (fxm1u0 + fyu3 + fzp1u6) - X_factor; - y = (fxm1u1 + fyu4 + fzp1u7) - Y_factor; - z = (fxm1u2 + fyu5 + fzp1u8) - Z_factor; - D = (x*x) + (y*y) + (z*z); - //if (D < min) { min = D; ixyz[0] = -1; ixyz[1] = 0; ixyz[2] = 1; } - if (D < min) min = D; - // -1 +1 -1 - x = (fxm1u0 + fyp1u3 + fzm1u6) - X_factor; - y = (fxm1u1 + fyp1u4 + fzm1u7) - Y_factor; - z = (fxm1u2 + fyp1u5 + fzm1u8) - Z_factor; - D = (x*x) + (y*y) + (z*z); - //if (D < min) { min = D; ixyz[0] = -1; ixyz[1] = 1; ixyz[2] = -1; } - if (D < min) min = D; - // -1 +1 0 - x = (fxm1u0 + fyp1u3 + fzu6 ) - X_factor; - y = (fxm1u1 + fyp1u4 + fzu7 ) - Y_factor; - z = (fxm1u2 + fyp1u5 + fzu8 ) - Z_factor; - D = (x*x) + (y*y) + (z*z); - //if (D < min) { min = D; ixyz[0] = -1; ixyz[1] = 1; ixyz[2] = 0; } - if (D < min) min = D; - // -1 +1 +1 - x = (fxm1u0 + fyp1u3 + fzp1u6) - X_factor; - y = (fxm1u1 + fyp1u4 + fzp1u7) - Y_factor; - z = (fxm1u2 + fyp1u5 + fzp1u8) - Z_factor; - D = (x*x) + (y*y) + (z*z); - //if (D < min) { min = D; ixyz[0] = -1; ixyz[1] = 1; ixyz[2] = 1; } - if (D < min) min = D; - - // 0 -1 -1 - x = (fxu0 + fym1u3 + fzm1u6) - X_factor; - y = (fxu1 + fym1u4 + fzm1u7) - Y_factor; - z = (fxu2 + fym1u5 + fzm1u8) - Z_factor; - D = (x*x) + (y*y) + (z*z); - //if (D < min) { min = D; ixyz[0] = 0; ixyz[1] = -1; ixyz[2] = -1; } - if (D < min) min = D; - // 0 -1 0 - x = (fxu0 + fym1u3 + fzu6 ) - X_factor; - y = (fxu1 + fym1u4 + fzu7 ) - Y_factor; - z = (fxu2 + fym1u5 + fzu8 ) - Z_factor; - D = (x*x) + (y*y) + (z*z); - //if (D < min) { min = D; ixyz[0] = 0; ixyz[1] = -1; ixyz[2] = 0; } - if (D < min) min = D; - // 0 -1 +1 - x = (fxu0 + fym1u3 + fzp1u6) - X_factor; - y = (fxu1 + fym1u4 + fzp1u7) - Y_factor; - z = (fxu2 + fym1u5 + fzp1u8) - Z_factor; - D = (x*x) + (y*y) + (z*z); - //if (D < min) { min = D; ixyz[0] = 0; ixyz[1] = -1; ixyz[2] = 1; } - if (D < min) min = D; - // 0 0 -1 - x = (fxu0 + fyu3 + fzm1u6) - X_factor; - y = (fxu1 + fyu4 + fzm1u7) - Y_factor; - z = (fxu2 + fyu5 + fzm1u8) - Z_factor; - D = (x*x) + (y*y) + (z*z); - //if (D < min) { min = D; ixyz[0] = 0; ixyz[1] = 0; ixyz[2] = -1; } - if (D < min) min = D; - // 0 0 0 - // 0 0 +1 - x = (fxu0 + fyu3 + fzp1u6) - X_factor; - y = (fxu1 + fyu4 + fzp1u7) - Y_factor; - z = (fxu2 + fyu5 + fzp1u8) - Z_factor; - D = (x*x) + (y*y) + (z*z); - //if (D < min) { min = D; ixyz[0] = 0; ixyz[1] = 0; ixyz[2] = 1; } - if (D < min) min = D; - // 0 +1 -1 - x = (fxu0 + fyp1u3 + fzm1u6) - X_factor; - y = (fxu1 + fyp1u4 + fzm1u7) - Y_factor; - z = (fxu2 + fyp1u5 + fzm1u8) - Z_factor; - D = (x*x) + (y*y) + (z*z); - //if (D < min) { min = D; ixyz[0] = 0; ixyz[1] = 1; ixyz[2] = -1; } - if (D < min) min = D; - // 0 +1 0 - x = (fxu0 + fyp1u3 + fzu6 ) - X_factor; - y = (fxu1 + fyp1u4 + fzu7 ) - Y_factor; - z = (fxu2 + fyp1u5 + fzu8 ) - Z_factor; - D = (x*x) + (y*y) + (z*z); - //if (D < min) { min = D; ixyz[0] = 0; ixyz[1] = 1; ixyz[2] = 0; } - if (D < min) min = D; - // 0 +1 +1 - x = (fxu0 + fyp1u3 + fzp1u6) - X_factor; - y = (fxu1 + fyp1u4 + fzp1u7) - Y_factor; - z = (fxu2 + fyp1u5 + fzp1u8) - Z_factor; - D = (x*x) + (y*y) + (z*z); - //if (D < min) { min = D; ixyz[0] = 0; ixyz[1] = 1; ixyz[2] = 1; } - if (D < min) min = D; - - // +1 -1 -1 - x = (fxp1u0 + fym1u3 + fzm1u6) - X_factor; - y = (fxp1u1 + fym1u4 + fzm1u7) - Y_factor; - z = (fxp1u2 + fym1u5 + fzm1u8) - Z_factor; - D = (x*x) + (y*y) + (z*z); - //if (D < min) { min = D; ixyz[0] = 1; ixyz[1] = -1; ixyz[2] = -1; } - if (D < min) min = D; - // +1 -1 0 - x = (fxp1u0 + fym1u3 + fzu6 ) - X_factor; - y = (fxp1u1 + fym1u4 + fzu7 ) - Y_factor; - z = (fxp1u2 + fym1u5 + fzu8 ) - Z_factor; - D = (x*x) + (y*y) + (z*z); - //if (D < min) { min = D; ixyz[0] = 1; ixyz[1] = -1; ixyz[2] = 0; } - if (D < min) min = D; - // +1 -1 +1 - x = (fxp1u0 + fym1u3 + fzp1u6) - X_factor; - y = (fxp1u1 + fym1u4 + fzp1u7) - Y_factor; - z = (fxp1u2 + fym1u5 + fzp1u8) - Z_factor; - D = (x*x) + (y*y) + (z*z); - //if (D < min) { min = D; ixyz[0] = 1; ixyz[1] = -1; ixyz[2] = 1; } - if (D < min) min = D; - // +1 0 -1 - x = (fxp1u0 + fyu3 + fzm1u6) - X_factor; - y = (fxp1u1 + fyu4 + fzm1u7) - Y_factor; - z = (fxp1u2 + fyu5 + fzm1u8) - Z_factor; - D = (x*x) + (y*y) + (z*z); - //if (D < min) { min = D; ixyz[0] = 1; ixyz[1] = 0; ixyz[2] = -1; } - if (D < min) min = D; - // +1 0 0 - x = (fxp1u0 + fyu3 + fzu6 ) - X_factor; - y = (fxp1u1 + fyu4 + fzu7 ) - Y_factor; - z = (fxp1u2 + fyu5 + fzu8 ) - Z_factor; - D = (x*x) + (y*y) + (z*z); - //if (D < min) { min = D; ixyz[0] = 1; ixyz[1] = 0; ixyz[2] = 0; } - if (D < min) min = D; - // +1 0 +1 - x = (fxp1u0 + fyu3 + fzp1u6) - X_factor; - y = (fxp1u1 + fyu4 + fzp1u7) - Y_factor; - z = (fxp1u2 + fyu5 + fzp1u8) - Z_factor; - D = (x*x) + (y*y) + (z*z); - //if (D < min) { min = D; ixyz[0] = 1; ixyz[1] = 0; ixyz[2] = 1; } - if (D < min) min = D; - // +1 +1 -1 - x = (fxp1u0 + fyp1u3 + fzm1u6) - X_factor; - y = (fxp1u1 + fyp1u4 + fzm1u7) - Y_factor; - z = (fxp1u2 + fyp1u5 + fzm1u8) - Z_factor; - D = (x*x) + (y*y) + (z*z); - //if (D < min) { min = D; ixyz[0] = 1; ixyz[1] = 1; ixyz[2] = -1; } - if (D < min) min = D; - // +1 +1 0 - x = (fxp1u0 + fyp1u3 + fzu6 ) - X_factor; - y = (fxp1u1 + fyp1u4 + fzu7 ) - Y_factor; - z = (fxp1u2 + fyp1u5 + fzu8 ) - Z_factor; - D = (x*x) + (y*y) + (z*z); - //if (D < min) { min = D; ixyz[0] = 1; ixyz[1] = 1; ixyz[2] = 0; } - if (D < min) min = D; - // +1 +1 +1 - x = (fxp1u0 + fyp1u3 + fzp1u6) - X_factor; - y = (fxp1u1 + fyp1u4 + fzp1u7) - Y_factor; - z = (fxp1u2 + fyp1u5 + fzp1u8) - Z_factor; - D = (x*x) + (y*y) + (z*z); - //if (D < min) { min = D; ixyz[0] = 1; ixyz[1] = 1; ixyz[2] = 1; } - if (D < min) min = D; - - //if (closest2 != 0.0 && min < closest2) return (min); -// this->ClosestImage(a1, a2, ixyz); -// fprintf(stdout,"DEBUG: Predict = %2i %2i %2i\n",ixyz[0],ixyz[1],ixyz[2]); - -// ix = ixyz[0]; -// iy = ixyz[1]; -// iz = ixyz[2]; - -//D = sqrt(min); -// fprintf(stdout,"DEBUG: MinDist = %2i %2i %2i = %8.3f\n", ixmin, iymin, izmin, D); -// printf("---------------------------------------------------------------\n"); - return(min); - -} -*/ // ----------------------------------------------------------------------------- //try thread coarsening /** Calculate the closest distances of atoms of solvent molecules to a point. */ @@ -487,6 +182,11 @@ __global__ void kClosestDistsToPt_Ortho(double *D_, const double* maskCenter, double dist; for(int offset = 0 ; offset < NAtoms ; offset++ ) { + dist = ortho_dist2( a0, a1, a2, + SolventMols_[sIndex], SolventMols_[sIndex+1], SolventMols_[sIndex+2], + box ); + sIndex += 3; +/* double x = a0 - SolventMols_[sIndex++]; double y = a1 - SolventMols_[sIndex++]; double z = a2 - SolventMols_[sIndex++]; @@ -509,7 +209,7 @@ __global__ void kClosestDistsToPt_Ortho(double *D_, const double* maskCenter, if (D < z) z = D; //Dist = x*x + y*y + z*z; - dist = x*x + y*y + z*z; + dist = x*x + y*y + z*z;*/ if (box[0]==0.0 || box[1]==0.0 || box[2]==0.0) dist= -1.0; @@ -582,6 +282,10 @@ __global__ void kClosestDistsToAtoms_Ortho(double* D_, const double* SolventMols for (j = start ; j < end; j+=3 ) { //int offset = start + (j + threadIdx.x)%(end - start); + dist = ortho_dist2( Solute_atoms[j], Solute_atoms[j+1], Solute_atoms[j+2], + a0, a1, a2, + box ); +/* double x = Solute_atoms[j + 0] - a0; double y = Solute_atoms[j + 1] - a1; double z = Solute_atoms[j + 2] - a2; @@ -609,7 +313,7 @@ __global__ void kClosestDistsToAtoms_Ortho(double* D_, const double* SolventMols if (D < z) z = D; //Dist = x*x + y*y + z*z; - dist = x*x + y*y + z*z; + dist = x*x + y*y + z*z;*/ if (box[0]==0.0 || box[1]==0.0 || box[2]==0.0) dist = -1.0; From f11d1ce46e63eecf043ddaafa4dd50f570f6358d Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Mon, 11 Jul 2022 11:23:33 -0400 Subject: [PATCH 45/57] Remove old code. Add note about slower ortho calc method --- src/cuda_kernels/core_kernels.cu | 54 +------------------------------- src/cuda_kernels/ortho_dist2.cuh | 4 +++ 2 files changed, 5 insertions(+), 53 deletions(-) diff --git a/src/cuda_kernels/core_kernels.cu b/src/cuda_kernels/core_kernels.cu index a48f03a7a2..6815c73477 100644 --- a/src/cuda_kernels/core_kernels.cu +++ b/src/cuda_kernels/core_kernels.cu @@ -47,7 +47,6 @@ __global__ void kClosestDistsToPt_NoImage(double* D_, const double* maskCenter, } } - // ----------------------------------------------------------------------------- /** Calculate closest distances of atoms of solvent molecules to solute atoms. */ @@ -186,30 +185,7 @@ __global__ void kClosestDistsToPt_Ortho(double *D_, const double* maskCenter, SolventMols_[sIndex], SolventMols_[sIndex+1], SolventMols_[sIndex+2], box ); sIndex += 3; -/* - double x = a0 - SolventMols_[sIndex++]; - double y = a1 - SolventMols_[sIndex++]; - double z = a2 - SolventMols_[sIndex++]; - - // Get rid of sign info - if (x<0) x=-x; - if (y<0) y=-y; - if (z<0) z=-z; - // Get rid of multiples of box lengths - //TODO WIERD that should be a way to simplify it - while (x > box[0]) x = x - box[0]; - while (y > box[1]) y = y - box[1]; - while (z > box[2]) z = z - box[2]; - // Find shortest distance in periodic reference - double D = box[0] - x; - if (D < x) x = D; - D = box[1] - y; - if (D < y) y = D; - D = box[2] - z; - if (D < z) z = D; - - //Dist = x*x + y*y + z*z; - dist = x*x + y*y + z*z;*/ + if (box[0]==0.0 || box[1]==0.0 || box[2]==0.0) dist= -1.0; @@ -285,35 +261,7 @@ __global__ void kClosestDistsToAtoms_Ortho(double* D_, const double* SolventMols dist = ortho_dist2( Solute_atoms[j], Solute_atoms[j+1], Solute_atoms[j+2], a0, a1, a2, box ); -/* - double x = Solute_atoms[j + 0] - a0; - double y = Solute_atoms[j + 1] - a1; - double z = Solute_atoms[j + 2] - a2; - // Get rid of sign info - if (x<0) x=-x; - if (y<0) y=-y; - if (z<0) z=-z; - // Get rid of multiples of box lengths - //TODO WIERD that should be a way to simplify it - while (x > box[0]) x = x - box[0]; - while (y > box[1]) y = y - box[1]; - while (z > box[2]) z = z - box[2]; - - //below is actually slower! - //x = x - box[0]*((int)x/box[0]); - //y = y - box[0]*((int)y/box[1]); - //z = z - box[0]*((int)z/box[2]); - // Find shortest distance in periodic reference - double D = box[0] - x; - if (D < x) x = D; - D = box[1] - y; - if (D < y) y = D; - D = box[2] - z; - if (D < z) z = D; - - //Dist = x*x + y*y + z*z; - dist = x*x + y*y + z*z;*/ if (box[0]==0.0 || box[1]==0.0 || box[2]==0.0) dist = -1.0; diff --git a/src/cuda_kernels/ortho_dist2.cuh b/src/cuda_kernels/ortho_dist2.cuh index 3b13716121..c7ba2e8210 100644 --- a/src/cuda_kernels/ortho_dist2.cuh +++ b/src/cuda_kernels/ortho_dist2.cuh @@ -17,6 +17,10 @@ __device__ T ortho_dist2(T a1x, T a1y, T a1z, while (x > box[0]) x = x - box[0]; while (y > box[1]) y = y - box[1]; while (z > box[2]) z = z - box[2]; + //below is actually slower! + //x = x - box[0]*((int)x/box[0]); + //y = y - box[0]*((int)y/box[1]); + //z = z - box[0]*((int)z/box[2]); // Find shortest distance in periodic reference T D2 = box[0] - x; if (D2 < x) x = D2; From 8e1b7343e6ddfa43fafa9efba950b5251ca070ca Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Mon, 11 Jul 2022 11:29:08 -0400 Subject: [PATCH 46/57] Start adding a gpu floating point type definition --- src/Action_Radial.cpp | 19 ++++++++++--------- src/Gpu.h | 2 ++ 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/src/Action_Radial.cpp b/src/Action_Radial.cpp index 51a3fb8948..ded0c1646f 100644 --- a/src/Action_Radial.cpp +++ b/src/Action_Radial.cpp @@ -8,6 +8,7 @@ # include #endif #ifdef CUDA +# include "Gpu.h" # include "cuda_kernels/kernel_rdf.cuh" #endif @@ -455,15 +456,15 @@ Action::RetType Action_Radial::Setup(ActionSetup& setup) { } #ifdef CUDA -static inline std::vector mask_to_xyz(AtomMask const& Mask, Frame const& frm) +static inline std::vector mask_to_xyz(AtomMask const& Mask, Frame const& frm) { - std::vector outerxyz; + std::vector outerxyz; outerxyz.reserve( Mask.Nselected()*3 ); for (AtomMask::const_iterator at = Mask.begin(); at != Mask.end(); ++at) { const double* xyz = frm.XYZ( *at ); - outerxyz.push_back( xyz[0] ); - outerxyz.push_back( xyz[1] ); - outerxyz.push_back( xyz[2] ); + outerxyz.push_back( (CpptrajGpu::FpType)xyz[0] ); + outerxyz.push_back( (CpptrajGpu::FpType)xyz[1] ); + outerxyz.push_back( (CpptrajGpu::FpType)xyz[2] ); } return outerxyz; } @@ -558,10 +559,10 @@ Action::RetType Action_Radial::DoAction(int frameNum, ActionFrame& frm) { if ( rmode_ == NORMAL ) { #ifdef CUDA // Copy atoms for GPU - std::vector outerxyz = mask_to_xyz(OuterMask_, frm.Frm()); - const double* outerxyzPtr = &outerxyz[0]; - std::vector innerxyz; - const double* innerxyzPtr = 0; + std::vector outerxyz = mask_to_xyz(OuterMask_, frm.Frm()); + const CpptrajGpu::FpType* outerxyzPtr = &outerxyz[0]; + std::vector innerxyz; + const CpptrajGpu::FpType* innerxyzPtr = 0; if (!mask2_is_mask1_) { innerxyz = mask_to_xyz(InnerMask_, frm.Frm()); innerxyzPtr = &innerxyz[0]; diff --git a/src/Gpu.h b/src/Gpu.h index 30e15212c1..c6a433cab4 100644 --- a/src/Gpu.h +++ b/src/Gpu.h @@ -5,5 +5,7 @@ namespace CpptrajGpu { void SetComputeVersion(int); /// \return Max block dimensions if using 2D blocks unsigned int MaxBlockDim_2D(); +/// Default floating point type +typedef double FpType; } #endif From 163e1d42aa2c7f37b7ff316194a5034b9358ed2d Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Mon, 11 Jul 2022 11:42:45 -0400 Subject: [PATCH 47/57] Use CpptrajGpu::FpType for GPU radial calcs. --- src/Action_Radial.cpp | 15 ++++-- src/cpptrajdepend | 2 +- src/cuda_kernels/core_kernels.cu | 78 +++++++++++++++---------------- src/cuda_kernels/core_kernels.cuh | 13 +++--- src/cuda_kernels/cudadepend | 4 +- src/cuda_kernels/kernel_rdf.cu | 37 +++++++-------- src/cuda_kernels/kernel_rdf.cuh | 7 ++- 7 files changed, 84 insertions(+), 72 deletions(-) diff --git a/src/Action_Radial.cpp b/src/Action_Radial.cpp index ded0c1646f..7be765cf05 100644 --- a/src/Action_Radial.cpp +++ b/src/Action_Radial.cpp @@ -567,13 +567,22 @@ Action::RetType Action_Radial::DoAction(int frameNum, ActionFrame& frm) { innerxyz = mask_to_xyz(InnerMask_, frm.Frm()); innerxyzPtr = &innerxyz[0]; } + CpptrajGpu::FpType gpu_box[3]; + gpu_box[0] = (CpptrajGpu::FpType)frm.Frm().BoxCrd().Param(Box::X); + gpu_box[1] = (CpptrajGpu::FpType)frm.Frm().BoxCrd().Param(Box::Y); + gpu_box[2] = (CpptrajGpu::FpType)frm.Frm().BoxCrd().Param(Box::Z); + CpptrajGpu::FpType gpu_ucell[9], gpu_frac[9]; + for (int ibox = 0; ibox != 9; ibox++) { + gpu_ucell[ibox] = (CpptrajGpu::FpType)frm.Frm().BoxCrd().UnitCell()[ibox]; + gpu_frac[ibox] = (CpptrajGpu::FpType)frm.Frm().BoxCrd().FracCell()[ibox]; + } Cpptraj_GPU_RDF( &RDF_[0], RDF_.size(), maximum2_, one_over_spacing_, outerxyzPtr, OuterMask_.Nselected(), innerxyzPtr, InnerMask_.Nselected(), imageOpt_.ImagingType(), - frm.Frm().BoxCrd().XyzPtr(), - frm.Frm().BoxCrd().UnitCell().Dptr(), - frm.Frm().BoxCrd().FracCell().Dptr() ); + gpu_box, + gpu_ucell, + gpu_frac ); #else /* CUDA */ // Calculation of all atoms in Mask1 to all atoms in Mask2 if (mask2_is_mask1_) diff --git a/src/cpptrajdepend b/src/cpptrajdepend index 08bf715d8c..ec995acc7c 100644 --- a/src/cpptrajdepend +++ b/src/cpptrajdepend @@ -64,7 +64,7 @@ Action_Principal.o : Action_Principal.cpp Action.h ActionState.h Action_Principa Action_Projection.o : Action_Projection.cpp Action.h ActionFrameCounter.h ActionState.h Action_Projection.h ArgList.h Array1D.h ArrayIterator.h AssociatedData.h Atom.h AtomMask.h AtomType.h BaseIOtype.h Box.h Constants.h CoordinateInfo.h CpptrajFile.h CpptrajStdio.h DataFile.h DataFileList.h DataSet.h DataSetList.h DataSet_1D.h DataSet_2D.h DataSet_Coords.h DataSet_Coords_REF.h DataSet_MatrixDbl.h DataSet_Modes.h Dimension.h DispatchObject.h FileIO.h FileName.h FileTypes.h Frame.h MaskToken.h Matrix.h Matrix_3x3.h MetaData.h Molecule.h NameType.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h Range.h ReferenceFrame.h ReplicaDimArray.h Residue.h Segment.h StringRoutines.h SymbolExporting.h TextFormat.h Timer.h Topology.h TypeNameHolder.h Unit.h Vec3.h Action_Pucker.o : Action_Pucker.cpp Action.h ActionState.h Action_Pucker.h ArgList.h AssociatedData.h Atom.h AtomMask.h AtomType.h BaseIOtype.h Box.h Constants.h CoordinateInfo.h CpptrajFile.h CpptrajStdio.h DataFile.h DataFileList.h DataSet.h DataSetList.h DataSet_Coords.h DataSet_Coords_REF.h Dimension.h DispatchObject.h FileIO.h FileName.h FileTypes.h Frame.h MaskToken.h Matrix_3x3.h MetaData.h Molecule.h NameType.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h Range.h ReferenceFrame.h ReplicaDimArray.h Residue.h Segment.h SymbolExporting.h TextFormat.h Timer.h Topology.h TorsionRoutines.h TypeNameHolder.h Unit.h Vec3.h Action_Radgyr.o : Action_Radgyr.cpp Action.h ActionState.h Action_Radgyr.h ArgList.h AssociatedData.h Atom.h AtomMask.h AtomType.h BaseIOtype.h Box.h Constants.h CoordinateInfo.h CpptrajFile.h CpptrajStdio.h DataFile.h DataFileList.h DataSet.h DataSetList.h DataSet_Coords.h DataSet_Coords_REF.h Dimension.h DispatchObject.h FileIO.h FileName.h FileTypes.h Frame.h MaskToken.h Matrix_3x3.h MetaData.h Molecule.h NameType.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h Range.h ReferenceFrame.h ReplicaDimArray.h Residue.h Segment.h SymbolExporting.h TextFormat.h Timer.h Topology.h TypeNameHolder.h Unit.h Vec3.h -Action_Radial.o : Action_Radial.cpp Action.h ActionState.h Action_Radial.h ArgList.h AssociatedData.h Atom.h AtomMask.h AtomType.h BaseIOtype.h Box.h Constants.h CoordinateInfo.h CpptrajFile.h CpptrajStdio.h DataFile.h DataFileList.h DataSet.h DataSetList.h DataSet_Coords.h DataSet_Coords_REF.h Dimension.h DispatchObject.h DistRoutines.h FileIO.h FileName.h FileTypes.h Frame.h ImageOption.h MaskToken.h Matrix_3x3.h MetaData.h Molecule.h NameType.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h Range.h ReferenceFrame.h ReplicaDimArray.h Residue.h Segment.h SymbolExporting.h TextFormat.h Timer.h Topology.h TypeNameHolder.h Unit.h Vec3.h cuda_kernels/kernel_rdf.cuh +Action_Radial.o : Action_Radial.cpp Action.h ActionState.h Action_Radial.h ArgList.h AssociatedData.h Atom.h AtomMask.h AtomType.h BaseIOtype.h Box.h Constants.h CoordinateInfo.h CpptrajFile.h CpptrajStdio.h DataFile.h DataFileList.h DataSet.h DataSetList.h DataSet_Coords.h DataSet_Coords_REF.h Dimension.h DispatchObject.h DistRoutines.h FileIO.h FileName.h FileTypes.h Frame.h Gpu.h ImageOption.h MaskToken.h Matrix_3x3.h MetaData.h Molecule.h NameType.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h Range.h ReferenceFrame.h ReplicaDimArray.h Residue.h Segment.h SymbolExporting.h TextFormat.h Timer.h Topology.h TypeNameHolder.h Unit.h Vec3.h cuda_kernels/kernel_rdf.cuh Action_RandomizeIons.o : Action_RandomizeIons.cpp Action.h ActionState.h Action_RandomizeIons.h ArgList.h AssociatedData.h Atom.h AtomMask.h AtomType.h BaseIOtype.h Box.h Constants.h CoordinateInfo.h CpptrajFile.h CpptrajStdio.h DataFile.h DataFileList.h DataSet.h DataSetList.h DataSet_Coords.h DataSet_Coords_REF.h Dimension.h DispatchObject.h DistRoutines.h FileIO.h FileName.h FileTypes.h Frame.h ImageOption.h MaskToken.h Matrix_3x3.h MetaData.h Molecule.h NameType.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h Random.h Range.h ReferenceFrame.h ReplicaDimArray.h Residue.h Segment.h SymbolExporting.h TextFormat.h Timer.h Topology.h TypeNameHolder.h Unit.h Vec3.h Action_Remap.o : Action_Remap.cpp Action.h ActionState.h ActionTopWriter.h Action_Remap.h ArgList.h AssociatedData.h Atom.h AtomMask.h AtomType.h BaseIOtype.h Box.h Constants.h CoordinateInfo.h CpptrajFile.h CpptrajStdio.h DataFile.h DataFileList.h DataSet.h DataSetList.h DataSet_1D.h DataSet_Coords.h DataSet_Coords_REF.h Dimension.h DispatchObject.h FileIO.h FileName.h FileTypes.h Frame.h MaskToken.h Matrix_3x3.h MetaData.h Molecule.h NameType.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h Range.h ReferenceFrame.h ReplicaDimArray.h Residue.h Segment.h SymbolExporting.h TextFormat.h Timer.h Topology.h TypeNameHolder.h Unit.h Vec3.h Action_ReplicateCell.o : Action_ReplicateCell.cpp Action.h ActionFrameCounter.h ActionState.h ActionTopWriter.h Action_ReplicateCell.h ArgList.h AssociatedData.h Atom.h AtomMask.h AtomType.h BaseIOtype.h Box.h Constants.h CoordinateInfo.h CpptrajFile.h CpptrajStdio.h DataFile.h DataFileList.h DataSet.h DataSetList.h DataSet_Coords.h DataSet_Coords_REF.h Dimension.h DispatchObject.h FileIO.h FileName.h FileTypes.h Frame.h MaskToken.h Matrix_3x3.h MetaData.h Molecule.h NameType.h OutputTrajCommon.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h Range.h ReferenceFrame.h ReplicaDimArray.h Residue.h Segment.h SymbolExporting.h TextFormat.h Timer.h Topology.h TrajectoryFile.h Trajout_Single.h TypeNameHolder.h Unit.h Vec3.h diff --git a/src/cuda_kernels/core_kernels.cu b/src/cuda_kernels/core_kernels.cu index 6815c73477..a412a63f34 100644 --- a/src/cuda_kernels/core_kernels.cu +++ b/src/cuda_kernels/core_kernels.cu @@ -457,26 +457,26 @@ __global__ void kClosestDistsToAtoms_Nonortho(double*D_, // ----------------------------------------------------------------------------- /** Bin distances from two non-overlapping sets of coords. */ __global__ void kBinDistances_nonOverlap_NoImage(int* RDF, - const double* xyz1, int N1, const double* xyz2, int N2, - double maximum2, double one_over_spacing) + const CpptrajGpu::FpType* xyz1, int N1, const CpptrajGpu::FpType* xyz2, int N2, + CpptrajGpu::FpType maximum2, CpptrajGpu::FpType one_over_spacing) { int a1 = blockIdx.x * blockDim.x + threadIdx.x; int a2 = blockIdx.y * blockDim.y + threadIdx.y; if (a1 < N1 && a2 < N2) { int idx1 = a1 * 3; - double a1x = xyz1[idx1 ]; - double a1y = xyz1[idx1+1]; - double a1z = xyz1[idx1+2]; + CpptrajGpu::FpType a1x = xyz1[idx1 ]; + CpptrajGpu::FpType a1y = xyz1[idx1+1]; + CpptrajGpu::FpType a1z = xyz1[idx1+2]; int idx2 = a2 * 3; - double x = a1x - xyz2[idx2 ]; - double y = a1y - xyz2[idx2+1]; - double z = a1z - xyz2[idx2+2]; + CpptrajGpu::FpType x = a1x - xyz2[idx2 ]; + CpptrajGpu::FpType y = a1y - xyz2[idx2+1]; + CpptrajGpu::FpType z = a1z - xyz2[idx2+2]; - double dist2 = (x*x) + (y*y) + (z*z); + CpptrajGpu::FpType dist2 = (x*x) + (y*y) + (z*z); if (dist2 > 0 && dist2 <= maximum2) { - double dist = sqrt(dist2); + CpptrajGpu::FpType dist = sqrt(dist2); int histIdx = (int) (dist * one_over_spacing); //printf("DEBUG: a1= %i a2= %i dist= %f bin=%i\n", a1+1, a2+1, dist, histIdx); //printf("DEBUG: xyz1= %f %f %f\n", a1x, a1y, a1z); @@ -489,27 +489,27 @@ __global__ void kBinDistances_nonOverlap_NoImage(int* RDF, /** Bin distances from two non-overlapping sets of coords. */ __global__ void kBinDistances_nonOverlap_Ortho(int* RDF, - const double* xyz1, int N1, const double* xyz2, int N2, - const double* box, - double maximum2, double one_over_spacing) + const CpptrajGpu::FpType* xyz1, int N1, const CpptrajGpu::FpType* xyz2, int N2, + const CpptrajGpu::FpType* box, + CpptrajGpu::FpType maximum2, CpptrajGpu::FpType one_over_spacing) { int a1 = blockIdx.x * blockDim.x + threadIdx.x; int a2 = blockIdx.y * blockDim.y + threadIdx.y; if (a1 < N1 && a2 < N2) { int idx1 = a1 * 3; - double a1x = xyz1[idx1 ]; - double a1y = xyz1[idx1+1]; - double a1z = xyz1[idx1+2]; + CpptrajGpu::FpType a1x = xyz1[idx1 ]; + CpptrajGpu::FpType a1y = xyz1[idx1+1]; + CpptrajGpu::FpType a1z = xyz1[idx1+2]; int idx2 = a2 * 3; - double a2x = xyz2[idx2 ]; - double a2y = xyz2[idx2+1]; - double a2z = xyz2[idx2+2]; + CpptrajGpu::FpType a2x = xyz2[idx2 ]; + CpptrajGpu::FpType a2y = xyz2[idx2+1]; + CpptrajGpu::FpType a2z = xyz2[idx2+2]; - double dist2 = ortho_dist2(a1x, a1y, a1z, a2x, a2y, a2z, box); + CpptrajGpu::FpType dist2 = ortho_dist2(a1x, a1y, a1z, a2x, a2y, a2z, box); if (dist2 > 0 && dist2 <= maximum2) { - double dist = sqrt(dist2); + CpptrajGpu::FpType dist = sqrt(dist2); int histIdx = (int) (dist * one_over_spacing); //printf("DEBUG: a1= %i a2= %i dist= %f bin=%i\n", a1+1, a2+1, dist, histIdx); //printf("DEBUG: xyz1= %f %f %f\n", a1x, a1y, a1z); @@ -522,33 +522,33 @@ __global__ void kBinDistances_nonOverlap_Ortho(int* RDF, /** Bin distances from two non-overlapping sets of coords. */ __global__ void kBinDistances_nonOverlap_nonOrtho(int* RDF, - const double* xyz1, int N1, const double* xyz2, int N2, - const double* frac, const double* ucell, - double maximum2, double one_over_spacing) + const CpptrajGpu::FpType* xyz1, int N1, const CpptrajGpu::FpType* xyz2, int N2, + const CpptrajGpu::FpType* frac, const CpptrajGpu::FpType* ucell, + CpptrajGpu::FpType maximum2, CpptrajGpu::FpType one_over_spacing) { int a1 = blockIdx.x * blockDim.x + threadIdx.x; int a2 = blockIdx.y * blockDim.y + threadIdx.y; if (a1 < N1 && a2 < N2) { int idx1 = a1 * 3; - double a1x = xyz1[idx1 ]; - double a1y = xyz1[idx1+1]; - double a1z = xyz1[idx1+2]; - double f1x = frac[0]*a1x + frac[1]*a1y + frac[2]*a1z; - double f1y = frac[3]*a1x + frac[4]*a1y + frac[5]*a1z; - double f1z = frac[6]*a1x + frac[7]*a1y + frac[8]*a1z; + CpptrajGpu::FpType a1x = xyz1[idx1 ]; + CpptrajGpu::FpType a1y = xyz1[idx1+1]; + CpptrajGpu::FpType a1z = xyz1[idx1+2]; + CpptrajGpu::FpType f1x = frac[0]*a1x + frac[1]*a1y + frac[2]*a1z; + CpptrajGpu::FpType f1y = frac[3]*a1x + frac[4]*a1y + frac[5]*a1z; + CpptrajGpu::FpType f1z = frac[6]*a1x + frac[7]*a1y + frac[8]*a1z; int idx2 = a2 * 3; - double a2x = xyz2[idx2 ]; - double a2y = xyz2[idx2+1]; - double a2z = xyz2[idx2+2]; - double f2x = frac[0]*a2x + frac[1]*a2y + frac[2]*a2z; - double f2y = frac[3]*a2x + frac[4]*a2y + frac[5]*a2z; - double f2z = frac[6]*a2x + frac[7]*a2y + frac[8]*a2z; - - double dist2 = NonOrtho_dist2(f2x, f2y, f2z, f1x ,f1y, f1z, ucell); + CpptrajGpu::FpType a2x = xyz2[idx2 ]; + CpptrajGpu::FpType a2y = xyz2[idx2+1]; + CpptrajGpu::FpType a2z = xyz2[idx2+2]; + CpptrajGpu::FpType f2x = frac[0]*a2x + frac[1]*a2y + frac[2]*a2z; + CpptrajGpu::FpType f2y = frac[3]*a2x + frac[4]*a2y + frac[5]*a2z; + CpptrajGpu::FpType f2z = frac[6]*a2x + frac[7]*a2y + frac[8]*a2z; + + CpptrajGpu::FpType dist2 = NonOrtho_dist2(f2x, f2y, f2z, f1x ,f1y, f1z, ucell); if (dist2 > 0 && dist2 <= maximum2) { - double dist = sqrt(dist2); + CpptrajGpu::FpType dist = sqrt(dist2); int histIdx = (int) (dist * one_over_spacing); //printf("DEBUG: a1= %i a2= %i dist= %f bin=%i\n", a1+1, a2+1, dist, histIdx); //printf("DEBUG: xyz1= %f %f %f\n", a1x, a1y, a1z); diff --git a/src/cuda_kernels/core_kernels.cuh b/src/cuda_kernels/core_kernels.cuh index b166d2536c..c49816c045 100644 --- a/src/cuda_kernels/core_kernels.cuh +++ b/src/cuda_kernels/core_kernels.cuh @@ -1,5 +1,6 @@ #ifndef INC_CORE_KERNELS_CUH #define INC_CORE_KERNELS_CUH +#include "../Gpu.h" #if defined(__HIP_PLATFORM_HCC__) #include #endif @@ -14,12 +15,12 @@ __global__ void kClosestDistsToAtoms_Ortho(double*,const double*,const double*,d __global__ void kClosestDistsToPt_Nonortho(double*,const double*,const double*,double,const double*,const double*,int,int,int); __global__ void kClosestDistsToAtoms_Nonortho(double*,const double*,const double*,double,const double*,const double*,int,int,int,int); // RDF no imaging -__global__ void kBinDistances_nonOverlap_NoImage(int*, const double*, int, const double*, int, - double, double); +__global__ void kBinDistances_nonOverlap_NoImage(int*, const CpptrajGpu::FpType*, int, const CpptrajGpu::FpType*, int, + CpptrajGpu::FpType, CpptrajGpu::FpType); // RDF ortho imaging -__global__ void kBinDistances_nonOverlap_Ortho(int*, const double*, int, const double*, int, - const double*, double, double); +__global__ void kBinDistances_nonOverlap_Ortho(int*, const CpptrajGpu::FpType*, int, const CpptrajGpu::FpType*, int, + const CpptrajGpu::FpType*, CpptrajGpu::FpType, CpptrajGpu::FpType); // RDF nonortho imaging -__global__ void kBinDistances_nonOverlap_nonOrtho(int*, const double*, int, const double*, int, - const double*, const double*, double, double); +__global__ void kBinDistances_nonOverlap_nonOrtho(int*, const CpptrajGpu::FpType*, int, const CpptrajGpu::FpType*, int, + const CpptrajGpu::FpType*, const CpptrajGpu::FpType*, CpptrajGpu::FpType, CpptrajGpu::FpType); #endif diff --git a/src/cuda_kernels/cudadepend b/src/cuda_kernels/cudadepend index 9c8586d128..d28d3ad970 100644 --- a/src/cuda_kernels/cudadepend +++ b/src/cuda_kernels/cudadepend @@ -1,5 +1,5 @@ GistCudaCalc.o : GistCudaCalc.cu GistCudaCalc.cuh GistCudaSetup.o : GistCudaSetup.cu ../HipDefinitions.h GistCudaCalc.cuh GistCudaSetup.cuh -core_kernels.o : core_kernels.cu core_kernels.cuh +core_kernels.o : core_kernels.cu ../Gpu.h NonOrtho_dist2.cuh core_kernels.cuh ortho_dist2.cuh kernel_rdf.o : kernel_rdf.cu ../CpptrajStdio.h ../Gpu.h ../HipDefinitions.h ../ImageOption.h core_kernels.cuh kernel_rdf.cuh -kernel_wrappers.o : kernel_wrappers.cu ../HipDefinitions.h ../ImageOption.h core_kernels.cuh kernel_wrappers.cuh +kernel_wrappers.o : kernel_wrappers.cu ../Gpu.h ../HipDefinitions.h ../ImageOption.h core_kernels.cuh kernel_wrappers.cuh diff --git a/src/cuda_kernels/kernel_rdf.cu b/src/cuda_kernels/kernel_rdf.cu index 3310c7de65..afb536ed69 100644 --- a/src/cuda_kernels/kernel_rdf.cu +++ b/src/cuda_kernels/kernel_rdf.cu @@ -1,7 +1,6 @@ #include "kernel_rdf.cuh" #include "core_kernels.cuh" #include "../CpptrajStdio.h" -#include "../Gpu.h" #if defined(__HIP_PLATFORM_HCC__) #include #include "../HipDefinitions.h" @@ -27,25 +26,25 @@ static inline int Cuda_check(cudaError_t err, const char* desc) { } /** Calculate distances between pairs of atoms and bin them into a 1D histogram. */ -int Cpptraj_GPU_RDF(unsigned long* bins, int nbins, double maximum2, double one_over_spacing, - const double* xyz1, int N1, - const double* xyz2, int N2, +int Cpptraj_GPU_RDF(unsigned long* bins, int nbins, CpptrajGpu::FpType maximum2, CpptrajGpu::FpType one_over_spacing, + const CpptrajGpu::FpType* xyz1, int N1, + const CpptrajGpu::FpType* xyz2, int N2, ImageOption::Type imageType, - const double* box, const double* ucell, const double* recip) + const CpptrajGpu::FpType* box, const CpptrajGpu::FpType* ucell, const CpptrajGpu::FpType* recip) { int* device_rdf; Cuda_check(cudaMalloc(((void**)(&device_rdf)), nbins * sizeof(int)), "Allocating rdf bins"); Cuda_check(cudaMemset( device_rdf, 0, nbins*sizeof(int) ), "Setting rdf bins to 0"); - double* device_xyz1; - Cuda_check(cudaMalloc(((void**)(&device_xyz1)), N1 * 3 * sizeof(double)), "Allocating xyz1"); - Cuda_check(cudaMemcpy(device_xyz1, xyz1, N1 * 3 * sizeof(double), cudaMemcpyHostToDevice), "Copying xyz1"); + CpptrajGpu::FpType* device_xyz1; + Cuda_check(cudaMalloc(((void**)(&device_xyz1)), N1 * 3 * sizeof(CpptrajGpu::FpType)), "Allocating xyz1"); + Cuda_check(cudaMemcpy(device_xyz1, xyz1, N1 * 3 * sizeof(CpptrajGpu::FpType), cudaMemcpyHostToDevice), "Copying xyz1"); - double* device_xyz2; + CpptrajGpu::FpType* device_xyz2; //int* device_rows1; if (xyz2 != 0) { - Cuda_check(cudaMalloc(((void**)(&device_xyz2)), N2 * 3 * sizeof(double)), "Allocating xyz2"); - Cuda_check(cudaMemcpy(device_xyz2, xyz2, N2 * 3 * sizeof(double), cudaMemcpyHostToDevice), "Copying xyz2"); + Cuda_check(cudaMalloc(((void**)(&device_xyz2)), N2 * 3 * sizeof(CpptrajGpu::FpType)), "Allocating xyz2"); + Cuda_check(cudaMemcpy(device_xyz2, xyz2, N2 * 3 * sizeof(CpptrajGpu::FpType), cudaMemcpyHostToDevice), "Copying xyz2"); } else { device_xyz2 = device_xyz1; // Create array of upper triangle row start indices TODO long ints? @@ -67,16 +66,16 @@ int Cpptraj_GPU_RDF(unsigned long* bins, int nbins, double maximum2, double one_ } - double *boxDev; - double *ucellDev, *fracDev; + CpptrajGpu::FpType *boxDev; + CpptrajGpu::FpType *ucellDev, *fracDev; if (imageType == ImageOption::ORTHO) { - Cuda_check(cudaMalloc(((void**)(&boxDev)), 3 * sizeof(double)), "Allocating box"); - Cuda_check(cudaMemcpy(boxDev,box, 3 * sizeof(double), cudaMemcpyHostToDevice), "Copying box"); + Cuda_check(cudaMalloc(((void**)(&boxDev)), 3 * sizeof(CpptrajGpu::FpType)), "Allocating box"); + Cuda_check(cudaMemcpy(boxDev,box, 3 * sizeof(CpptrajGpu::FpType), cudaMemcpyHostToDevice), "Copying box"); } else if (imageType == ImageOption::NONORTHO) { - Cuda_check(cudaMalloc(((void**)(&ucellDev)), 9 * sizeof(double)), "Allocating ucell"); - Cuda_check(cudaMalloc(((void**)(&fracDev)), 9 * sizeof(double)), "Allocating frac"); - Cuda_check(cudaMemcpy(ucellDev,ucell, 9 * sizeof(double), cudaMemcpyHostToDevice), "Copying ucell"); - Cuda_check(cudaMemcpy(fracDev,recip, 9 * sizeof(double), cudaMemcpyHostToDevice), "Copying frac"); + Cuda_check(cudaMalloc(((void**)(&ucellDev)), 9 * sizeof(CpptrajGpu::FpType)), "Allocating ucell"); + Cuda_check(cudaMalloc(((void**)(&fracDev)), 9 * sizeof(CpptrajGpu::FpType)), "Allocating frac"); + Cuda_check(cudaMemcpy(ucellDev,ucell, 9 * sizeof(CpptrajGpu::FpType), cudaMemcpyHostToDevice), "Copying ucell"); + Cuda_check(cudaMemcpy(fracDev,recip, 9 * sizeof(CpptrajGpu::FpType), cudaMemcpyHostToDevice), "Copying frac"); } // Determine number of blocks diff --git a/src/cuda_kernels/kernel_rdf.cuh b/src/cuda_kernels/kernel_rdf.cuh index 8f70904057..3f6f2a7d80 100644 --- a/src/cuda_kernels/kernel_rdf.cuh +++ b/src/cuda_kernels/kernel_rdf.cuh @@ -1,6 +1,9 @@ #ifndef INC_KERNEL_RDF_CUH #define INC_KERNEL_RDF_CUH +#include "../Gpu.h" #include "../ImageOption.h" -int Cpptraj_GPU_RDF(unsigned long*, int, double, double, const double*, int, const double*, int, - ImageOption::Type, const double*, const double*, const double*); +int Cpptraj_GPU_RDF(unsigned long*, int, CpptrajGpu::FpType, CpptrajGpu::FpType, + const CpptrajGpu::FpType*, int, const CpptrajGpu::FpType*, int, + ImageOption::Type, const CpptrajGpu::FpType*, + const CpptrajGpu::FpType*, const CpptrajGpu::FpType*); #endif From 14412a7b9efc0c0b92a6849c54579fe8c150d2ab Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Mon, 11 Jul 2022 11:51:08 -0400 Subject: [PATCH 48/57] Change to floating point --- src/Gpu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Gpu.h b/src/Gpu.h index c6a433cab4..a637fbca64 100644 --- a/src/Gpu.h +++ b/src/Gpu.h @@ -6,6 +6,6 @@ void SetComputeVersion(int); /// \return Max block dimensions if using 2D blocks unsigned int MaxBlockDim_2D(); /// Default floating point type -typedef double FpType; +typedef float FpType; } #endif From c82d698081edefd7041d8fe42f54ed10c8b957a6 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Mon, 11 Jul 2022 12:26:20 -0400 Subject: [PATCH 49/57] Add tolerance for GPU radial calc, now done in single precision --- test/MasterTest.sh | 17 +++++++++++++++-- test/Test_Radial/RunTest.sh | 18 +++++++++++++----- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/test/MasterTest.sh b/test/MasterTest.sh index 0073be6f73..581feee1ae 100644 --- a/test/MasterTest.sh +++ b/test/MasterTest.sh @@ -75,6 +75,7 @@ WARNCOUNT=0 # Total number of warnings detected by DoTest this test PROGCOUNT=0 # Total number of times RunCpptraj has been called this test. PROGERROR=0 # Total number of program errors this test CHECKERR=0 # Total errors this test from CheckEnv routine. +CHECKSILENT=0 # If 1 make the TestLibrary routine silent TESTNAME='' # Current test name for Requires routine. UNITNAME='' # Current unit name for CheckFor routine. DESCRIP='' # Current test/unit name for CheckEnv routine. @@ -988,8 +989,10 @@ SkipCheck() { # CheckDefines. TestLibrary() { if [ -z "$2" ] ; then - echo " $DESCRIP requires $1." - echo " Cpptraj was compiled without $1 support." + if [ $CHECKSILENT -eq 0 ] ; then + echo " $DESCRIP requires $1." + echo " Cpptraj was compiled without $1 support." + fi ((CHECKERR++)) fi } @@ -1172,6 +1175,16 @@ CheckFor() { return 0 } +# CheckForSilent() +# \return > 0 if anything in is not present, 0 otherwise. +CheckForSilent() { + CHECKSILENT=1 + # Give generic name + DESCRIP='Test' + CheckEnv $* + return $CHECKERR +} + # ------------------------------------------------------------------------------ # TODO remove all deprecated functions below. # FIXME This is a stub and should be removed diff --git a/test/Test_Radial/RunTest.sh b/test/Test_Radial/RunTest.sh index 24ded92264..be0a1e96d3 100755 --- a/test/Test_Radial/RunTest.sh +++ b/test/Test_Radial/RunTest.sh @@ -10,6 +10,14 @@ CleanFiles radial.in Radial.agr cRadial.agr WatO-Trp4.agr WatO-Trp4.raw.agr \ TESTNAME='Radial tests' Requires netcdf maxthreads 10 +# CUDA requires a tolerance since it is done in single precision +TOL_ARG='' +CheckForSilent cuda +if [ $? -eq 0 ] ; then + TOL_ARG='-a 0.0002' + echo "Lowering tolerance for single-precision CUDA: $TOL_ARG" +fi + INPUT="-i radial.in" UNITNAME='Radial test, non-orthogonal imaging' @@ -78,7 +86,7 @@ trajin ../tz2.truncoct.nc radial WatO-Prot :WAT@O ^1 out tz2.WatO-Prot.agr 0.25 15.0 EOF RunCpptraj "$UNITNAME" -DoTest tz2.WatO-Prot.agr.save tz2.WatO-Prot.agr +DoTest tz2.WatO-Prot.agr.save tz2.WatO-Prot.agr $TOL_ARG UNITNAME='Radial test, water -> water, non-orthogonal cell' CheckFor maxthreads 2 @@ -89,7 +97,7 @@ trajin ../tz2.truncoct.nc 1 2 radial WatO :WAT@O out tz2.WatO.agr 0.25 15.0 EOF RunCpptraj "$UNITNAME" - DoTest tz2.WatO.agr.save tz2.WatO.agr + DoTest tz2.WatO.agr.save tz2.WatO.agr $TOL_ARG fi UNITNAME='Radial test, water -> protein, orthogonal cell' @@ -99,7 +107,7 @@ trajin ../tz2.ortho.nc radial WatO-Prot :WAT@O ^1 out tz2ortho.WatO-Prot.agr 0.25 15.0 EOF RunCpptraj "$UNITNAME" -DoTest tz2ortho.WatO-Prot.agr.save tz2ortho.WatO-Prot.agr +DoTest tz2ortho.WatO-Prot.agr.save tz2ortho.WatO-Prot.agr $TOL_ARG UNITNAME='Radial test, water -> water, orthogonal cell' CheckFor maxthreads 2 @@ -110,7 +118,7 @@ trajin ../tz2.ortho.nc 1 2 radial WatO :WAT@O out tz2ortho.WatO.agr 0.25 15.0 EOF RunCpptraj "$UNITNAME" - DoTest tz2ortho.WatO.agr.save tz2ortho.WatO.agr + DoTest tz2ortho.WatO.agr.save tz2ortho.WatO.agr $TOL_ARG fi UNITNAME='Radial test, water -> protein, no imaging' @@ -120,7 +128,7 @@ trajin ../tz2.ortho.nc radial WatO-Prot :WAT@O ^1 out tz2noimage.WatO-Prot.agr 0.25 15.0 noimage EOF RunCpptraj "$UNITNAME" -DoTest tz2noimage.WatO-Prot.agr.save tz2noimage.WatO-Prot.agr +DoTest tz2noimage.WatO-Prot.agr.save tz2noimage.WatO-Prot.agr $TOL_ARG UNITNAME='Radial test, water -> water, no imaging' CheckFor maxthreads 2 From 8a1bf0e0639687b44683a587aa5625f16124d3ad Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Mon, 11 Jul 2022 12:28:54 -0400 Subject: [PATCH 50/57] Add kernel_rdf.cu to CMakeLists.txt --- src/cuda_kernels/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cuda_kernels/CMakeLists.txt b/src/cuda_kernels/CMakeLists.txt index 5f305ce7f1..9d24d42906 100644 --- a/src/cuda_kernels/CMakeLists.txt +++ b/src/cuda_kernels/CMakeLists.txt @@ -1,4 +1,4 @@ -set(CPPTRAJ_CUDA_SOURCES core_kernels.cu kernel_wrappers.cu GistCudaCalc.cu GistCudaSetup.cu) +set(CPPTRAJ_CUDA_SOURCES core_kernels.cu kernel_wrappers.cu GistCudaCalc.cu GistCudaSetup.cu kernel_rdf.cu) cuda_add_library(cpptraj_cuda_routines STATIC ${CPPTRAJ_CUDA_SOURCES}) #make_pic_if_needed does not appear to work for NVCC. Instead, added From c203562b398c08a34517f5d543dfdfff6bf1b4c5 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Mon, 11 Jul 2022 12:45:29 -0400 Subject: [PATCH 51/57] Add a note about the rdf on GPU being done in single precision. --- doc/cpptraj.lyx | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/doc/cpptraj.lyx b/doc/cpptraj.lyx index 3985e5aa19..49b7bd0271 100644 --- a/doc/cpptraj.lyx +++ b/doc/cpptraj.lyx @@ -1230,6 +1230,10 @@ watershell gist \end_layout +\begin_layout LyX-Code +radial +\end_layout + \begin_layout Section General Concepts \end_layout @@ -35041,6 +35045,14 @@ Note that correct normalization of the RDF depends on the number of atoms be off. \end_layout +\begin_layout Standard +The basic (i.e. + no center1/center2/byres1/byres2/bymol1/bymol2) RDF calculations are now + CUDA parallelized. + However, the calculation is done in single-precision on GPUs so the resulting + histograms may differ slightly from the CPU (on the order of 0.0002 - 0.0004). +\end_layout + \begin_layout Subsection randomizeions \end_layout From d772fa0a5ca9d85274642785cd6016e3d918ac59 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Mon, 11 Jul 2022 13:33:45 -0400 Subject: [PATCH 52/57] 6.11.0. Minor version bump for gpu version of RDF (single precision, results will change from previous cpptraj.cuda). --- src/Version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Version.h b/src/Version.h index 47ba1d64ec..965a5cf0d2 100644 --- a/src/Version.h +++ b/src/Version.h @@ -12,7 +12,7 @@ * Whenever a number that precedes is incremented, all subsequent * numbers should be reset to 0. */ -#define CPPTRAJ_INTERNAL_VERSION "V6.10.0" +#define CPPTRAJ_INTERNAL_VERSION "V6.11.0" /// PYTRAJ relies on this #define CPPTRAJ_VERSION_STRING CPPTRAJ_INTERNAL_VERSION #endif From e30e5599f99b95f89cb98e2a37b178c0f3e51557 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Mon, 11 Jul 2022 14:06:44 -0400 Subject: [PATCH 53/57] Make openmp single mask version loop dynamic --- src/Action_Radial.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Action_Radial.cpp b/src/Action_Radial.cpp index 7be765cf05..acd1f65669 100644 --- a/src/Action_Radial.cpp +++ b/src/Action_Radial.cpp @@ -480,7 +480,7 @@ void Action_Radial::calcRDF_singleMask(Frame const& frmIn) { { //mprintf("OPENMP: %i threads\n",omp_get_num_threads()); myRDF = &(rdf_thread_[omp_get_thread_num()][0]); -# pragma omp for +# pragma omp for schedule(dynamic) # endif for (idx1 = 0; idx1 < outer_max; idx1++) { const double* xyz1 = frmIn.XYZ( OuterMask_[idx1] ); From 0b5dccefc15602afe0999773ee68b0cea499e36b Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Mon, 11 Jul 2022 14:19:28 -0400 Subject: [PATCH 54/57] Move to separate test repo --- test/Test_Radial/Test.sh | 50 ---------------------- test/Test_Radial/watO-protein.agr.save | 28 ------------ test/Test_Radial/watO-protein.raw.agr.save | 28 ------------ test/Test_Radial/watO.agr.save | 28 ------------ test/Test_Radial/watO.raw.agr.save | 28 ------------ 5 files changed, 162 deletions(-) delete mode 100755 test/Test_Radial/Test.sh delete mode 100644 test/Test_Radial/watO-protein.agr.save delete mode 100644 test/Test_Radial/watO-protein.raw.agr.save delete mode 100644 test/Test_Radial/watO.agr.save delete mode 100644 test/Test_Radial/watO.raw.agr.save diff --git a/test/Test_Radial/Test.sh b/test/Test_Radial/Test.sh deleted file mode 100755 index 092ffc489d..0000000000 --- a/test/Test_Radial/Test.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/bash - -. ../MasterTest.sh - -CleanFiles radial.in Radial.agr cRadial.agr WatO-Trp4.agr WatO-Trp4.raw.agr \ - WatO-Trp4.byres.agr WatO-Trp.agr WatO-Trp.volume.agr \ - WatO-Glu5CD.agr noimage.WatO-Glu5CD.agr point.dat \ - point?.agr wat.origin.agr \ - watO-protein.agr watO-protein.raw.agr \ - watO.agr watO.raw.agr - -TESTNAME='Radial tests' -Requires netcdf maxthreads 10 - -INPUT="-i radial.in" - -Radial_WatProt_Nonortho() { - UNITNAME='Radial test (water-protein), non-orthogonal imaging' - cat > radial.in < radial.in < ^1" -@target G0.S0 -@type xy - 0.250 0.000000 - 0.750 0.000000 - 1.250 0.000000 - 1.750 0.056104 - 2.250 0.076568 - 2.750 0.300826 - 3.250 0.379005 - 3.750 0.468293 - 4.250 0.500427 - 4.750 0.494545 - 5.250 0.551886 - 5.750 0.577458 - 6.250 0.609696 - 6.750 0.652733 - 7.250 0.679748 - 7.750 0.696387 - 8.250 0.764467 - 8.750 0.783383 - 9.250 0.809392 - 9.750 0.828473 diff --git a/test/Test_Radial/watO-protein.raw.agr.save b/test/Test_Radial/watO-protein.raw.agr.save deleted file mode 100644 index d774ba2c68..0000000000 --- a/test/Test_Radial/watO-protein.raw.agr.save +++ /dev/null @@ -1,28 +0,0 @@ -@with g0 -@ xaxis label "Distance (Ang)" -@ yaxis label "" -@ legend 0.2, 0.995 -@ legend char size 0.60 -@ s0 legend "Raw[:WAT@O => ^1]" -@target G0.S0 -@type xy - 0.250 0.000000 - 0.750 0.000000 - 1.250 0.000000 - 1.750 16.000000 - 2.250 36.000000 - 2.750 211.000000 - 3.250 371.000000 - 3.750 610.000000 - 4.250 837.000000 - 4.750 1033.000000 - 5.250 1408.000000 - 5.750 1767.000000 - 6.250 2204.000000 - 6.750 2752.000000 - 7.250 3306.000000 - 7.750 3870.000000 - 8.250 4814.000000 - 8.750 5549.000000 - 9.250 6407.000000 - 9.750 7286.000000 diff --git a/test/Test_Radial/watO.agr.save b/test/Test_Radial/watO.agr.save deleted file mode 100644 index be8786dfdc..0000000000 --- a/test/Test_Radial/watO.agr.save +++ /dev/null @@ -1,28 +0,0 @@ -@with g0 -@ xaxis label "Distance (Ang)" -@ yaxis label "" -@ legend 0.2, 0.995 -@ legend char size 0.60 -@ s0 legend ":WAT@O => :WAT@O" -@target G0.S0 -@type xy - 0.250 0.000000 - 0.750 0.000000 - 1.250 0.000000 - 1.750 0.000000 - 2.250 0.004509 - 2.750 1.774147 - 3.250 1.080181 - 3.750 0.920772 - 4.250 0.945101 - 4.750 0.958743 - 5.250 0.961664 - 5.750 0.934807 - 6.250 0.963383 - 6.750 0.978025 - 7.250 0.964305 - 7.750 0.958883 - 8.250 0.947129 - 8.750 0.952076 - 9.250 0.957114 - 9.750 0.961629 diff --git a/test/Test_Radial/watO.raw.agr.save b/test/Test_Radial/watO.raw.agr.save deleted file mode 100644 index afcb24585d..0000000000 --- a/test/Test_Radial/watO.raw.agr.save +++ /dev/null @@ -1,28 +0,0 @@ -@with g0 -@ xaxis label "Distance (Ang)" -@ yaxis label "" -@ legend 0.2, 0.995 -@ legend char size 0.60 -@ s0 legend "Raw[:WAT@O => :WAT@O]" -@target G0.S0 -@type xy - 0.250 0.000000 - 0.750 0.000000 - 1.250 0.000000 - 1.750 0.000000 - 2.250 18.000000 - 2.750 10566.000000 - 3.250 8978.000000 - 3.750 10184.000000 - 4.250 13422.000000 - 4.750 17004.000000 - 5.250 20832.000000 - 5.750 24288.000000 - 6.250 29570.000000 - 6.750 35012.000000 - 7.250 39822.000000 - 7.750 45246.000000 - 8.250 50642.000000 - 8.750 57262.000000 - 9.250 64330.000000 - 9.750 71808.000000 From acbb714ebbae54e8ffebd6a2d27207794011f33e Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Mon, 11 Jul 2022 14:29:14 -0400 Subject: [PATCH 55/57] Remove obsolete code --- src/Action_Radial.cpp | 34 +--------------------------------- src/cuda_kernels/kernel_rdf.cu | 18 ------------------ 2 files changed, 1 insertion(+), 51 deletions(-) diff --git a/src/Action_Radial.cpp b/src/Action_Radial.cpp index acd1f65669..8ce1fcbf61 100644 --- a/src/Action_Radial.cpp +++ b/src/Action_Radial.cpp @@ -456,6 +456,7 @@ Action::RetType Action_Radial::Setup(ActionSetup& setup) { } #ifdef CUDA +/** Place coords for selected atoms into arrays. */ static inline std::vector mask_to_xyz(AtomMask const& Mask, Frame const& frm) { std::vector outerxyz; @@ -589,39 +590,6 @@ Action::RetType Action_Radial::DoAction(int frameNum, ActionFrame& frm) { calcRDF_singleMask( frm.Frm() ); else calcRDF_twoMask( frm.Frm() ); -/* int outer_max = OuterMask_.Nselected(); - int inner_max = InnerMask_.Nselected(); -# ifdef _OPENMP -# pragma omp parallel private(nmask1,nmask2,atom1,atom2,D,idx,mythread) - { - //mprintf("OPENMP: %i threads\n",omp_get_num_threads()); - mythread = omp_get_thread_num(); -# pragma omp for -# endif - for (nmask1 = 0; nmask1 < outer_max; nmask1++) { - atom1 = OuterMask_[nmask1]; - for (nmask2 = 0; nmask2 < inner_max; nmask2++) { - atom2 = InnerMask_[nmask2]; - if (atom1 != atom2) { - D = DIST2( imageOpt_.ImagingType(), frm.Frm().XYZ(atom1), frm.Frm().XYZ(atom2), frm.Frm().BoxCrd() ); - if (D <= maximum2_) { - // NOTE: Can we modify the histogram to store D^2? - D = sqrt(D); - //mprintf("MASKLOOP: %10i %10i %10.4f\n",atom1,atom2,D); - idx = (int) (D * one_over_spacing_); - if (idx > -1 && idx < numBins_) -# ifdef _OPENMP - ++rdf_thread_[mythread][idx]; -# else - ++RDF_[idx]; -# endif - } - } - } // END loop over 2nd mask - } // END loop over 1st mask -# ifdef _OPENMP - } // END pragma omp parallel -# endif*/ #endif /* CUDA */ // --------------------------------------------- } else if ( rmode_ == NO_INTRAMOL ) { diff --git a/src/cuda_kernels/kernel_rdf.cu b/src/cuda_kernels/kernel_rdf.cu index afb536ed69..c007d14db9 100644 --- a/src/cuda_kernels/kernel_rdf.cu +++ b/src/cuda_kernels/kernel_rdf.cu @@ -41,31 +41,13 @@ int Cpptraj_GPU_RDF(unsigned long* bins, int nbins, CpptrajGpu::FpType maximum2, Cuda_check(cudaMemcpy(device_xyz1, xyz1, N1 * 3 * sizeof(CpptrajGpu::FpType), cudaMemcpyHostToDevice), "Copying xyz1"); CpptrajGpu::FpType* device_xyz2; - //int* device_rows1; if (xyz2 != 0) { Cuda_check(cudaMalloc(((void**)(&device_xyz2)), N2 * 3 * sizeof(CpptrajGpu::FpType)), "Allocating xyz2"); Cuda_check(cudaMemcpy(device_xyz2, xyz2, N2 * 3 * sizeof(CpptrajGpu::FpType), cudaMemcpyHostToDevice), "Copying xyz2"); } else { device_xyz2 = device_xyz1; - // Create array of upper triangle row start indices TODO long ints? -/* int* rows1 = new int[ N1 ]; - int i1 = 0; - for (unsigned int row = 0; row < N1; row++) { - for (unsigned int col = row+1; col < N1; col++) { - long int idx = calcTriIndex( N1, col, row ); - if (col == row + 1) { - printf("ROW START: %li\n", idx); - //if (row > 0) Rows1.push_back( idx ); - rows1[i1++] = idx; - //Rows1.push_back( idx ); - } - printf("%8zu %8zu = %8li\n", col, row, idx); - } - } - delete[] rows1;*/ } - CpptrajGpu::FpType *boxDev; CpptrajGpu::FpType *ucellDev, *fracDev; if (imageType == ImageOption::ORTHO) { From 86b9c06fd1708bc9a334cad91af63f646b2d9402 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Mon, 11 Jul 2022 14:34:11 -0400 Subject: [PATCH 56/57] Hide thread info unless debug specified. Report total compute capability version. --- src/Cpptraj.cpp | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/Cpptraj.cpp b/src/Cpptraj.cpp index 961c28c0e3..a740122c9a 100644 --- a/src/Cpptraj.cpp +++ b/src/Cpptraj.cpp @@ -123,14 +123,16 @@ void Cpptraj::Intro() const { mprintf("| CUDA device: %s\n", deviceProp.name); mprintf("| Available GPU memory: %s\n", ByteString(deviceProp.totalGlobalMem, BYTE_DECIMAL).c_str()); - mprintf("| Major compute capability: %i\n", deviceProp.major); + mprintf("| Compute capability: %i.%i\n", deviceProp.major, deviceProp.minor); CpptrajGpu::SetComputeVersion( deviceProp.major ); - mprintf("| Max threads/block: %i\n", deviceProp.maxThreadsPerBlock); - mprintf("| Max threads dim: %i %i %i\n", - deviceProp.maxThreadsDim[0], - deviceProp.maxThreadsDim[1], - deviceProp.maxThreadsDim[2]); - mprintf("| Max threads/multiprocessor: %i\n", deviceProp.maxThreadsPerMultiProcessor); + if (State_.Debug() > 0) { + mprintf("| Max threads/block: %i\n", deviceProp.maxThreadsPerBlock); + mprintf("| Max threads dim: %i %i %i\n", + deviceProp.maxThreadsDim[0], + deviceProp.maxThreadsDim[1], + deviceProp.maxThreadsDim[2]); + mprintf("| Max threads/multiprocessor: %i\n", deviceProp.maxThreadsPerMultiProcessor); + } } } # endif From 989b9abb21a05da914df73f46ebb6d6d356f7d05 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Mon, 11 Jul 2022 14:38:09 -0400 Subject: [PATCH 57/57] Fix test clean --- test/Test_Radial/RunTest.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/Test_Radial/RunTest.sh b/test/Test_Radial/RunTest.sh index be0a1e96d3..74caaff563 100755 --- a/test/Test_Radial/RunTest.sh +++ b/test/Test_Radial/RunTest.sh @@ -5,7 +5,9 @@ CleanFiles radial.in Radial.agr cRadial.agr WatO-Trp4.agr WatO-Trp4.raw.agr \ WatO-Trp4.byres.agr WatO-Trp.agr WatO-Trp.volume.agr \ WatO-Glu5CD.agr noimage.WatO-Glu5CD.agr point.dat \ - point?.agr wat.origin.agr tz2.WatO-Prot.agr tz2.WatO.agr + point?.agr wat.origin.agr tz2.WatO-Prot.agr tz2.WatO.agr \ + tz2ortho.WatO-Prot.agr tz2ortho.WatO.agr \ + tz2noimage.WatO-Prot.agr tz2noimage.WatO.agr TESTNAME='Radial tests' Requires netcdf maxthreads 10 @@ -16,7 +18,7 @@ CheckForSilent cuda if [ $? -eq 0 ] ; then TOL_ARG='-a 0.0002' echo "Lowering tolerance for single-precision CUDA: $TOL_ARG" -fi +fitz2noimage.WatO-Prot.agr INPUT="-i radial.in"