Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
59 commits
Select commit Hold shift + click to select a range
45ea34b
Start rdf gpu kernel
drroe Jul 6, 2022
a1030d6
Add rdf entry function to Makefile
drroe Jul 7, 2022
cd046c6
Add test for RDF
drroe Jul 7, 2022
d849cf5
Fix num blocks calc
drroe Jul 7, 2022
39a1b3a
Change block dim to 32 to avoid creating too many threads per block. …
drroe Jul 7, 2022
ff9fae4
Add maximum2 and one_over_spacing
drroe Jul 7, 2022
f54d9cf
Add atomicAdd to the RDF histogram
drroe Jul 8, 2022
a729aba
Add kernel launch. Ensure device histogram memory is zeroed
drroe Jul 8, 2022
75c3913
Fix fxn definition and call
drroe Jul 8, 2022
ea3c658
Ensure results are copied back. Make sure coords are uploaded to device.
drroe Jul 8, 2022
e00293f
Add check that spacing is not greater than the max
drroe Jul 8, 2022
6fcd283
Add separate test
drroe Jul 8, 2022
7245e38
Merge remote-tracking branch 'origin/rdfgpu' into rdfgpu
drroe Jul 8, 2022
2775909
Enable some debug info. Do kernel error checking
drroe Jul 8, 2022
d560184
Report max threads per block
drroe Jul 8, 2022
50dd4eb
Hide some debug info. Make cuda errors non-fatal while debugging
drroe Jul 8, 2022
84aa280
Better labeling
drroe Jul 8, 2022
957af26
Improve error checking for memory allocation and setting. Correct inv…
drroe Jul 8, 2022
99ec3c4
Report max threads in each dim
drroe Jul 8, 2022
1c34100
Report max threads per multiprocessor
drroe Jul 8, 2022
e69eb39
Report major compute capability.
drroe Jul 8, 2022
77ed890
Add Gpu namespace to store Gpu related info
drroe Jul 8, 2022
87ec305
Try to set block size based on compute capability
drroe Jul 8, 2022
04ef617
Change recip to frac
drroe Jul 8, 2022
c5ec2e5
Add rdf with ortho imaging
drroe Jul 8, 2022
8edb99b
Add version with no imaging
drroe Jul 8, 2022
0bca1c5
Enable ortho and no image cases
drroe Jul 8, 2022
a068892
Attempt to make functions usable with overlapping coords
drroe Jul 8, 2022
9a2b476
Add overlapping test
drroe Jul 8, 2022
d271e89
Enable for overlapping coords.
drroe Jul 8, 2022
96a1d2b
Only works for NORMAL mode right now
drroe Jul 8, 2022
dc32bb4
Enable WatProt test
drroe Jul 8, 2022
53c0b9b
Ensure CUDA only used for NORMAL, others use CPU calc
drroe Jul 8, 2022
b550b43
Hide some debug info
drroe Jul 8, 2022
4265b9a
Add more realistic test for water-protein rdf
drroe Jul 9, 2022
7ba3b45
Add more realistic water rdf test
drroe Jul 9, 2022
107f6fe
Add orthogonal tests
drroe Jul 9, 2022
3d911d3
Add more realistic tests with no imaging
drroe Jul 9, 2022
c4d42c8
Improve CPU radial function when single mask is being used
drroe Jul 9, 2022
eb12ee7
Always try to build libcpptraj.cuda
drroe Jul 9, 2022
f314034
Start code to store indices for upper triangle matrix
drroe Jul 9, 2022
315119a
Doesnt seem like flattening works so well.
drroe Jul 9, 2022
6c96927
Test out using device function template
drroe Jul 11, 2022
6ca1e96
Add template for non-orthogonal distance calc
drroe Jul 11, 2022
5424013
Use the ortho dist template in the closest kernels
drroe Jul 11, 2022
f11d1ce
Remove old code. Add note about slower ortho calc method
drroe Jul 11, 2022
8e1b734
Start adding a gpu floating point type definition
drroe Jul 11, 2022
163e1d4
Use CpptrajGpu::FpType for GPU radial calcs.
drroe Jul 11, 2022
14412a7
Change to floating point
drroe Jul 11, 2022
c82d698
Add tolerance for GPU radial calc, now done in single precision
drroe Jul 11, 2022
8a1bf0e
Add kernel_rdf.cu to CMakeLists.txt
drroe Jul 11, 2022
c203562
Add a note about the rdf on GPU being done in single precision.
drroe Jul 11, 2022
0f4fdaa
Merge branch 'master' into rdfgpu
drroe Jul 11, 2022
d772fa0
6.11.0. Minor version bump for gpu version of RDF (single precision,
drroe Jul 11, 2022
e30e559
Make openmp single mask version loop dynamic
drroe Jul 11, 2022
0b5dcce
Move to separate test repo
drroe Jul 11, 2022
acbb714
Remove obsolete code
drroe Jul 11, 2022
86b9c06
Hide thread info unless debug specified. Report total compute
drroe Jul 11, 2022
989b9ab
Fix test clean
drroe Jul 11, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions doc/cpptraj.lyx
Original file line number Diff line number Diff line change
Expand Up @@ -1230,6 +1230,10 @@ watershell
gist
\end_layout

\begin_layout LyX-Code
radial
\end_layout

\begin_layout Section
General Concepts
\end_layout
Expand Down Expand Up @@ -35041,6 +35045,14 @@ Note that correct normalization of the RDF depends on the number of atoms
be off.
\end_layout

\begin_layout Standard
The basic (i.e.
no center1/center2/byres1/byres2/bymol1/bymol2) RDF calculations are now
CUDA parallelized.
However, the calculation is done in single-precision on GPUs so the resulting
histograms may differ slightly from the CPU (on the order of 0.0002 - 0.0004).
\end_layout

\begin_layout Subsection
randomizeions
\end_layout
Expand Down
166 changes: 132 additions & 34 deletions src/Action_Radial.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@
#ifdef _OPENMP
# include <omp.h>
#endif
#ifdef CUDA
# include "Gpu.h"
# include "cuda_kernels/kernel_rdf.cuh"
#endif

// CONSTRUCTOR
Action_Radial::Action_Radial() :
Expand All @@ -17,6 +21,7 @@ Action_Radial::Action_Radial() :
currentParm_(0),
intramol_distances_(0),
useVolume_(false),
mask2_is_mask1_(false),
volume_(0),
maximum2_(0),
spacing_(-1),
Expand Down Expand Up @@ -119,6 +124,10 @@ Action::RetType Action_Radial::Init(ArgList& actionArgs, ActionInit& init, int d
Help();
return Action::ERR;
}
if (spacing_ > maximum) {
mprinterr("Error: Bin spacing %g is larger than the maximum %g\n", spacing_, maximum);
return Action::ERR;
}
// Store max^2, distances^2 greater than max^2 do not need to be
// binned and therefore do not need a sqrt calc.
maximum2_ = maximum * maximum;
Expand All @@ -132,12 +141,14 @@ Action::RetType Action_Radial::Init(ArgList& actionArgs, ActionInit& init, int d
if (Mask1_.SetMaskString(mask1)) return Action::ERR;

// Check for second mask - if none specified use first mask
mask2_is_mask1_ = false;
if (needMask2) {
std::string mask2 = actionArgs.GetMaskNext();
if (!mask2.empty()) {
if (Mask2_.SetMaskString(mask2)) return Action::ERR;
} else {
if (Mask2_.SetMaskString(mask1)) return Action::ERR;
mask2_is_mask1_ = true;
}
}
// If filename not yet specified check for backwards compat.
Expand Down Expand Up @@ -257,6 +268,11 @@ Action::RetType Action_Radial::Init(ArgList& actionArgs, ActionInit& init, int d
mprintf("\tImaging disabled.\n");
if (numthreads_ > 1)
mprintf("\tParallelizing RDF calculation with %i threads.\n",numthreads_);
#ifdef CUDA
if (rmode_ == NORMAL) {
mprintf("\tRDF calculation will be accelerated with CUDA.\n");
}
#endif

return Action::OK;
}
Expand Down Expand Up @@ -439,6 +455,89 @@ Action::RetType Action_Radial::Setup(ActionSetup& setup) {
return Action::OK;
}

#ifdef CUDA
/** Place coords for selected atoms into arrays. */
static inline std::vector<CpptrajGpu::FpType> mask_to_xyz(AtomMask const& Mask, Frame const& frm)
{
std::vector<CpptrajGpu::FpType> outerxyz;
outerxyz.reserve( Mask.Nselected()*3 );
for (AtomMask::const_iterator at = Mask.begin(); at != Mask.end(); ++at) {
const double* xyz = frm.XYZ( *at );
outerxyz.push_back( (CpptrajGpu::FpType)xyz[0] );
outerxyz.push_back( (CpptrajGpu::FpType)xyz[1] );
outerxyz.push_back( (CpptrajGpu::FpType)xyz[2] );
}
return outerxyz;
}
#endif

void Action_Radial::calcRDF_singleMask(Frame const& frmIn) {
int outer_max = OuterMask_.Nselected();
int idx1;

long unsigned* myRDF = &RDF_[0];
# ifdef _OPENMP
# pragma omp parallel private(idx1, myRDF)
{
//mprintf("OPENMP: %i threads\n",omp_get_num_threads());
myRDF = &(rdf_thread_[omp_get_thread_num()][0]);
# pragma omp for schedule(dynamic)
# endif
for (idx1 = 0; idx1 < outer_max; idx1++) {
const double* xyz1 = frmIn.XYZ( OuterMask_[idx1] );
for (int idx2 = idx1 + 1; idx2 < outer_max; idx2++) {
const double* xyz2 = frmIn.XYZ( OuterMask_[idx2] );
double D2 = DIST2( imageOpt_.ImagingType(), xyz1, xyz2, frmIn.BoxCrd() );
if (D2 <= maximum2_) {
// NOTE: Can we modify the histogram to store D^2?
double dist = sqrt(D2);
//mprintf("MASKLOOP: %10i %10i %10.4f\n",atom1,atom2,D);
int idx = (int) (dist * one_over_spacing_);
//if (idx > -1 && idx < numBins_) {
myRDF[idx] += 2;
//}
}
} // END inner loop
} // END outer loop
# ifdef _OPENMP
} // END pragma omp parallel
# endif
}

void Action_Radial::calcRDF_twoMask(Frame const& frmIn) {
int outer_max = OuterMask_.Nselected();
int inner_max = InnerMask_.Nselected();
int idx1;

long unsigned* myRDF = &RDF_[0];
# ifdef _OPENMP
# pragma omp parallel private(idx1, myRDF)
{
//mprintf("OPENMP: %i threads\n",omp_get_num_threads());
myRDF = &(rdf_thread_[omp_get_thread_num()][0]);
# pragma omp for
# endif
for (idx1 = 0; idx1 < outer_max; idx1++) {
const double* xyz1 = frmIn.XYZ( OuterMask_[idx1] );
for (int idx2 = 0; idx2 < inner_max; idx2++) {
const double* xyz2 = frmIn.XYZ( InnerMask_[idx2] );
double D2 = DIST2( imageOpt_.ImagingType(), xyz1, xyz2, frmIn.BoxCrd() );
if (D2 <= maximum2_) {
// NOTE: Can we modify the histogram to store D^2?
double dist = sqrt(D2);
//mprintf("MASKLOOP: %10i %10i %10.4f\n",atom1,atom2,D);
int idx = (int) (dist * one_over_spacing_);
//if (idx > -1 && idx < numBins_) {
myRDF[idx]++;
//}
}
} // END inner loop
} // END outer loop
# ifdef _OPENMP
} // END pragma omp parallel
# endif
}

// Action_Radial::DoAction()
/** Calculate distances from atoms in mask1 to atoms in mask 2 and
* bin them.
Expand All @@ -458,41 +557,40 @@ Action::RetType Action_Radial::DoAction(int frameNum, ActionFrame& frm) {
if (useVolume_)
volume_ += frm.Frm().BoxCrd().CellVolume();
// ---------------------------------------------
if ( rmode_ == NORMAL ) {
if ( rmode_ == NORMAL ) {
#ifdef CUDA
// Copy atoms for GPU
std::vector<CpptrajGpu::FpType> outerxyz = mask_to_xyz(OuterMask_, frm.Frm());
const CpptrajGpu::FpType* outerxyzPtr = &outerxyz[0];
std::vector<CpptrajGpu::FpType> innerxyz;
const CpptrajGpu::FpType* innerxyzPtr = 0;
if (!mask2_is_mask1_) {
innerxyz = mask_to_xyz(InnerMask_, frm.Frm());
innerxyzPtr = &innerxyz[0];
}
CpptrajGpu::FpType gpu_box[3];
gpu_box[0] = (CpptrajGpu::FpType)frm.Frm().BoxCrd().Param(Box::X);
gpu_box[1] = (CpptrajGpu::FpType)frm.Frm().BoxCrd().Param(Box::Y);
gpu_box[2] = (CpptrajGpu::FpType)frm.Frm().BoxCrd().Param(Box::Z);
CpptrajGpu::FpType gpu_ucell[9], gpu_frac[9];
for (int ibox = 0; ibox != 9; ibox++) {
gpu_ucell[ibox] = (CpptrajGpu::FpType)frm.Frm().BoxCrd().UnitCell()[ibox];
gpu_frac[ibox] = (CpptrajGpu::FpType)frm.Frm().BoxCrd().FracCell()[ibox];
}
Cpptraj_GPU_RDF( &RDF_[0], RDF_.size(), maximum2_, one_over_spacing_,
outerxyzPtr, OuterMask_.Nselected(),
innerxyzPtr, InnerMask_.Nselected(),
imageOpt_.ImagingType(),
gpu_box,
gpu_ucell,
gpu_frac );
#else /* CUDA */
// Calculation of all atoms in Mask1 to all atoms in Mask2
int outer_max = OuterMask_.Nselected();
int inner_max = InnerMask_.Nselected();
# ifdef _OPENMP
# pragma omp parallel private(nmask1,nmask2,atom1,atom2,D,idx,mythread)
{
//mprintf("OPENMP: %i threads\n",omp_get_num_threads());
mythread = omp_get_thread_num();
# pragma omp for
# endif
for (nmask1 = 0; nmask1 < outer_max; nmask1++) {
atom1 = OuterMask_[nmask1];
for (nmask2 = 0; nmask2 < inner_max; nmask2++) {
atom2 = InnerMask_[nmask2];
if (atom1 != atom2) {
D = DIST2( imageOpt_.ImagingType(), frm.Frm().XYZ(atom1), frm.Frm().XYZ(atom2), frm.Frm().BoxCrd() );
if (D <= maximum2_) {
// NOTE: Can we modify the histogram to store D^2?
D = sqrt(D);
//mprintf("MASKLOOP: %10i %10i %10.4f\n",atom1,atom2,D);
idx = (int) (D * one_over_spacing_);
if (idx > -1 && idx < numBins_)
# ifdef _OPENMP
++rdf_thread_[mythread][idx];
# else
++RDF_[idx];
# endif
}
}
} // END loop over 2nd mask
} // END loop over 1st mask
# ifdef _OPENMP
} // END pragma omp parallel
# endif
if (mask2_is_mask1_)
calcRDF_singleMask( frm.Frm() );
else
calcRDF_twoMask( frm.Frm() );
#endif /* CUDA */
// ---------------------------------------------
} else if ( rmode_ == NO_INTRAMOL ) {
// Calculation of all atoms in Mask1 to all atoms in Mask2, ignoring
Expand Down
5 changes: 5 additions & 0 deletions src/Action_Radial.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ class Action_Radial: public Action {
# endif
typedef std::vector<unsigned long> Iarray;

void calcRDF_singleMask(Frame const&);

void calcRDF_twoMask(Frame const&);

# ifdef _OPENMP
bool threadsCombined_; ///< True if CombineRdfThreads() has been called.
# endif
Expand All @@ -43,6 +47,7 @@ class Action_Radial: public Action {
Topology* currentParm_; ///< Current topology, needed for NO_INTERMOL
int intramol_distances_; ///< # of intra-molecular distances for NO_INTERMOL.
bool useVolume_; ///< If true normalize based on input volume.
bool mask2_is_mask1_; ///< True is mask1 and mask2 are the same.
double volume_; ///< Hold sum of volume for averaging.
double maximum2_; ///< Largest distance squared that can be binned.
double spacing_; ///< Bin spacing.
Expand Down
11 changes: 11 additions & 0 deletions src/Cpptraj.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include "Cpptraj.h"
#include "CpptrajStdio.h"
#include "Command.h"
#include "Gpu.h"
#include "ReadLine.h"
#include "Version.h"
#include "ParmFile.h" // ProcessMask
Expand Down Expand Up @@ -122,6 +123,16 @@ void Cpptraj::Intro() const {
mprintf("| CUDA device: %s\n", deviceProp.name);
mprintf("| Available GPU memory: %s\n",
ByteString(deviceProp.totalGlobalMem, BYTE_DECIMAL).c_str());
mprintf("| Compute capability: %i.%i\n", deviceProp.major, deviceProp.minor);
CpptrajGpu::SetComputeVersion( deviceProp.major );
if (State_.Debug() > 0) {
mprintf("| Max threads/block: %i\n", deviceProp.maxThreadsPerBlock);
mprintf("| Max threads dim: %i %i %i\n",
deviceProp.maxThreadsDim[0],
deviceProp.maxThreadsDim[1],
deviceProp.maxThreadsDim[2]);
mprintf("| Max threads/multiprocessor: %i\n", deviceProp.maxThreadsPerMultiProcessor);
}
}
}
# endif
Expand Down
15 changes: 15 additions & 0 deletions src/Gpu.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#include "Gpu.h"

static unsigned int cpptraj_gpu_major_ = 0;

void CpptrajGpu::SetComputeVersion(int major) {
if (major > -1)
cpptraj_gpu_major_ = major;
}

unsigned int CpptrajGpu::MaxBlockDim_2D() {
if (cpptraj_gpu_major_ < 4)
return 16;
else
return 32;
}
11 changes: 11 additions & 0 deletions src/Gpu.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#ifndef INC_CPPTRAJ_GPU_H
#define INC_CPPTRAJ_GPU_H
namespace CpptrajGpu {
/// Set the major CUDA compute version number
void SetComputeVersion(int);
/// \return Max block dimensions if using 2D blocks
unsigned int MaxBlockDim_2D();
/// Default floating point type
typedef float FpType;
}
#endif
2 changes: 1 addition & 1 deletion src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ noarpack:
@echo "Skipping bundled ARPACK build"
@echo ""

cuda_kernels/libcpptraj_cuda.a:
cuda_kernels/libcpptraj_cuda.a::
cd cuda_kernels && $(MAKE) all

# Dependency targets
Expand Down
2 changes: 1 addition & 1 deletion src/Version.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
* Whenever a number that precedes <revision> is incremented, all subsequent
* numbers should be reset to 0.
*/
#define CPPTRAJ_INTERNAL_VERSION "V6.10.0"
#define CPPTRAJ_INTERNAL_VERSION "V6.11.0"
/// PYTRAJ relies on this
#define CPPTRAJ_VERSION_STRING CPPTRAJ_INTERNAL_VERSION
#endif
Loading