Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
e0e94f8
Start experimental command to compare clusters
drroe Jan 24, 2022
9a0d0c0
Report some stats
drroe Jan 24, 2022
dbbfc7b
Use time instead of wall time to seed RNG. Protect against int overflow
drroe Jan 25, 2022
b1b3b0e
Use wallclock time again, finer grained so less likelyhood of rapid runs
drroe Jan 26, 2022
2e6fd7c
Add missing letter
drroe Jan 26, 2022
993234a
Put DBI in separate file
drroe Jan 27, 2022
60c0098
Put pseudo F calc in separate file
drroe Jan 27, 2022
03d50ea
Take DBI/pseudo-F calc out of list and store those metrics in Control,
drroe Jan 27, 2022
a148105
Allow cpptraj to be set via env var
drroe Jan 27, 2022
fbc13e9
Add data sets for DBI and pseudo-F
drroe Jan 27, 2022
effc938
Store SSR/SST value in set. Update manual.
drroe Jan 27, 2022
813a3e9
Fix TIMER compile
drroe Jan 27, 2022
6aad510
Start exploring a different way of speeding up finding cluster to
drroe Jan 28, 2022
2dd9db6
FindMin needs to ensure iOut < jOut
drroe Jan 28, 2022
f2be4d3
Disable some debug info
drroe Jan 28, 2022
44073b6
Bench hieragglo
drroe Jan 28, 2022
ed8c66c
Remove old code and openmp code. Finding min is no longer the bottleneck
drroe Jan 28, 2022
f411dea
Check openmp
drroe Jan 28, 2022
abb8c01
Remove debug info
drroe Jan 31, 2022
68f24a8
Merge branch 'master' into cluster.work
drroe Jan 31, 2022
33399a4
Update copyright year. Add Franz and Klaus to contributor list.
drroe Jan 31, 2022
53a7926
Version 6.2.3. Revision bump for HA cluster speed improvement and DBI…
drroe Jan 31, 2022
5946e22
Test cluster metrics write out
drroe Jan 31, 2022
c58e003
Allocate cluster metric sets last to try to fix pytraj
drroe Jan 31, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ CPPTRAJ see the following publication:
Disclaimer and Copyright
========================

CPPTRAJ is Copyright (c) 2010-2021 Daniel R. Roe.
CPPTRAJ is Copyright (c) 2010-2022 Daniel R. Roe.
The terms for using, copying, modifying, and distributing CPPTRAJ are
specified in the file LICENSE.

Expand Down Expand Up @@ -202,6 +202,9 @@ Fixes and improvements to nucleic acid dihedral angle definitions (DihedralSearc
* David S. Cerutti (Rutgers University, Piscataway, NJ, USA)
Original code for the 'xtalsymm' Action.

* Franz Waibl & Klaus R. Liedl (Department of General, Inorganic, and Theoretical Chemistry, University of Innsbruck)
Improvements and enhancements for GIST.

#### Various Contributions
* David A. Case (Rutgers University, Piscataway, NJ, USA)
* Hai Nguyen (Rutgers University, Piscataway, NJ, USA)
Expand Down
11 changes: 7 additions & 4 deletions devtools/benchmark/cluster/cluster.sh
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
#!/bin/bash

CPPTRAJ=`which cpptraj`
#export OMP_NUM_THREADS=12
#CPPTRAJ=`which cpptraj.OMP`
if [ -z "$CPPTRAJ" ] ; then
#CPPTRAJ=`which cpptraj`
export OMP_NUM_THREADS=4
CPPTRAJ=`which cpptraj.OMP`
fi

TOP=~/Cpptraj/Cpptraj-ExtendedTests/ChainA_1-268_NAD_TCL-gaff.tip3p.parm7
TRJ=~/Cpptraj/Cpptraj-ExtendedTests/run9.nc # 2000 frames
Expand Down Expand Up @@ -52,10 +54,11 @@ EOF
((SPLIT3 = $NFRAMES * 3))
echo "Total $TOTAL, Each section $NFRAMES splits $SPLIT1 $SPLIT2 $SPLIT3"

# kmeans clusters 5 randompoint kseed 42 \
cat >> cpptraj.in <<EOF

cluster \
kmeans clusters 5 randompoint kseed 42 \
hieragglo clusters 5 \
rms :10-260@CA sieve 20 random sieveseed 23 savepairdist \
summaryhalf \$PREFIX.split.dat splitframe $SPLIT1,$SPLIT2,$SPLIT3 \
info \$PREFIX.info.dat \
Expand Down
14 changes: 13 additions & 1 deletion doc/cpptraj.lyx
Original file line number Diff line number Diff line change
Expand Up @@ -41638,6 +41638,18 @@ gracecolor
specified).
\end_layout

\begin_layout Description
<name>[DBI] Hold final Davies-Bouldin index.
\end_layout

\begin_layout Description
<name>[PSF] Hold final pseudo-F value.
\end_layout

\begin_layout Description
<name>[SSRSST] Hold final SSR/SST value.
\end_layout

\begin_layout Description
<name>[NCVT] (
\series bold
Expand Down Expand Up @@ -41861,7 +41873,7 @@ The K-dist plot will be named <prefix>.<k>.dat, with the default prefix being
The K-dist plot usually looks like a curve with an initially steep slope
that gradually decreases.
Around where the initial part of the curve starts to flatten out (indicating
an increas in density) is around where epsilon should be set; minpoints
an increase in density) is around where epsilon should be set; minpoints
is set to whatever <k> was.
It has been suggested that the shape of the K-dist curve doesn't change
too much after Kdist=4, but users are encouraged to experiment.
Expand Down
16 changes: 13 additions & 3 deletions src/Cluster/Algorithm_HierAgglo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,9 @@ void Cpptraj::Cluster::Algorithm_HierAgglo::Results(CpptrajFile& outfile) const
/** Print clustering timing. */
void Cpptraj::Cluster::Algorithm_HierAgglo::Timing(double total) const {
# ifdef TIMER
time_findMin_.WriteTiming(2, "Find min distance", total);
time_mergeFrames_.WriteTiming(2, "Merge cluster frames", total);
time_calcLinkage_.WriteTiming(2, "Calculate new linkage", total);
time_findMin_.WriteTiming(2, "Find min distance :", total);
time_mergeFrames_.WriteTiming(2, "Merge cluster frames :", total);
time_calcLinkage_.WriteTiming(2, "Calculate new linkage :", total);
# endif
}

Expand Down Expand Up @@ -132,6 +132,11 @@ int Cpptraj::Cluster::Algorithm_HierAgglo::DoClustering(List& clusters,
ClusterDistances_.SetCdist( C1_it->Num(), C2_it->Num(), dval );
}
}
// DEBUG print closest indices
if (debug_ > 0) {
if (debug_ > 1) ClusterDistances_.PrintElementsSquare();
ClusterDistances_.PrintClosest();
}
//InitializeClusterDistances();
// DEBUG - print initial clusters
if (debug_ > 1)
Expand All @@ -149,6 +154,11 @@ int Cpptraj::Cluster::Algorithm_HierAgglo::DoClustering(List& clusters,
}
if (clusters.Nclusters() == 1) clusteringComplete = true; // Sanity check
cluster_progress.Update( iterations++ );
// DEBUG print closest indices
if (debug_ > 0) {
if (debug_ > 1) ClusterDistances_.PrintElementsSquare();
ClusterDistances_.PrintClosest();
}
}
mprintf("\tCompleted after %i iterations, %u clusters.\n",iterations,
clusters.Nclusters());
Expand Down
2 changes: 2 additions & 0 deletions src/Cluster/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ set(CLUSTER_SOURCES
Cmatrix_Binary.cpp
Cmatrix_NC.cpp
Control.cpp
DBI.cpp
DrawGraph.cpp
DynamicMatrix.cpp
List.cpp
Expand All @@ -23,6 +24,7 @@ set(CLUSTER_SOURCES
Metric_Torsion.cpp
Node.cpp
Output.cpp
PseudoF.cpp
Results_Coords.cpp
Sieve.cpp
Silhouette.cpp
Expand Down
32 changes: 30 additions & 2 deletions src/Cluster/Control.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#include <list>
#include "Control.h"
#include "DBI.h"
#include "Output.h"
#include "PseudoF.h"
#include "../ArgList.h"
#include "../BufferedLine.h" // For loading info file
#include "../CpptrajStdio.h"
Expand Down Expand Up @@ -44,7 +46,13 @@ Cpptraj::Cluster::Control::Control() :
draw_tol_(0),
draw_maxit_(0),
debug_(0),
metricContribFile_(0)
metricContribFile_(0),
DBITotal_(0),
pseudoF_(0),
SSRSST_(0),
dbi_set_(0),
psf_set_(0),
ssrsst_set_(0)
{}

/** DESTRUCTOR */
Expand Down Expand Up @@ -498,6 +506,14 @@ int Cpptraj::Cluster::Control::SetupClustering(DataSetList const& setsToCluster,
clustersvtimefile->AddDataSet( clustersVtime_ );
}

// DBI and pSF data sets
dbi_set_ = DSL.AddSet(DataSet::DOUBLE, MetaData(dsname_, "DBI"));
if (dbi_set_ == 0) return 1;
psf_set_ = DSL.AddSet(DataSet::DOUBLE, MetaData(dsname_, "PSF"));
if (psf_set_ == 0) return 1;
ssrsst_set_ = DSL.AddSet(DataSet::DOUBLE, MetaData(dsname_, "SSRSST"));
if (ssrsst_set_ == 0) return 1;

return 0;
}

Expand Down Expand Up @@ -803,7 +819,17 @@ int Cpptraj::Cluster::Control::Run() {
clusters_.PrintClusters();
}

// Clustering metrics. Centroids should be up to date.
DBITotal_ = ComputeDBI(clusters_, averageDist_, metrics_);
dbi_set_->Add(0, &DBITotal_);
if (clusters_.Nclusters() > 1) {
pseudoF_ = ComputePseudoF(clusters_, SSRSST_, metrics_, debug_);
psf_set_->Add(0, &pseudoF_);
ssrsst_set_->Add(0, &SSRSST_);
}

// TODO assign reference names
timer_post_.Stop();
}
timer_run_.Stop();
return 0;
Expand All @@ -822,12 +848,14 @@ int Cpptraj::Cluster::Control::Output(DataSetList& DSL) {
}

// Info

if (!suppressInfo_) {
CpptrajFile outfile;
if (outfile.OpenWrite( clusterinfo_ )) return 1;
timer_output_info_.Start();
Output::PrintClustersToFile(outfile, clusters_, *algorithm_, metrics_,
frameSieve_.SieveValue(), frameSieve_.FramesToCluster());
frameSieve_.SieveValue(), frameSieve_.FramesToCluster(),
DBITotal_, averageDist_, pseudoF_, SSRSST_);
timer_output_info_.Stop();
outfile.CloseFile();
}
Expand Down
9 changes: 9 additions & 0 deletions src/Cluster/Control.h
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,15 @@ class Control {

CpptrajFile* metricContribFile_; ///< If not null, determine how much each metric contributes to total distance

// Various metrics of clustering "goodness"
double DBITotal_; ///< Davies-Bouldin index (DBI)
std::vector<double> averageDist_; ///< Average distance of each cluster to each other cluster (DBI)
double pseudoF_; ///< Pseudo-F (pSF)
double SSRSST_; ///< Sum of squares regression over total sum of squares.
DataSet* dbi_set_; ///< DataSet to store DBI.
DataSet* psf_set_; ///< DataSet to store pseudo-F.
DataSet* ssrsst_set_; ///< DataSet to store SSR/SST.

// Timers
Timer timer_setup_; ///< Run - metric, frames to cluster setup
Timer timer_pairwise_; ///< Run - pairwise caching
Expand Down
46 changes: 46 additions & 0 deletions src/Cluster/DBI.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#include <list>
#include <string>
#include <vector>
#include "DBI.h"
#include "List.h"
#include "Node.h"
#include "MetricArray.h"

/** The Davies-Bouldin Index (DBI) measures the average similarity between each
* cluster and its most similar one; the smaller the DBI, the better. The DBI
* is defined as the average, for all clusters X, of fred, where fred(X) = max,
* across other clusters Y, of (Cx + Cy)/dXY. Here Cx is the average distance
* from points in X to the centroid, similarly Cy, and dXY is the distance
* between cluster centroids.
* NOTE: To use this, cluster centroids should be fully up-to-date.
*/
double Cpptraj::Cluster::ComputeDBI(List const& clusters, std::vector<double>& averageDist, MetricArray& metricIn)
{
averageDist.clear();
averageDist.reserve( clusters.Nclusters() );
for (List::cluster_iterator C1 = clusters.begincluster(); C1 != clusters.endcluster(); ++C1) {
// Calculate average distance to centroid for this cluster
averageDist.push_back( C1->CalcAvgToCentroid( metricIn ) );
//if (outfile.IsOpen())
// outfile.Printf("#Cluster %i has average-distance-to-centroid %f\n",
// C1->Num(), averageDist.back());
}
double DBITotal = 0.0;
unsigned int nc1 = 0;
for (List::cluster_iterator c1 = clusters.begincluster(); c1 != clusters.endcluster(); ++c1, ++nc1) {
double MaxFred = 0;
unsigned int nc2 = 0;
for (List::cluster_iterator c2 = clusters.begincluster(); c2 != clusters.endcluster(); ++c2, ++nc2) {
if (c1 != c2) {
double Fred = averageDist[nc1] + averageDist[nc2];
Fred /= metricIn.CentroidDist( c1->Cent(), c2->Cent() );
if (Fred > MaxFred)
MaxFred = Fred;
}
}
DBITotal += MaxFred;
}
DBITotal /= (double)clusters.Nclusters();
//if (outfile.IsOpen()) outfile.Printf("#DBI: %f\n", DBITotal);
return DBITotal;
}
11 changes: 11 additions & 0 deletions src/Cluster/DBI.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#ifndef INC_CLUSTER_DBI_H
#define INC_CLUSTER_DBI_H
namespace Cpptraj {
namespace Cluster {
class List;
class MetricArray;
/// Calculate the Davies-Bouldin index and average distance to centroid for each cluster
double ComputeDBI(List const&, std::vector<double>&, MetricArray&);
}
}
#endif
Loading