Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
26ffe7e
Start adding scripts for benchmarking clustering
drroe Jan 13, 2022
c002dfe
Determine number of trajin needed
drroe Jan 13, 2022
4a3a82d
Finish up cluster script
drroe Jan 13, 2022
3aa5cca
Add script to extract timings
drroe Jan 13, 2022
ba431e1
Move to correct dir
drroe Jan 13, 2022
ddbbf7b
Reduce # frames for initial testing
drroe Jan 13, 2022
432871f
Dramatically improve cluster Summary execution time by creating an array
drroe Jan 14, 2022
bea29b6
Have ClusterDist take boolean array of frame is present
drroe Jan 14, 2022
ca37bca
Remove some debug info
drroe Jan 17, 2022
055376c
Benchmark silhouette
drroe Jan 17, 2022
bdb11d7
Use frameIsPresent arrays in silhouette calc
drroe Jan 17, 2022
a234fa3
Clean up and better-document the Sieve class
drroe Jan 17, 2022
560d837
Rewrite the frame is present array as being present inside Sieve, this
drroe Jan 17, 2022
b98b867
Benchmark the bestreps calc
drroe Jan 17, 2022
cfc493f
Add a debug number of frames
drroe Jan 18, 2022
608f201
Add some debug print, commented out
drroe Jan 18, 2022
535450c
Use frameIsPresent array instead of HasFrame. Make it a warning if no
drroe Jan 18, 2022
5ec0ca5
Pass in the FrameIsPresent array for bestreps calc. Add part to Node for
drroe Jan 18, 2022
fe84b94
Get rid of FIXME, fixed
drroe Jan 18, 2022
66c2534
Start new Silhouette class
drroe Jan 18, 2022
ac59abe
Add Silhouette class
drroe Jan 18, 2022
033bb50
Silhouette calc is now in a separate class
drroe Jan 18, 2022
8111d33
Silhouette write moved to Silhouette
drroe Jan 18, 2022
9f9ea05
Use new Silhouette calc
drroe Jan 18, 2022
92f6d1a
Add missing depend for <list>
drroe Jan 18, 2022
acee551
Write out actual frame numbers for cluster silhouette for each frame.
drroe Jan 18, 2022
9cfdd9f
Add option to choose either the sorted silhouette index or frame number
drroe Jan 18, 2022
80a0fc8
Sorted frame indices have been fixed for silhouette
drroe Jan 18, 2022
c53f557
Minor bump to 6.2.0. Fixes for post-clustering speed (summary etc), fix
drroe Jan 18, 2022
355a39c
Add Silhouette.cpp to CMakeLists.txt
drroe Jan 18, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 68 additions & 0 deletions devtools/benchmark/cluster/cluster.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#!/bin/bash

CPPTRAJ=`which cpptraj`
#export OMP_NUM_THREADS=12
#CPPTRAJ=`which cpptraj.OMP`

TOP=~/Cpptraj/Cpptraj-ExtendedTests/ChainA_1-268_NAD_TCL-gaff.tip3p.parm7
TRJ=~/Cpptraj/Cpptraj-ExtendedTests/run9.nc # 2000 frames
COUNT=2000

# Each trajectory is 100000 frames
if [ ! -f '../framelist.sh' ] ; then
echo "no ../framelist.sh"
exit 1
fi
source ../framelist.sh

for TOTAL in $FRAMELIST ; do
if [ -f 'CpptrajPairDist' ] ; then
rm CpptrajPairDist
fi
cat > cpptraj.in <<EOF
set PREFIX = test1
parm $TOP
EOF
# Set trajin
echo " ===== $TOTAL ===== "
if [ $TOTAL -le $COUNT ] ; then
echo "trajin $TRJ 1 $TOTAL 1" >> cpptraj.in
else
((NTRAJ = $TOTAL / $COUNT))
((REM = $TOTAL % $COUNT))
if [ $REM -gt 0 ] ; then
((NTRAJ++))
fi
echo "$NTRAJ $REM"
for (( i=1; i < $NTRAJ; i++ )) ; do
echo "trajin $TRJ 1 $COUNT 1" >> cpptraj.in
done
if [ $REM -gt 0 ] ; then
echo "trajin $TRJ 1 $REM 1" >> cpptraj.in
else
echo "trajin $TRJ 1 $COUNT 1" >> cpptraj.in
fi
fi

# Get number of frames we want from any given trajectory
((NFRAMES = $TOTAL / 4))
# Calculate split frames
((SPLIT1 = $NFRAMES * 1))
((SPLIT2 = $NFRAMES * 2))
((SPLIT3 = $NFRAMES * 3))
echo "Total $TOTAL, Each section $NFRAMES splits $SPLIT1 $SPLIT2 $SPLIT3"

cat >> cpptraj.in <<EOF

cluster \
kmeans clusters 5 randompoint kseed 42 \
rms :10-260@CA sieve 20 random sieveseed 23 savepairdist \
summaryhalf \$PREFIX.split.dat splitframe $SPLIT1,$SPLIT2,$SPLIT3 \
info \$PREFIX.info.dat \
summary \$PREFIX.summary.dat \
sil \$PREFIX.Sil \
cpopvtime \$PREFIX.cpop.agr normframe \
bestrep cumulative_nosieve
EOF
$CPPTRAJ -i cpptraj.in -o cpptraj.$TOTAL.out
done
6 changes: 6 additions & 0 deletions devtools/benchmark/cluster/framelist.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# source this

FRAMELIST0="1000 2000 3000 4000 5000 6000 7000 8000 9000 10000"
#FRAMELIST1="20000 30000 40000 50000"
FRAMELIST="$FRAMELIST0 $FRAMELIST1"
#FRAMELIST='3000'
18 changes: 18 additions & 0 deletions devtools/benchmark/cluster/plot.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/bin/bash


rm time.pw.dat time.cluster.dat time.restore.dat time.total.dat time.summary.dat \
time.calcoutput.dat time.bestrep.dat
source ../framelist.sh
for TOTAL in $FRAMELIST ; do
OUTFILE=cpptraj.$TOTAL.out
grep TIME $OUTFILE | awk -v nframes=$TOTAL '{
if ($2 == "Pairwise") printf("%i %f\n", nframes, $5) >> "time.pw.dat";
if ($2 == "Clustering") printf("%i %f\n", nframes, $4) >> "time.cluster.dat";
if ($4 == "restore") printf("%i %f\n", nframes, $6) >> "time.restore.dat";
if ($2 == "Analyses") printf("%i %f\n", nframes, $4) >> "time.total.dat";
if ($2 == "Summary") printf("%i %f\n", nframes, $5) >> "time.summary.dat";
if ($2 == "Output") printf("%i %f\n", nframes, $5) >> "time.calcoutput.dat";
if ($2 == "Find") printf("%i %f\n", nframes, $6) >> "time.bestrep.dat";
}'
done
31 changes: 28 additions & 3 deletions doc/cpptraj.lyx
Original file line number Diff line number Diff line change
Expand Up @@ -40389,11 +40389,11 @@ cluster
\end_layout

\begin_layout LyX-Code
[clustersvtime <file> [cvtwindow <#>]] [sil <prefix>] [metricstats <file>]
[clustersvtime <file> [cvtwindow <#>]] [sil <prefix> [silidx {idx|frm}]]
\end_layout

\begin_layout LyX-Code
[cpopvtime <file> [{normpop|normframe}]] [lifetime]
[metricstats <file>] [cpopvtime <file> [{normpop|normframe}]] [lifetime]
\end_layout

\begin_layout LyX-Code
Expand Down Expand Up @@ -41378,6 +41378,25 @@ uster.dat' and cluster silhouette value for each individual frame to '<prefix>.f
me.dat'.
\end_layout

\begin_deeper
\begin_layout Description
[silidx
\begin_inset space ~
\end_inset

{idx|frm}] Choose what indices to write to the cluster silhouette frame
file:
\series bold
idx
\series default
(the default) specifies the sorted index (starting from 0),
\series bold
frm
\series default
specifies the actual frame number.
\end_layout

\end_deeper
\begin_layout Description
[metricstats
\begin_inset space ~
Expand Down Expand Up @@ -41791,7 +41810,13 @@ The cluster silhouette is a measure of how well each point fits within a
Values of -1 indicate the point is dissimilar and may fit better in a neighbori
ng cluster.
Values of 0 indicate the point is on a border between two clusters.

The file containing the cluster silhouette value for each frame can be
viewed directly with the XMGRACE plotting program using the default (sorted
by silhouette value) index ('
\series bold
silidx idx
\series default
').
\end_layout

\begin_layout Subsubsection*
Expand Down
3 changes: 2 additions & 1 deletion src/Cluster/Algorithm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
double Cpptraj::Cluster::Algorithm::ClusterDistance(Node const& C1, Node const& C2,
MetricArray& pmatrix,
bool includeSieved,
Cframes const& sievedOut) const
std::vector<bool> const& frameIsPresent)
const
{
if (C1.Cent().empty() || C2.Cent().empty()) {
mprinterr("Internal Error: One or both centroids are null in ClusterDistance().\n");
Expand Down
3 changes: 2 additions & 1 deletion src/Cluster/Algorithm.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#ifndef INC_CLUSTER_ALGORITHM_H
#define INC_CLUSTER_ALGORITHM_H
#include <vector>
class ArgList;
class CpptrajFile;
namespace Cpptraj {
Expand Down Expand Up @@ -27,7 +28,7 @@ class Algorithm {
virtual void Timing(double) const = 0;
/// /return Algorithm-specific between-cluster distance. Default to centroid distance.
virtual double ClusterDistance(Node const&, Node const&, MetricArray&,
bool, Cframes const&) const;
bool, std::vector<bool> const&) const;
/// /return Epsilon for density-based algorithms; intended for use with sieve restore.
virtual double Epsilon() const { return 0.0; }
// -------------------------------------------
Expand Down
1 change: 1 addition & 0 deletions src/Cluster/Algorithm_DPeaks.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include <cmath> // fabs
#include <algorithm> // sort
#include <list>
#include "Algorithm_DPeaks.h"
#include "List.h"
#include "MetricArray.h"
Expand Down
7 changes: 4 additions & 3 deletions src/Cluster/Algorithm_HierAgglo.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include <limits> // double max
#include <list>
#include "Algorithm_HierAgglo.h"
#include "MetricArray.h"
#include "Node.h"
Expand Down Expand Up @@ -346,7 +347,7 @@ void Cpptraj::Cluster::Algorithm_HierAgglo::calcAvgDist(List::cluster_it& C1_it,
double Cpptraj::Cluster::Algorithm_HierAgglo::ClusterDistance(Node const& C1, Node const& C2,
MetricArray& pmatrix,
bool includeSieved,
Cframes const& sievedOut)
std::vector<bool> const& frameIsPresent)
const
{
double dval = -1.0;
Expand Down Expand Up @@ -376,10 +377,10 @@ const
// No sieved frames included.
for (Node::frame_iterator f1 = C1.beginframe(); f1 != C1.endframe(); ++f1)
{
if (!sievedOut.HasFrame(*f1)) {
if (frameIsPresent[*f1]) {
for (Node::frame_iterator f2 = C2.beginframe(); f2 != C2.endframe(); ++f2)
{
if (!sievedOut.HasFrame(*f2)) {
if (frameIsPresent[*f2]) {
double Dist = pmatrix.Frame_Distance(*f1, *f2);
//mprintf("\t\t\tFrame %i to frame %i = %f\n",*c1frames,*c2frames,Dist);
switch (linkage_) {
Expand Down
2 changes: 1 addition & 1 deletion src/Cluster/Algorithm_HierAgglo.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ class Algorithm_HierAgglo : public Algorithm {
int DoClustering(List&, Cframes const&, MetricArray&);
void Timing(double) const;
double ClusterDistance(Node const&, Node const&, MetricArray&,
bool, Cframes const&) const;
bool, std::vector<bool> const&) const;
private:
void buildInitialClusters(List&, Cframes const&, MetricArray&);
//void InitializeClusterDistances();
Expand Down
1 change: 1 addition & 0 deletions src/Cluster/Algorithm_Kmeans.cpp
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#include <list>
#include "Algorithm_Kmeans.h"
#include "Cframes.h"
#include "List.h"
Expand Down
55 changes: 39 additions & 16 deletions src/Cluster/BestReps.cpp
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#include <list>
#include <vector>
#include "BestReps.h"
#include "List.h"
#include "MetricArray.h"
Expand Down Expand Up @@ -61,7 +63,7 @@ void Cpptraj::Cluster::BestReps::PrintBestReps(Node const& node) {

/** Find best representative frames for each cluster. */
int Cpptraj::Cluster::BestReps::FindBestRepFrames(List& clusters, MetricArray& pmatrix,
Cframes const& sievedFrames)
std::vector<bool> const& frameIsPresent)
const
{
int err = 0;
Expand All @@ -73,7 +75,7 @@ const
err = FindBestRepFrames_Centroid(clusters, pmatrix);
break;
case CUMULATIVE_NOSIEVE:
err = FindBestRepFrames_NoSieve_CumulativeDist(clusters, pmatrix, sievedFrames);
err = FindBestRepFrames_NoSieve_CumulativeDist(clusters, pmatrix, frameIsPresent);
break;
case NO_REPS:
mprintf("Warning: Skipping best representative frame calc.\n");
Expand All @@ -94,7 +96,7 @@ const

/** Find best representative frames for given node. */
int Cpptraj::Cluster::BestReps::FindBestRepFrames(Node& node, MetricArray& pmatrix,
Cframes const& sievedFrames)
std::vector<bool> const& frameIsPresent)
const
{
int err = 0;
Expand All @@ -106,7 +108,7 @@ const
err = FindBestRepFrames_Centroid(node, pmatrix);
break;
case CUMULATIVE_NOSIEVE:
err = FindBestRepFrames_NoSieve_CumulativeDist(node, pmatrix, sievedFrames);
err = FindBestRepFrames_NoSieve_CumulativeDist(node, pmatrix, frameIsPresent);
break;
case NO_REPS:
mprintf("Warning: Skipping best representative frame calc.\n");
Expand Down Expand Up @@ -229,28 +231,42 @@ const
*/
int Cpptraj::Cluster::BestReps::
FindBestRepFrames_NoSieve_CumulativeDist(Node& node, MetricArray& pmatrix,
Cframes const& sievedFrames)
std::vector<bool> const& frameIsPresent)
const
{
int err = 0;
RepMap bestReps;
// npresent will be used to count how many frames from 'node' are actually
// present. If all frames in node are not present (i.e. were sieved out)
// this is not actually an error.
int npresent = 0;
for (Node::frame_iterator f1 = node.beginframe(); f1 != node.endframe(); ++f1)
{
if (!sievedFrames.HasFrame( *f1 )) {
if (frameIsPresent[ *f1 ]) {
npresent++;
double cdist = 0.0;
for (Node::frame_iterator f2 = node.beginframe(); f2 != node.endframe(); ++f2)
{
if (f1 != f2 && !sievedFrames.HasFrame( *f2 ))
if (f1 != f2 && frameIsPresent[ *f2 ])
//cdist += pmatrix.Cache().CachedDistance(*f1, *f2); // TODO benchmark the two ways
cdist += pmatrix.Frame_Distance(*f1, *f2);
}
SaveBestRep(bestReps, RepPair(cdist, *f1), nToSave_);
}
}
if (bestReps.empty()) {
mprinterr("Error: Could not determine represenative frame for cluster %i\n",
node.Num());
err++;
if (npresent == 0) {
if (node.Part() == -1)
mprintf("Warning: Could not determine representative frame for cluster %i\n", node.Num());
else
mprintf("Warning: Could not determine representative frame for cluster %i part %i\n",
node.Num(), node.Part());
mprintf("Warning: All frames in cluster were sieved out.\n");
} else {
mprinterr("Error: Could not determine representative frame by cumulative distance\n"
"Error: (ignoring sieved frames) for cluster %i\n", node.Num());
err++;
}
}
SetBestRepFrame( node, bestReps );
return err;
Expand All @@ -262,14 +278,21 @@ const
*/
int Cpptraj::Cluster::BestReps::
FindBestRepFrames_NoSieve_CumulativeDist(List& clusters, MetricArray& pmatrix,
Cframes const& sievedFrames)
std::vector<bool> const& frameIsPresent)
const
{
if (sievedFrames.size() > 0)
mprintf("Warning: Ignoring sieved frames while looking for best representative.\n");
int err = 0;
for (List::cluster_it node = clusters.begin(); node != clusters.end(); ++node) {
err += FindBestRepFrames_NoSieve_CumulativeDist(*node, pmatrix, sievedFrames);
if (frameIsPresent.empty()) {
mprinterr("Error: Requested no sieved frames while looking for best representative\n"
"Error: but frameIsPresent array is empty.\n");
for (List::cluster_it node = clusters.begin(); node != clusters.end(); ++node) {
err += FindBestRepFrames_CumulativeDist(*node, pmatrix);
}
} else {
mprintf("Warning: Ignoring sieved frames while looking for best representative.\n");
for (List::cluster_it node = clusters.begin(); node != clusters.end(); ++node) {
err += FindBestRepFrames_NoSieve_CumulativeDist(*node, pmatrix, frameIsPresent);
}
}
return err;
}
Expand All @@ -292,7 +315,7 @@ const
SaveBestRep(bestReps, RepPair(dist, *f1), nToSave_);
}
if (bestReps.empty()) {
mprinterr("Error: Could not determine represenative frame for cluster %i\n",
mprinterr("Error: Could not determine represenative frame by centroid for cluster %i\n",
node.Num());
err++;
}
Expand Down
9 changes: 4 additions & 5 deletions src/Cluster/BestReps.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
#include <map>
namespace Cpptraj {
namespace Cluster {
class Cframes;
class List;
class MetricArray;
class Node;
Expand All @@ -18,9 +17,9 @@ class BestReps {
/// Initialize best rep frames search with method type, # to save, and debug level
int InitBestReps(RepMethodType, int, int);
/// Find best rep frames for each cluster in given list
int FindBestRepFrames(List&, MetricArray&, Cframes const&) const;
int FindBestRepFrames(List&, MetricArray&, std::vector<bool> const&) const;
/// Find the best rep frames for given node
int FindBestRepFrames(Node&, MetricArray&, Cframes const&) const;
int FindBestRepFrames(Node&, MetricArray&, std::vector<bool> const&) const;
private:
/// Print best reps to stdout
static void PrintBestReps(Node const&);
Expand All @@ -40,9 +39,9 @@ class BestReps {
/// Find best representative frames by shortest distance to all other frames.
int FindBestRepFrames_CumulativeDist(List&, MetricArray&) const;
/// Find best rep frames in node by shortest distance, ignoring sieved frames.
int FindBestRepFrames_NoSieve_CumulativeDist(Node&, MetricArray&, Cframes const&) const;
int FindBestRepFrames_NoSieve_CumulativeDist(Node&, MetricArray&, std::vector<bool> const&) const;
/// Find best representative frames by shortest distance, ignoring sieved frames.
int FindBestRepFrames_NoSieve_CumulativeDist(List&, MetricArray&, Cframes const&) const;
int FindBestRepFrames_NoSieve_CumulativeDist(List&, MetricArray&, std::vector<bool> const&) const;
/// Find best rep frames in node by shortest distance to centroid.
int FindBestRepFrames_Centroid(Node&, MetricArray&) const;
/// Find best representative frames by shortest distance to centroid.
Expand Down
1 change: 1 addition & 0 deletions src/Cluster/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ set(CLUSTER_SOURCES
Output.cpp
Results_Coords.cpp
Sieve.cpp
Silhouette.cpp
)

#------------------------------------------------------------------------------------------
Expand Down
Loading