diff --git a/devtools/benchmark/cluster/cluster.sh b/devtools/benchmark/cluster/cluster.sh new file mode 100755 index 0000000000..d310814bb3 --- /dev/null +++ b/devtools/benchmark/cluster/cluster.sh @@ -0,0 +1,68 @@ +#!/bin/bash + +CPPTRAJ=`which cpptraj` +#export OMP_NUM_THREADS=12 +#CPPTRAJ=`which cpptraj.OMP` + +TOP=~/Cpptraj/Cpptraj-ExtendedTests/ChainA_1-268_NAD_TCL-gaff.tip3p.parm7 +TRJ=~/Cpptraj/Cpptraj-ExtendedTests/run9.nc # 2000 frames +COUNT=2000 + +# Each trajectory is 100000 frames +if [ ! -f '../framelist.sh' ] ; then + echo "no ../framelist.sh" + exit 1 +fi +source ../framelist.sh + +for TOTAL in $FRAMELIST ; do + if [ -f 'CpptrajPairDist' ] ; then + rm CpptrajPairDist + fi + cat > cpptraj.in <> cpptraj.in + else + ((NTRAJ = $TOTAL / $COUNT)) + ((REM = $TOTAL % $COUNT)) + if [ $REM -gt 0 ] ; then + ((NTRAJ++)) + fi + echo "$NTRAJ $REM" + for (( i=1; i < $NTRAJ; i++ )) ; do + echo "trajin $TRJ 1 $COUNT 1" >> cpptraj.in + done + if [ $REM -gt 0 ] ; then + echo "trajin $TRJ 1 $REM 1" >> cpptraj.in + else + echo "trajin $TRJ 1 $COUNT 1" >> cpptraj.in + fi + fi + + # Get number of frames we want from any given trajectory + ((NFRAMES = $TOTAL / 4)) + # Calculate split frames + ((SPLIT1 = $NFRAMES * 1)) + ((SPLIT2 = $NFRAMES * 2)) + ((SPLIT3 = $NFRAMES * 3)) + echo "Total $TOTAL, Each section $NFRAMES splits $SPLIT1 $SPLIT2 $SPLIT3" + + cat >> cpptraj.in <> "time.pw.dat"; + if ($2 == "Clustering") printf("%i %f\n", nframes, $4) >> "time.cluster.dat"; + if ($4 == "restore") printf("%i %f\n", nframes, $6) >> "time.restore.dat"; + if ($2 == "Analyses") printf("%i %f\n", nframes, $4) >> "time.total.dat"; + if ($2 == "Summary") printf("%i %f\n", nframes, $5) >> "time.summary.dat"; + if ($2 == "Output") printf("%i %f\n", nframes, $5) >> "time.calcoutput.dat"; + if ($2 == "Find") printf("%i %f\n", nframes, $6) >> "time.bestrep.dat"; + }' +done diff --git a/doc/cpptraj.lyx b/doc/cpptraj.lyx index c3883e2633..0b4634d98d 100644 --- a/doc/cpptraj.lyx +++ b/doc/cpptraj.lyx @@ -40389,11 +40389,11 @@ cluster \end_layout \begin_layout LyX-Code - [clustersvtime [cvtwindow <#>]] [sil ] [metricstats ] + [clustersvtime [cvtwindow <#>]] [sil [silidx {idx|frm}]] \end_layout \begin_layout LyX-Code - [cpopvtime [{normpop|normframe}]] [lifetime] + [metricstats ] [cpopvtime [{normpop|normframe}]] [lifetime] \end_layout \begin_layout LyX-Code @@ -41378,6 +41378,25 @@ uster.dat' and cluster silhouette value for each individual frame to '.f me.dat'. \end_layout +\begin_deeper +\begin_layout Description +[silidx +\begin_inset space ~ +\end_inset + +{idx|frm}] Choose what indices to write to the cluster silhouette frame + file: +\series bold +idx +\series default + (the default) specifies the sorted index (starting from 0), +\series bold +frm +\series default + specifies the actual frame number. +\end_layout + +\end_deeper \begin_layout Description [metricstats \begin_inset space ~ @@ -41791,7 +41810,13 @@ The cluster silhouette is a measure of how well each point fits within a Values of -1 indicate the point is dissimilar and may fit better in a neighbori ng cluster. Values of 0 indicate the point is on a border between two clusters. - + The file containing the cluster silhouette value for each frame can be + viewed directly with the XMGRACE plotting program using the default (sorted + by silhouette value) index (' +\series bold +silidx idx +\series default +'). \end_layout \begin_layout Subsubsection* diff --git a/src/Cluster/Algorithm.cpp b/src/Cluster/Algorithm.cpp index 7d75a8c813..d90d42f8d4 100644 --- a/src/Cluster/Algorithm.cpp +++ b/src/Cluster/Algorithm.cpp @@ -7,7 +7,8 @@ double Cpptraj::Cluster::Algorithm::ClusterDistance(Node const& C1, Node const& C2, MetricArray& pmatrix, bool includeSieved, - Cframes const& sievedOut) const + std::vector const& frameIsPresent) +const { if (C1.Cent().empty() || C2.Cent().empty()) { mprinterr("Internal Error: One or both centroids are null in ClusterDistance().\n"); diff --git a/src/Cluster/Algorithm.h b/src/Cluster/Algorithm.h index 21297c997a..84460f9a0f 100644 --- a/src/Cluster/Algorithm.h +++ b/src/Cluster/Algorithm.h @@ -1,5 +1,6 @@ #ifndef INC_CLUSTER_ALGORITHM_H #define INC_CLUSTER_ALGORITHM_H +#include class ArgList; class CpptrajFile; namespace Cpptraj { @@ -27,7 +28,7 @@ class Algorithm { virtual void Timing(double) const = 0; /// /return Algorithm-specific between-cluster distance. Default to centroid distance. virtual double ClusterDistance(Node const&, Node const&, MetricArray&, - bool, Cframes const&) const; + bool, std::vector const&) const; /// /return Epsilon for density-based algorithms; intended for use with sieve restore. virtual double Epsilon() const { return 0.0; } // ------------------------------------------- diff --git a/src/Cluster/Algorithm_DPeaks.cpp b/src/Cluster/Algorithm_DPeaks.cpp index 5499fe5517..882e4ac5a1 100644 --- a/src/Cluster/Algorithm_DPeaks.cpp +++ b/src/Cluster/Algorithm_DPeaks.cpp @@ -1,5 +1,6 @@ #include // fabs #include // sort +#include #include "Algorithm_DPeaks.h" #include "List.h" #include "MetricArray.h" diff --git a/src/Cluster/Algorithm_HierAgglo.cpp b/src/Cluster/Algorithm_HierAgglo.cpp index 6ea0c39e6c..39defa273f 100644 --- a/src/Cluster/Algorithm_HierAgglo.cpp +++ b/src/Cluster/Algorithm_HierAgglo.cpp @@ -1,4 +1,5 @@ #include // double max +#include #include "Algorithm_HierAgglo.h" #include "MetricArray.h" #include "Node.h" @@ -346,7 +347,7 @@ void Cpptraj::Cluster::Algorithm_HierAgglo::calcAvgDist(List::cluster_it& C1_it, double Cpptraj::Cluster::Algorithm_HierAgglo::ClusterDistance(Node const& C1, Node const& C2, MetricArray& pmatrix, bool includeSieved, - Cframes const& sievedOut) + std::vector const& frameIsPresent) const { double dval = -1.0; @@ -376,10 +377,10 @@ const // No sieved frames included. for (Node::frame_iterator f1 = C1.beginframe(); f1 != C1.endframe(); ++f1) { - if (!sievedOut.HasFrame(*f1)) { + if (frameIsPresent[*f1]) { for (Node::frame_iterator f2 = C2.beginframe(); f2 != C2.endframe(); ++f2) { - if (!sievedOut.HasFrame(*f2)) { + if (frameIsPresent[*f2]) { double Dist = pmatrix.Frame_Distance(*f1, *f2); //mprintf("\t\t\tFrame %i to frame %i = %f\n",*c1frames,*c2frames,Dist); switch (linkage_) { diff --git a/src/Cluster/Algorithm_HierAgglo.h b/src/Cluster/Algorithm_HierAgglo.h index 1d8ee42fb6..6fc88fa549 100644 --- a/src/Cluster/Algorithm_HierAgglo.h +++ b/src/Cluster/Algorithm_HierAgglo.h @@ -19,7 +19,7 @@ class Algorithm_HierAgglo : public Algorithm { int DoClustering(List&, Cframes const&, MetricArray&); void Timing(double) const; double ClusterDistance(Node const&, Node const&, MetricArray&, - bool, Cframes const&) const; + bool, std::vector const&) const; private: void buildInitialClusters(List&, Cframes const&, MetricArray&); //void InitializeClusterDistances(); diff --git a/src/Cluster/Algorithm_Kmeans.cpp b/src/Cluster/Algorithm_Kmeans.cpp index a1d4e317bf..bac1f927de 100644 --- a/src/Cluster/Algorithm_Kmeans.cpp +++ b/src/Cluster/Algorithm_Kmeans.cpp @@ -1,3 +1,4 @@ +#include #include "Algorithm_Kmeans.h" #include "Cframes.h" #include "List.h" diff --git a/src/Cluster/BestReps.cpp b/src/Cluster/BestReps.cpp index 3db4172a33..e50984cb3d 100644 --- a/src/Cluster/BestReps.cpp +++ b/src/Cluster/BestReps.cpp @@ -1,3 +1,5 @@ +#include +#include #include "BestReps.h" #include "List.h" #include "MetricArray.h" @@ -61,7 +63,7 @@ void Cpptraj::Cluster::BestReps::PrintBestReps(Node const& node) { /** Find best representative frames for each cluster. */ int Cpptraj::Cluster::BestReps::FindBestRepFrames(List& clusters, MetricArray& pmatrix, - Cframes const& sievedFrames) + std::vector const& frameIsPresent) const { int err = 0; @@ -73,7 +75,7 @@ const err = FindBestRepFrames_Centroid(clusters, pmatrix); break; case CUMULATIVE_NOSIEVE: - err = FindBestRepFrames_NoSieve_CumulativeDist(clusters, pmatrix, sievedFrames); + err = FindBestRepFrames_NoSieve_CumulativeDist(clusters, pmatrix, frameIsPresent); break; case NO_REPS: mprintf("Warning: Skipping best representative frame calc.\n"); @@ -94,7 +96,7 @@ const /** Find best representative frames for given node. */ int Cpptraj::Cluster::BestReps::FindBestRepFrames(Node& node, MetricArray& pmatrix, - Cframes const& sievedFrames) + std::vector const& frameIsPresent) const { int err = 0; @@ -106,7 +108,7 @@ const err = FindBestRepFrames_Centroid(node, pmatrix); break; case CUMULATIVE_NOSIEVE: - err = FindBestRepFrames_NoSieve_CumulativeDist(node, pmatrix, sievedFrames); + err = FindBestRepFrames_NoSieve_CumulativeDist(node, pmatrix, frameIsPresent); break; case NO_REPS: mprintf("Warning: Skipping best representative frame calc.\n"); @@ -229,18 +231,23 @@ const */ int Cpptraj::Cluster::BestReps:: FindBestRepFrames_NoSieve_CumulativeDist(Node& node, MetricArray& pmatrix, - Cframes const& sievedFrames) + std::vector const& frameIsPresent) const { int err = 0; RepMap bestReps; + // npresent will be used to count how many frames from 'node' are actually + // present. If all frames in node are not present (i.e. were sieved out) + // this is not actually an error. + int npresent = 0; for (Node::frame_iterator f1 = node.beginframe(); f1 != node.endframe(); ++f1) { - if (!sievedFrames.HasFrame( *f1 )) { + if (frameIsPresent[ *f1 ]) { + npresent++; double cdist = 0.0; for (Node::frame_iterator f2 = node.beginframe(); f2 != node.endframe(); ++f2) { - if (f1 != f2 && !sievedFrames.HasFrame( *f2 )) + if (f1 != f2 && frameIsPresent[ *f2 ]) //cdist += pmatrix.Cache().CachedDistance(*f1, *f2); // TODO benchmark the two ways cdist += pmatrix.Frame_Distance(*f1, *f2); } @@ -248,9 +255,18 @@ const } } if (bestReps.empty()) { - mprinterr("Error: Could not determine represenative frame for cluster %i\n", - node.Num()); - err++; + if (npresent == 0) { + if (node.Part() == -1) + mprintf("Warning: Could not determine representative frame for cluster %i\n", node.Num()); + else + mprintf("Warning: Could not determine representative frame for cluster %i part %i\n", + node.Num(), node.Part()); + mprintf("Warning: All frames in cluster were sieved out.\n"); + } else { + mprinterr("Error: Could not determine representative frame by cumulative distance\n" + "Error: (ignoring sieved frames) for cluster %i\n", node.Num()); + err++; + } } SetBestRepFrame( node, bestReps ); return err; @@ -262,14 +278,21 @@ const */ int Cpptraj::Cluster::BestReps:: FindBestRepFrames_NoSieve_CumulativeDist(List& clusters, MetricArray& pmatrix, - Cframes const& sievedFrames) + std::vector const& frameIsPresent) const { - if (sievedFrames.size() > 0) - mprintf("Warning: Ignoring sieved frames while looking for best representative.\n"); int err = 0; - for (List::cluster_it node = clusters.begin(); node != clusters.end(); ++node) { - err += FindBestRepFrames_NoSieve_CumulativeDist(*node, pmatrix, sievedFrames); + if (frameIsPresent.empty()) { + mprinterr("Error: Requested no sieved frames while looking for best representative\n" + "Error: but frameIsPresent array is empty.\n"); + for (List::cluster_it node = clusters.begin(); node != clusters.end(); ++node) { + err += FindBestRepFrames_CumulativeDist(*node, pmatrix); + } + } else { + mprintf("Warning: Ignoring sieved frames while looking for best representative.\n"); + for (List::cluster_it node = clusters.begin(); node != clusters.end(); ++node) { + err += FindBestRepFrames_NoSieve_CumulativeDist(*node, pmatrix, frameIsPresent); + } } return err; } @@ -292,7 +315,7 @@ const SaveBestRep(bestReps, RepPair(dist, *f1), nToSave_); } if (bestReps.empty()) { - mprinterr("Error: Could not determine represenative frame for cluster %i\n", + mprinterr("Error: Could not determine represenative frame by centroid for cluster %i\n", node.Num()); err++; } diff --git a/src/Cluster/BestReps.h b/src/Cluster/BestReps.h index 6d4dfb33bc..b4902e8b55 100644 --- a/src/Cluster/BestReps.h +++ b/src/Cluster/BestReps.h @@ -3,7 +3,6 @@ #include namespace Cpptraj { namespace Cluster { -class Cframes; class List; class MetricArray; class Node; @@ -18,9 +17,9 @@ class BestReps { /// Initialize best rep frames search with method type, # to save, and debug level int InitBestReps(RepMethodType, int, int); /// Find best rep frames for each cluster in given list - int FindBestRepFrames(List&, MetricArray&, Cframes const&) const; + int FindBestRepFrames(List&, MetricArray&, std::vector const&) const; /// Find the best rep frames for given node - int FindBestRepFrames(Node&, MetricArray&, Cframes const&) const; + int FindBestRepFrames(Node&, MetricArray&, std::vector const&) const; private: /// Print best reps to stdout static void PrintBestReps(Node const&); @@ -40,9 +39,9 @@ class BestReps { /// Find best representative frames by shortest distance to all other frames. int FindBestRepFrames_CumulativeDist(List&, MetricArray&) const; /// Find best rep frames in node by shortest distance, ignoring sieved frames. - int FindBestRepFrames_NoSieve_CumulativeDist(Node&, MetricArray&, Cframes const&) const; + int FindBestRepFrames_NoSieve_CumulativeDist(Node&, MetricArray&, std::vector const&) const; /// Find best representative frames by shortest distance, ignoring sieved frames. - int FindBestRepFrames_NoSieve_CumulativeDist(List&, MetricArray&, Cframes const&) const; + int FindBestRepFrames_NoSieve_CumulativeDist(List&, MetricArray&, std::vector const&) const; /// Find best rep frames in node by shortest distance to centroid. int FindBestRepFrames_Centroid(Node&, MetricArray&) const; /// Find best representative frames by shortest distance to centroid. diff --git a/src/Cluster/CMakeLists.txt b/src/Cluster/CMakeLists.txt index c9ad5d9ceb..75edebfd73 100644 --- a/src/Cluster/CMakeLists.txt +++ b/src/Cluster/CMakeLists.txt @@ -25,6 +25,7 @@ set(CLUSTER_SOURCES Output.cpp Results_Coords.cpp Sieve.cpp + Silhouette.cpp ) #------------------------------------------------------------------------------------------ diff --git a/src/Cluster/Control.cpp b/src/Cluster/Control.cpp index c596b327f3..5da5691689 100644 --- a/src/Cluster/Control.cpp +++ b/src/Cluster/Control.cpp @@ -1,3 +1,4 @@ +#include #include "Control.h" #include "Output.h" #include "../ArgList.h" @@ -206,10 +207,10 @@ const char* Cpptraj::Cluster::Control::OutputArgs2_ = "[summarysplit ] [splitframe ]"; const char* Cpptraj::Cluster::Control::OutputArgs3_ = - "[clustersvtime [cvtwindow <#>]] [sil ] [metricstats ]"; + "[clustersvtime [cvtwindow <#>]] [sil [silidx {idx|frm}]]"; const char* Cpptraj::Cluster::Control::OutputArgs4_ = - "[cpopvtime [{normpop|normframe}]] [lifetime]"; + "[metricstats ] [cpopvtime [{normpop|normframe}]] [lifetime]"; const char* Cpptraj::Cluster::Control::GraphArgs_ = "[{drawgraph|drawgraph3d} [draw_tol ] [draw_maxit // std::max #include // double max +#include #include "List.h" #include "MetricArray.h" #include "Node.h" @@ -409,103 +409,3 @@ double Cpptraj::Cluster::List::ComputePseudoF(double& SSRSST, MetricArray& metri return pseudof; } - -/** The cluster silhouette is a measure of how well each point fits within - * a cluster. Values of 1 indicate the point is very similar to other points - * in the cluster, i.e. it is well-clustered. Values of -1 indicate the point - * is dissimilar and may fit better in a neighboring cluster. Values of 0 - * indicate the point is on a border between two clusters. - */ -int Cpptraj::Cluster::List::CalcSilhouette(MetricArray& metrics, - Cframes const& sievedFrames, - bool includeSieved) -{ - for (cluster_it Ci = begin(); Ci != end(); ++Ci) - { - Node::SilPairArray& SiVals = Ci->FrameSilhouettes(); - SiVals.clear(); - SiVals.reserve( Ci->Nframes() ); - double avg_si = 0.0; - int ci_frames = 0; - for (Node::frame_iterator f1 = Ci->beginframe(); f1 != Ci->endframe(); ++f1) - { - if (includeSieved || !sievedFrames.HasFrame( *f1 )) { - // Calculate the average dissimilarity of this frame with all other - // points in this frames cluster. - double ai = 0.0; - int self_frames = 0; - if (includeSieved) { - for (Node::frame_iterator f2 = Ci->beginframe(); f2 != Ci->endframe(); ++f2) - { - if (f1 != f2) { - ai += metrics.Frame_Distance(*f1, *f2); - ++self_frames; - } - } - } else { - for (Node::frame_iterator f2 = Ci->beginframe(); f2 != Ci->endframe(); ++f2) - { - if (f1 != f2 && !sievedFrames.HasFrame(*f2)) { - ai += metrics.Frame_Distance(*f1, *f2); - ++self_frames; - } - } - } - if (self_frames > 0) - ai /= (double)self_frames; - //mprintf("\t\tFrame %i cluster %i ai = %g\n", *f1+1, Ci->Num(), ai); - // Determine lowest average dissimilarity of this frame with all - // other clusters. - double min_bi = std::numeric_limits::max(); - for (cluster_iterator Cj = begincluster(); Cj != endcluster(); ++Cj) - { - if (Ci != Cj) - { - double bi = 0.0; - // NOTE: ASSUMING NO EMPTY CLUSTERS - if (includeSieved) { - for (Node::frame_iterator f2 = Cj->beginframe(); f2 != Cj->endframe(); ++f2) - bi += metrics.Frame_Distance(*f1, *f2); - bi /= (double)Cj->Nframes(); - } else { - int cj_frames = 0; - for (Node::frame_iterator f2 = Cj->beginframe(); f2 != Cj->endframe(); ++f2) - { - if (!sievedFrames.HasFrame(*f2)) { - bi += metrics.Frame_Distance(*f1, *f2); - ++cj_frames; - } - } - bi /= (double)cj_frames; - } - //mprintf("\t\tFrame %i to cluster %i bi = %g\n", *f1 + 1, Cj->Num(), bi); - if (bi < min_bi) - min_bi = bi; - } - } - double max_ai_bi = std::max( ai, min_bi ); - if (max_ai_bi == 0.0) - mprinterr("Error: Divide by zero in silhouette calculation for frame %i\n", *f1 + 1); - else { - double si = (min_bi - ai) / max_ai_bi; - SiVals.push_back( Node::SilPair(*f1, si) ); - avg_si += si; - ++ci_frames; - } - } // END if frame should be calcd - } // END loop over cluster frames - //std::sort( SiVals.begin(), SiVals.end() ); - // DEBUG - if (debug_ > 1) { - mprintf("DEBUG: Cluster frame silhouette values for cluster %i\n", Ci->Num()); - for (Node::SilPairArray::const_iterator it = Ci->FrameSilhouettes().begin(); - it != Ci->FrameSilhouettes().end(); ++it) - mprintf("\t%8i %g\n", it->first+1, it->second); - } - if (ci_frames > 0) - avg_si /= (double)ci_frames; - //mprintf("DEBUG: Cluster silhouette: %8i %g\n", Ci->Num(), avg_si); - Ci->SetSilhouette( avg_si ); - } // END outer loop over clusters - return 0; -} diff --git a/src/Cluster/List.h b/src/Cluster/List.h index 713d9fe976..8492c3281f 100644 --- a/src/Cluster/List.h +++ b/src/Cluster/List.h @@ -1,6 +1,6 @@ #ifndef INC_CLUSTER_LIST_H #define INC_CLUSTER_LIST_H -#include +//#include #include "Cframes.h" class DataSet_integer; namespace Cpptraj { @@ -67,8 +67,6 @@ class List { double ComputeDBI(std::vector&, MetricArray&) const; /// Calculate pseudo-F double ComputePseudoF(double&, MetricArray&) const; - /// Calculate cluster and cluster frame silhouettes TODO data sets - int CalcSilhouette(MetricArray&, Cframes const&, bool); private: typedef std::list Narray; Narray clusters_; ///< Hold all clusters. diff --git a/src/Cluster/Metric_RMS.cpp b/src/Cluster/Metric_RMS.cpp index 47c8eeb01c..9569248a37 100644 --- a/src/Cluster/Metric_RMS.cpp +++ b/src/Cluster/Metric_RMS.cpp @@ -104,6 +104,7 @@ void Cpptraj::Cluster::Metric_RMS::CalculateCentroid(Centroid* centIn, Cframes cent->Cframe() += frm1_; } } + //mprintf("DEBUG: Metric_RMS::CalculateCentroid divide by %zu\n", cframesIn.size()); cent->Cframe().Divide( (double)cframesIn.size() ); //mprintf("\t\tFirst 3 centroid coords (of %i): %f %f %f\n", cent->Cframe().Natom(), // cent->cent->Cframe()[0], cent->Cframe()[1],cent->Cframe()[2]); diff --git a/src/Cluster/Node.cpp b/src/Cluster/Node.cpp index 9cfaf1e83f..a1fb9c7e36 100644 --- a/src/Cluster/Node.cpp +++ b/src/Cluster/Node.cpp @@ -1,4 +1,5 @@ //#include // DBL_MAX +#include #include "Node.h" #include "MetricArray.h" #include "../DataSet_float.h" @@ -6,10 +7,10 @@ // CONSTRUCTOR Cpptraj::Cluster::Node::Node() : - avgSil_(0), eccentricity_(0), refRms_(0), num_(-1), + part_(-1), needsUpdate_(true) {} @@ -25,6 +26,7 @@ Cpptraj::Cluster::Node::Node(MetricArray& Cdist, Cframes const& frameListIn, int bestReps_(1, RepPair(frameListIn.front(), 0.0)), eccentricity_(0.0), num_(numIn), + part_(-1), needsUpdate_(true) { Cdist.NewCentroid( centroids_, frameListIn ); @@ -37,6 +39,7 @@ Cpptraj::Cluster::Node::Node(const Node& rhs) : bestReps_( rhs.bestReps_ ), eccentricity_( rhs.eccentricity_ ), num_( rhs.num_ ), + part_( rhs.part_ ), needsUpdate_( rhs.needsUpdate_ ) { } @@ -45,6 +48,7 @@ Cpptraj::Cluster::Node& Cpptraj::Cluster::Node::operator=(const Node& rhs) { if (&rhs == this) return *this; eccentricity_ = rhs.eccentricity_; num_ = rhs.num_; + part_ = rhs.part_; bestReps_ = rhs.bestReps_; frameList_ = rhs.frameList_; centroids_ = rhs.centroids_; diff --git a/src/Cluster/Node.h b/src/Cluster/Node.h index f2a0ef0a9a..40a503b0ce 100644 --- a/src/Cluster/Node.h +++ b/src/Cluster/Node.h @@ -1,8 +1,8 @@ #ifndef INC_CLUSTER_NODE_H #define INC_CLUSTER_NODE_H -#include -#include // std::pair -#include +//#include +//#include // std::pair +//#include #include "CentroidArray.h" #include "Cframes.h" // Cframes::const_iterator class DataSet_integer; @@ -36,10 +36,6 @@ class Node { typedef std::pair RepPair; /// Used to hold a list of representative frames/scores typedef std::vector RepPairArray; - /// Used to pair frame numbers with silhouette values. - typedef std::pair SilPair; - /// Used to hold list of frame numbers/silhouette values. - typedef std::vector SilPairArray; /// Used to sort clusters by # of frames in cluster inline bool operator<(const Node&) const; @@ -59,6 +55,8 @@ class Node { double Eccentricity() const { return eccentricity_; } /// \return internal cluster number int Num() const { return num_; } + /// \return internal cluster part + int Part() const { return part_; } /// \return number of frames in cluster. int Nframes() const { return (int)frameList_.size(); } /// \return best representative frame number, or -1 if no best rep set. @@ -78,10 +76,6 @@ class Node { bool HasFrame(int f) const { return frameList_.HasFrame(f); } /// Access representative frame list, const RepPairArray const& BestReps() const { return bestReps_; } - /// Access frame silhouette list - SilPairArray const& FrameSilhouettes() const { return frameSil_; } - /// \return cluster silhoueete vaule - double Silhouette() const { return avgSil_; } /// Calculate centroid of members of this cluster. void CalculateCentroid(MetricArray&); @@ -89,12 +83,10 @@ class Node { void AddFrameToCluster(int fnum) { frameList_.push_back( fnum ); } /// Set cluster number (for bookkeeping). void SetNum(int numIn) { num_ = numIn; } + /// Set cluster part (for bookkeeping). + void SetPart(int partIn) { part_ = partIn; } /// Access representative frame list RepPairArray& BestReps() { return bestReps_; } - /// Access frame silhouette list - SilPairArray& FrameSilhouettes() { return frameSil_; } - /// Set cluster silhouette value - void SetSilhouette(double s) { avgSil_ = s; } /// Sort internal frame list void SortFrameList() { frameList_.Sort(); } /// Remove specified frame from cluster if present. @@ -119,11 +111,10 @@ class Node { CentroidArray centroids_; ///< Centroids (1 for each metric) for all frames in this cluster. std::string name_; ///< Cluster name assigned from reference. RepPairArray bestReps_; ///< Hold best representative frames and their score. - SilPairArray frameSil_; ///< Frame silhouette values. - double avgSil_; ///< Average silhouette value for cluster TODO s.d. as well? - double eccentricity_; ///< Maximum distance between any 2 frames. + double eccentricity_; ///< Maximum distance between any 2 frames. // TODO does this need to be stored? double refRms_; ///< Cluster rms to reference (if assigned). int num_; ///< Cluster number, used for bookkeeping. + int part_; ///< What part of trajectory cluster is in (for splitframes) bool needsUpdate_; ///< True if internal metrics need updating (e.g. after frames added). }; // ----- INLINE FUNCTIONS ------------------------------------------------------ diff --git a/src/Cluster/Output.cpp b/src/Cluster/Output.cpp index 65ef5390bc..b084204f30 100644 --- a/src/Cluster/Output.cpp +++ b/src/Cluster/Output.cpp @@ -1,3 +1,5 @@ +#include +#include #include // sqrt #include // sort, max #include "Output.h" @@ -9,6 +11,7 @@ #include "../Matrix.h" #include "../CpptrajFile.h" #include "../CpptrajStdio.h" +#include "../Timer.h" // DEBUG // XMGRACE colors const char* XMGRACE_COLOR[] = { @@ -111,48 +114,6 @@ void Cpptraj::Cluster::Output::PrintClustersToFile(CpptrajFile& outfile, outfile.CloseFile(); } -/// For sorting cluster frame silhouettes by silhouette value. -struct sort_by_sil_val { - typedef Cpptraj::Cluster::Node::SilPair Bpair; - inline bool operator()(Bpair const& p0, Bpair const& p1) - { - if (p0.second == p1.second) - return (p0.first < p1.first); - else - return (p0.second < p1.second); - } -}; - -/** Print cluster silhouette frame values, sorted by silhouette. */ -int Cpptraj::Cluster::Output::PrintSilhouetteFrames(CpptrajFile& Ffile, List const& clusters) -{ - // TODO different ways of writing out cluster frame silhouettes - unsigned int idx = 0; - for (List::cluster_iterator Ci = clusters.begincluster(); - Ci != clusters.endcluster(); ++Ci, ++idx) - { - Ffile.Printf("#C%-6i %10s\n", Ci->Num(), "Silhouette"); - Node::SilPairArray spaTemp = Ci->FrameSilhouettes(); - std::sort( spaTemp.begin(), spaTemp.end(), sort_by_sil_val() ); - for (Node::SilPairArray::const_iterator it = spaTemp.begin(); - it != spaTemp.end(); ++it, ++idx) - Ffile.Printf("%8u %g\n", idx, it->second); - Ffile.Printf("\n"); - } - return 0; -} - -/** Print average cluster silhouette values. */ -int Cpptraj::Cluster::Output::PrintSilhouettes(CpptrajFile& Cfile, List const& clusters) -{ - // TODO is it ok to assume clusters are in order? - Cfile.Printf("%-8s %10s\n", "#Cluster", ""); - for (List::cluster_iterator Ci = clusters.begincluster(); - Ci != clusters.endcluster(); ++Ci) - Cfile.Printf("%8i %g\n", Ci->Num(), Ci->Silhouette()); - return 0; -} - /** Quick pass through clusters to determine max width of cluster names. */ unsigned int Cpptraj::Cluster::Output::DetermineNameWidth(List const& clusters) { @@ -168,8 +129,13 @@ int Cpptraj::Cluster::Output::Summary(CpptrajFile& outfile, List const& clusters Algorithm const& algorithm, MetricArray& pmatrix, bool includeSieved, bool includeSieveCdist, - Cframes const& sievedOut) + std::vector const& frameIsPresent) { +/* Timer t_total; // DEBUG + Timer t_fdist; // DEBUG + Timer t_cdist; // DEBUG + t_total.Start(); // DEBUG + mprintf("DEBUG:\tincludeSieved=%i includeSieveCdist=%i\n", (int)includeSieved, (int)includeSieveCdist);*/ double fmax = (double)pmatrix.Ntotal(); //if (FrameDistances().SieveValue() != 1 && !includeSieveInAvg) // mprintf("Warning: Within cluster average distance (AvgDist) does not include sieved frames.\n"); @@ -187,9 +153,8 @@ int Cpptraj::Cluster::Output::Summary(CpptrajFile& outfile, List const& clusters outfile.Printf(" %*s %8s", nWidth, "Name", "RMS"); } outfile.Printf("\n"); - //Timer t_fdist; // DEBUG - //Timer t_cdist; // DEBUG - //t_cdist.Start(); + +// t_cdist.Start(); // DEBUG // Calculate distances between clusters. Matrix cluster_distances; cluster_distances.resize( 0, clusters.Nclusters() ); @@ -197,9 +162,11 @@ int Cpptraj::Cluster::Output::Summary(CpptrajFile& outfile, List const& clusters for (List::cluster_iterator c2 = c1; c2 != clusters.endcluster(); ++c2) if (c2 != c1) cluster_distances.addElement( algorithm.ClusterDistance( *c1, *c2, pmatrix, - includeSieveCdist, sievedOut ) ); - //t_cdist.Stop(); + includeSieveCdist, + frameIsPresent ) ); +// t_cdist.Stop(); // DEBUG +// t_fdist.Start(); // DEBUG unsigned int idx1 = 0; for (List::cluster_iterator node = clusters.begincluster(); node != clusters.endcluster(); ++node, ++idx1) @@ -239,9 +206,9 @@ int Cpptraj::Cluster::Output::Summary(CpptrajFile& outfile, List const& clusters } } else { for (Node::frame_iterator f1 = node->beginframe(); f1 != node->endframe(); ++f1) { - if (!sievedOut.HasFrame( *f1 )) { + if (frameIsPresent[ *f1 ]) { for (Node::frame_iterator f2 = f1 + 1; f2 != node->endframe(); ++f2) { - if (!sievedOut.HasFrame( *f2 )) { + if (frameIsPresent[ *f2 ]) { double dist = pmatrix.Frame_Distance(*f1, *f2); internalAvg += dist; internalSD += (dist * dist); @@ -279,8 +246,11 @@ int Cpptraj::Cluster::Output::Summary(CpptrajFile& outfile, List const& clusters outfile.Printf(" %*s %8.3f", nWidth, node->Cname().c_str(), node->RefRms()); outfile.Printf("\n"); } // END loop over clusters - //t_cdist.WriteTiming(1, "Between-cluster distance calc."); - //t_fdist.WriteTiming(1, "Within-cluster distance calc."); +/* t_fdist.Stop(); // DEBUG + t_total.Stop(); // DEBUG + t_cdist.WriteTiming(1, "Between-cluster distance calc.", t_total.Total()); // DEBUG + t_fdist.WriteTiming(1, "Within-cluster distance calc.", t_total.Total()); // DEBUG + t_total.WriteTiming(0, "Calc of summary total"); // DEBUG */ return 0; } @@ -291,7 +261,7 @@ void Cpptraj::Cluster::Output::Summary_Part(CpptrajFile& outfile, List const& clusters, BestReps const& findBestReps, MetricArray& pmatrix, - Cframes const& framesToCluster) + std::vector const& frameIsPresent) { // If no split frames were specified, use halfway point. Cframes actualSplitFrames; @@ -383,12 +353,19 @@ void Cpptraj::Cluster::Output::Summary_Part(CpptrajFile& outfile, std::fill( numInPart.begin(), numInPart.end(), 0 ); std::fill( firstFrame.begin(), firstFrame.end(), -1 ); std::vector clusterPart(actualSplitFrames.size() + 1); + // Set cluster number for all parts to the overall cluster number + for (std::vector::iterator cpit = clusterPart.begin(); + cpit != clusterPart.end(); ++cpit) + { + cpit->SetNum( node->Num() ); + cpit->SetPart( cpit - clusterPart.begin() + 1 ); + } // DEBUG //mprintf("\tCluster %i\n",node->num); // Count how many frames are in each part. for (Node::frame_iterator frame1 = node->beginframe(); - frame1 != node->endframe(); - frame1++) + frame1 != node->endframe(); + frame1++) { unsigned int bin = actualSplitFrames.size(); for (unsigned int sf = 0; sf < actualSplitFrames.size(); ++sf) { @@ -414,21 +391,27 @@ void Cpptraj::Cluster::Output::Summary_Part(CpptrajFile& outfile, outfile.Printf(" %8i", *ff); // Print best reps for each part. // TODO handle case when clusters dont have same number best reps - for (std::vector::iterator node = clusterPart.begin(); - node != clusterPart.end(); ++node) + for (std::vector::iterator partNode = clusterPart.begin(); + partNode != clusterPart.end(); ++partNode) { - if (node->Nframes() == 0) { + //mprintf("\tDetermining best representative frames for cluster %i part %li\n", + // node->Num(), partNode-clusterPart.begin()+1); + if (partNode->Nframes() == 0) { outfile.Printf(" %8i", -1); } else { // Since we just created these clusters, ensure centroid is updated - node->CalculateCentroid( pmatrix ); - findBestReps.FindBestRepFrames(*node, pmatrix, framesToCluster); + //mprintf("DEBUG: Calc. centroid for cluster %i part %li\n", node->Num(), partNode-clusterPart.begin()+1); + partNode->CalculateCentroid( pmatrix ); + findBestReps.FindBestRepFrames(*partNode, pmatrix, frameIsPresent); // TODO handle multiple best reps? - //if (node->BestReps().size() < 2) - outfile.Printf(" %8i", node->BestRepFrame()+1); + int partBestRepFrame = partNode->BestRepFrame(); + // If a best rep was actually found, increment by 1 for user frame # + if (partBestRepFrame > -1) + partBestRepFrame++; + outfile.Printf(" %8i", partBestRepFrame); //else { - // for (Node::RepPairArray::const_iterator rep = node->BestReps().begin(); - // rep != node->BestReps().end(); ++rep) + // for (Node::RepPairArray::const_iterator rep = partNode->BestReps().begin(); + // rep != partNode->BestReps().end(); ++rep) // outfile.Printf(" %8i %8.3f", rep->first+1, rep->second); //} } diff --git a/src/Cluster/Output.h b/src/Cluster/Output.h index 2491710327..033e212f53 100644 --- a/src/Cluster/Output.h +++ b/src/Cluster/Output.h @@ -13,12 +13,10 @@ class Output { public: static void PrintClustersToFile(CpptrajFile&, List const&, Algorithm const&, MetricArray&, int, Cframes const&); - static int PrintSilhouetteFrames(CpptrajFile&, List const&); - static int PrintSilhouettes(CpptrajFile&, List const&); static int Summary(CpptrajFile&, List const&, Algorithm const&, MetricArray&, - bool, bool, Cframes const&); + bool, bool, std::vector const&); static void Summary_Part(CpptrajFile&, unsigned int, Cframes const&, List const&, - BestReps const&, MetricArray&, Cframes const&); + BestReps const&, MetricArray&, std::vector const&); private: static unsigned int DetermineNameWidth(List const&); }; diff --git a/src/Cluster/Sieve.cpp b/src/Cluster/Sieve.cpp index aa95b8afeb..776fc149a6 100644 --- a/src/Cluster/Sieve.cpp +++ b/src/Cluster/Sieve.cpp @@ -2,6 +2,13 @@ #include "../DataSet_PairwiseCache.h" #include "../Random.h" +/** CONSTRUCTOR */ +Cpptraj::Cluster::Sieve::Sieve() : + type_(NONE), + sieve_(1) +{} + +/** Determine the sieving type based on the given sieve value. */ void Cpptraj::Cluster::Sieve::DetermineTypeFromSieve( int sieveIn ) { sieve_ = sieveIn; // Determine sieve type from sieve value. @@ -20,6 +27,7 @@ int Cpptraj::Cluster::Sieve::SetFramesToCluster(int sieveIn, std::size_t maxFram // Sanity check. Should never be called with maxFrames < 1 if (maxFrames < 1) return 1; DetermineTypeFromSieve( sieveIn ); + frameIsPresent_.clear(); framesToCluster_.clear(); sievedOut_.clear(); // --------------------------------------------- @@ -88,6 +96,7 @@ int Cpptraj::Cluster::Sieve::SetupFromCache(DataSet_PairwiseCache const& cache, //mprinterr("Error: Cannot setup frames to cluster from empty cache.\n"); return 1; } + frameIsPresent_.clear(); framesToCluster_.clear(); sievedOut_.clear(); DetermineTypeFromSieve( cache.SieveVal() ); @@ -104,3 +113,25 @@ int Cpptraj::Cluster::Sieve::SetupFromCache(DataSet_PairwiseCache const& cache, sievedOut_.push_back( frm ); return 0; } + +/** Clear the sieve. */ +void Cpptraj::Cluster::Sieve::Clear() { + framesToCluster_.clear(); + sievedOut_.clear(); + frameIsPresent_.clear(); + type_ = NONE; + sieve_ = 1; +} + +/** Create an array containing 'true' for frames that are present, 'false' otherwise. + */ +void Cpptraj::Cluster::Sieve::GenerateFrameIsPresentArray() { + if (frameIsPresent_.empty()) { + // Start everything out as false TODO search for max val? + frameIsPresent_.assign( framesToCluster_.size() + sievedOut_.size(), false ); + // Set true for frames to cluster + for (Cframes::const_iterator it = framesToCluster_.begin(); + it != framesToCluster_.end(); ++it) + frameIsPresent_[*it] = true; + } +} diff --git a/src/Cluster/Sieve.h b/src/Cluster/Sieve.h index 6a7878773b..3ad24b0869 100644 --- a/src/Cluster/Sieve.h +++ b/src/Cluster/Sieve.h @@ -9,21 +9,35 @@ namespace Cluster { class Sieve { public: enum SieveType { NONE=0, REGULAR, RANDOM }; - - Sieve() : type_(NONE), sieve_(1) {} - + /// CONSTRUCTOR - default no sieve + Sieve(); + /// Set which frames to cluster based on given sieve value, max# frames, and seed int SetFramesToCluster(int, std::size_t, int); + /// Set which frames to cluster based on whats in the given pairwise cache and max# frames int SetupFromCache( DataSet_PairwiseCache const&, std::size_t ); - void Clear() { framesToCluster_.clear(); sievedOut_.clear(); type_ = NONE; sieve_ = 1; } + /// Clear the Sieve + void Clear(); + /// Create an array containing true for present frames, false otherwise + void GenerateFrameIsPresentArray(); - Cframes const& FramesToCluster() const { return framesToCluster_; } - Cframes const& SievedOut() const { return sievedOut_; } - int SieveValue() const { return sieve_; } + /// \return Array of frames to cluster (i.e. are present) + Cframes const& FramesToCluster() const { return framesToCluster_; } + /// \return Array of frames sieved out (i.e. not present) + Cframes const& SievedOut() const { return sievedOut_; } + /// \return An array containing true for present frames, false otherwise. Empty if not generated. + std::vector const& FrameIsPresent() const { return frameIsPresent_; } + /// \return Sieve value used to set up sieve + int SieveValue() const { return sieve_; } + /// \return Sieve type + SieveType Type() const { return type_; } private: void DetermineTypeFromSieve(int); + typedef std::vector Barray; + Cframes framesToCluster_; ///< Frames to cluster. Cframes sievedOut_; ///< Frames that will not be clustered (i.e. sieved out). + Barray frameIsPresent_; ///< True if frame is present, false otherwise. SieveType type_; ///< Sieveing type. int sieve_; ///< Sieving value. }; diff --git a/src/Cluster/Silhouette.cpp b/src/Cluster/Silhouette.cpp new file mode 100644 index 0000000000..720b6dad33 --- /dev/null +++ b/src/Cluster/Silhouette.cpp @@ -0,0 +1,183 @@ +#include // std::max, std::sort +#include // double max +#include +#include "Silhouette.h" +#include "List.h" +#include "MetricArray.h" +#include "Node.h" +#include "../CpptrajStdio.h" +#include "../CpptrajFile.h" + +using namespace Cpptraj::Cluster; + +/** CONSTRUCTOR - set debug level */ +Silhouette::Silhouette(int dbgIn) : + debug_(dbgIn), + silIdxType_(IDX_NOT_SPECIFIED) +{} + +/** Initialize silhouette calc. */ +int Silhouette::Init(IdxType silIdxTypeIn) { + silIdxType_ = silIdxTypeIn; + return 0; +} + +/** The cluster silhouette is a measure of how well each point fits within + * a cluster. Values of 1 indicate the point is very similar to other points + * in the cluster, i.e. it is well-clustered. Values of -1 indicate the point + * is dissimilar and may fit better in a neighboring cluster. Values of 0 + * indicate the point is on a border between two clusters. + */ +int Silhouette::CalcSilhouette(List const& clusters, + MetricArray& metrics, + std::vector const& frameIsPresent, + bool includeSieved) +{ + clusterFrameSil_.clear(); + clusterAvgSil_.clear(); + // Loop over all clusters + for (List::cluster_iterator Ci = clusters.begincluster(); Ci != clusters.endcluster(); ++Ci) + { + clusterFrameSil_.push_back( SilPairArray() ); + SilPairArray& SiVals = clusterFrameSil_.back(); + SiVals.reserve( Ci->Nframes() ); + double avg_si = 0.0; + int ci_frames = 0; + for (Node::frame_iterator f1 = Ci->beginframe(); f1 != Ci->endframe(); ++f1) + { + if (includeSieved || frameIsPresent[ *f1 ]) { + // Calculate the average dissimilarity of this frame with all other + // points in this frames cluster. + double ai = 0.0; + int self_frames = 0; + if (includeSieved) { + for (Node::frame_iterator f2 = Ci->beginframe(); f2 != Ci->endframe(); ++f2) + { + if (f1 != f2) { + ai += metrics.Frame_Distance(*f1, *f2); + ++self_frames; + } + } + } else { + for (Node::frame_iterator f2 = Ci->beginframe(); f2 != Ci->endframe(); ++f2) + { + if (f1 != f2 && frameIsPresent[ *f2 ]) { + ai += metrics.Frame_Distance(*f1, *f2); + ++self_frames; + } + } + } + if (self_frames > 0) + ai /= (double)self_frames; + //mprintf("\t\tFrame %i cluster %i ai = %g\n", *f1+1, Ci->Num(), ai); + // Determine lowest average dissimilarity of this frame with all + // other clusters. + double min_bi = std::numeric_limits::max(); + for (List::cluster_iterator Cj = clusters.begincluster(); Cj != clusters.endcluster(); ++Cj) + { + if (Ci != Cj) + { + double bi = 0.0; + // NOTE: ASSUMING NO EMPTY CLUSTERS + if (includeSieved) { + for (Node::frame_iterator f2 = Cj->beginframe(); f2 != Cj->endframe(); ++f2) + bi += metrics.Frame_Distance(*f1, *f2); + bi /= (double)Cj->Nframes(); + } else { + int cj_frames = 0; + for (Node::frame_iterator f2 = Cj->beginframe(); f2 != Cj->endframe(); ++f2) + { + if (frameIsPresent[ *f2 ]) { + bi += metrics.Frame_Distance(*f1, *f2); + ++cj_frames; + } + } + bi /= (double)cj_frames; + } + //mprintf("\t\tFrame %i to cluster %i bi = %g\n", *f1 + 1, Cj->Num(), bi); + if (bi < min_bi) + min_bi = bi; + } + } + double max_ai_bi = std::max( ai, min_bi ); + if (max_ai_bi == 0.0) + mprinterr("Error: Divide by zero in silhouette calculation for frame %i\n", *f1 + 1); + else { + double si = (min_bi - ai) / max_ai_bi; + SiVals.push_back( SilPair(*f1, si) ); + avg_si += si; + ++ci_frames; + } + } // END if frame should be calcd + } // END loop over cluster frames + //std::sort( SiVals.begin(), SiVals.end() ); + // DEBUG + if (debug_ > 1) { + mprintf("DEBUG: Cluster frame silhouette values for cluster %i\n", Ci->Num()); + for (SilPairArray::const_iterator it = SiVals.begin(); it != SiVals.end(); ++it) + mprintf("\t%8i %g\n", it->first+1, it->second); + } + if (ci_frames > 0) + avg_si /= (double)ci_frames; + //mprintf("DEBUG: Cluster silhouette: %8i %g\n", Ci->Num(), avg_si); + clusterAvgSil_.push_back( avg_si ); + } // END outer loop over clusters + return 0; +} + +/** Print an error message if the number of clusters does not match. */ +int Silhouette::numMismatchErr(const char* desc, unsigned int nclusters) const { + if (nclusters != clusterFrameSil_.size()) { + mprinterr("Internal Error: During '%s', # given clusters (%u) != clusterFrameSil size (%zu)\n", + desc, nclusters, clusterFrameSil_.size()); + return 1; + } + if (nclusters != clusterAvgSil_.size()) { + mprinterr("Internal Error: During '%s', # given clusters (%u) != clusterAvgSil size (%zu)\n", + desc, nclusters, clusterAvgSil_.size()); + return 1; + } + return 0; +} + +/** Print cluster silhouette frame values, sorted by silhouette. */ +int Silhouette::PrintSilhouetteFrames(CpptrajFile& Ffile, List const& clusters) +const +{ + if (numMismatchErr("PrintSilhouetteFrames", clusters.Nclusters())) return 1; + // TODO different ways of writing out cluster frame silhouettes, like sort index? + unsigned int idx = 0; + List::cluster_iterator Ci = clusters.begincluster(); + for (SilFrameArray::const_iterator it = clusterFrameSil_.begin(); + it != clusterFrameSil_.end(); ++it, ++Ci) + { + Ffile.Printf("#C%-6i %10s\n", Ci->Num(), "Silhouette"); + SilPairArray spaTemp = *it; + std::sort( spaTemp.begin(), spaTemp.end(), sort_by_sil_val() ); + for (SilPairArray::const_iterator jt = spaTemp.begin(); + jt != spaTemp.end(); ++jt, ++idx) + { + if (silIdxType_ == IDX_FRAME) + Ffile.Printf("%8i %g\n", jt->first + 1, jt->second); + else // IDX_SORTED, IDX_NOT_SPECIFIED + Ffile.Printf("%8u %g\n", idx, jt->second); + } + Ffile.Printf("\n"); + } + return 0; +} + +/** Print average cluster silhouette values. */ +int Silhouette::PrintAvgSilhouettes(CpptrajFile& Cfile, List const& clusters) +const +{ + if (numMismatchErr("PrintAvgSilhouettes", clusters.Nclusters())) return 1; + // TODO is it ok to assume clusters are in order? + Cfile.Printf("%-8s %10s\n", "#Cluster", ""); + List::cluster_iterator Ci = clusters.begincluster(); + for (Darray::const_iterator it = clusterAvgSil_.begin(); + it != clusterAvgSil_.end(); ++it, ++Ci) + Cfile.Printf("%8i %g\n", Ci->Num(), *it); + return 0; +} + diff --git a/src/Cluster/Silhouette.h b/src/Cluster/Silhouette.h new file mode 100644 index 0000000000..24e0fc4c42 --- /dev/null +++ b/src/Cluster/Silhouette.h @@ -0,0 +1,70 @@ +#ifndef INC_CLUSTER_SILHOUETTE_H +#define INC_CLUSTER_SILHOUETTE_H +#include +#include // std::pair +class CpptrajFile; +namespace Cpptraj { +namespace Cluster { +class List; +class MetricArray; +/// Cluster silhouette calculation +class Silhouette { + /// Used to pair frame numbers with silhouette values. + typedef std::pair SilPair; + /// Used to hold list of frame numbers/silhouette values. + typedef std::vector SilPairArray; + + public: + /// CONSTRUCTOR - set debug level + Silhouette(int); + + /// Used to determine what indices will be printed out for frame silhouette values + enum IdxType { IDX_SORTED = 0, IDX_FRAME, IDX_NOT_SPECIFIED }; + + /// Used to hold SilPairArray for each cluster + typedef std::vector SilFrameArray; + /// Generic double array + typedef std::vector Darray; + + /// Initialize with frame silhouette index type + int Init(IdxType); + + /// \return Array containing frame silhouette values of each frame for every cluster + SilFrameArray const& SilhouetteFrameArray() const { return clusterFrameSil_; } + /// \return Array containing average silhouette values for every cluster + Darray const& AvgSilhouetteArray() const { return clusterAvgSil_; } + + /// Print Silhouette frame values to file. + int PrintSilhouetteFrames(CpptrajFile&, List const&) const; + /// Print average Silhouette values to file. + int PrintAvgSilhouettes(CpptrajFile&, List const&) const; + + /// Calculate silhouette values for each cluster + int CalcSilhouette(List const&, MetricArray&, std::vector const&, bool); + private: + /// \return 1 if given # clusters does not match what is in Silhouette + int numMismatchErr(const char*, unsigned int) const; + + /// For sorting cluster frame silhouettes by silhouette value. + struct sort_by_sil_val { + inline bool operator()(SilPair const& p0, SilPair const& p1) + { + if (p0.second == p1.second) + return (p0.first < p1.first); + else + return (p0.second < p1.second); + } + }; + + /// For each cluster, hold silhouette values for each frame in the cluster. + SilFrameArray clusterFrameSil_; + /// For each cluster, hold avg. silhouette value for cluster (TODO s.d. as well?) + Darray clusterAvgSil_; + /// debug level + int debug_; + /// Control what indices will be used when printing frame silhouette values + IdxType silIdxType_; +}; +} +} +#endif diff --git a/src/Cluster/clusterdepend b/src/Cluster/clusterdepend index cf51e1e619..909eb6cec8 100644 --- a/src/Cluster/clusterdepend +++ b/src/Cluster/clusterdepend @@ -9,7 +9,7 @@ Centroid_Coord.o : Centroid_Coord.cpp ../Atom.h ../AtomMask.h ../Box.h ../Coordi Cframes.o : Cframes.cpp Cframes.h Cmatrix_Binary.o : Cmatrix_Binary.cpp ../CpptrajFile.h ../CpptrajStdio.h ../FileIO.h ../FileName.h ../Parallel.h Cframes.h Cmatrix_Binary.h Cmatrix_NC.o : Cmatrix_NC.cpp ../CpptrajStdio.h ../FileName.h ../NC_Routines.h Cframes.h Cmatrix_NC.h -Control.o : Control.cpp ../ArgList.h ../ArrayIterator.h ../AssociatedData.h ../Atom.h ../AtomMask.h ../AtomType.h ../BaseIOtype.h ../Box.h ../BufferedLine.h ../Cluster/Cframes.h ../Constants.h ../CoordinateInfo.h ../CpptrajFile.h ../CpptrajStdio.h ../DataFile.h ../DataFileList.h ../DataSet.h ../DataSetList.h ../DataSet_1D.h ../DataSet_Coords.h ../DataSet_Coords_REF.h ../DataSet_PairwiseCache.h ../DataSet_float.h ../DataSet_integer.h ../Dimension.h ../FileIO.h ../FileName.h ../FileTypes.h ../Frame.h ../MaskToken.h ../Matrix.h ../Matrix_3x3.h ../MetaData.h ../Molecule.h ../NameType.h ../Parallel.h ../ParameterHolders.h ../ParameterSet.h ../ParameterTypes.h ../Random.h ../Range.h ../ReferenceFrame.h ../ReplicaDimArray.h ../Residue.h ../Segment.h ../SymbolExporting.h ../TextFormat.h ../Timer.h ../Topology.h ../TrajectoryFile.h ../TypeNameHolder.h ../Unit.h ../Vec3.h Algorithm.h Algorithm_DBscan.h Algorithm_DPeaks.h Algorithm_HierAgglo.h Algorithm_Kmeans.h BestReps.h CentroidArray.h Cframes.h Control.h DrawGraph.h DynamicMatrix.h List.h Metric.h MetricArray.h Node.h Output.h Results.h Results_Coords.h Sieve.h +Control.o : Control.cpp ../ArgList.h ../ArrayIterator.h ../AssociatedData.h ../Atom.h ../AtomMask.h ../AtomType.h ../BaseIOtype.h ../Box.h ../BufferedLine.h ../Cluster/Cframes.h ../Constants.h ../CoordinateInfo.h ../CpptrajFile.h ../CpptrajStdio.h ../DataFile.h ../DataFileList.h ../DataSet.h ../DataSetList.h ../DataSet_1D.h ../DataSet_Coords.h ../DataSet_Coords_REF.h ../DataSet_PairwiseCache.h ../DataSet_float.h ../DataSet_integer.h ../Dimension.h ../FileIO.h ../FileName.h ../FileTypes.h ../Frame.h ../MaskToken.h ../Matrix.h ../Matrix_3x3.h ../MetaData.h ../Molecule.h ../NameType.h ../Parallel.h ../ParameterHolders.h ../ParameterSet.h ../ParameterTypes.h ../Random.h ../Range.h ../ReferenceFrame.h ../ReplicaDimArray.h ../Residue.h ../Segment.h ../SymbolExporting.h ../TextFormat.h ../Timer.h ../Topology.h ../TrajectoryFile.h ../TypeNameHolder.h ../Unit.h ../Vec3.h Algorithm.h Algorithm_DBscan.h Algorithm_DPeaks.h Algorithm_HierAgglo.h Algorithm_Kmeans.h BestReps.h CentroidArray.h Cframes.h Control.h DrawGraph.h DynamicMatrix.h List.h Metric.h MetricArray.h Node.h Output.h Results.h Results_Coords.h Sieve.h Silhouette.h DrawGraph.o : DrawGraph.cpp ../AssociatedData.h ../Atom.h ../Constants.h ../CpptrajFile.h ../CpptrajStdio.h ../DataSet.h ../DataSet_1D.h ../Dimension.h ../FileIO.h ../FileName.h ../MetaData.h ../NameType.h ../PDBfile.h ../Parallel.h ../Range.h ../Residue.h ../SymbolExporting.h ../TextFormat.h ../Vec3.h Cframes.h DrawGraph.h Metric.h MetricArray.h DynamicMatrix.o : DynamicMatrix.cpp ../ArrayIterator.h ../CpptrajStdio.h ../Matrix.h DynamicMatrix.h List.o : List.cpp ../AssociatedData.h ../Constants.h ../CpptrajFile.h ../CpptrajStdio.h ../DataSet.h ../DataSet_1D.h ../DataSet_integer.h ../Dimension.h ../FileIO.h ../FileName.h ../MetaData.h ../Parallel.h ../ProgressBar.h ../Range.h ../TextFormat.h CentroidArray.h Cframes.h List.h Metric.h MetricArray.h Node.h @@ -20,6 +20,7 @@ Metric_SRMSD.o : Metric_SRMSD.cpp ../ArrayIterator.h ../AssociatedData.h ../Atom Metric_Scalar.o : Metric_Scalar.cpp ../AssociatedData.h ../CpptrajFile.h ../CpptrajStdio.h ../DataSet.h ../DataSet_1D.h ../Dimension.h ../FileIO.h ../FileName.h ../MetaData.h ../Parallel.h ../Range.h ../TextFormat.h Centroid.h Centroid_Num.h Cframes.h Metric.h Metric_Scalar.h Metric_Torsion.o : Metric_Torsion.cpp ../AssociatedData.h ../Constants.h ../CpptrajFile.h ../CpptrajStdio.h ../DataSet.h ../DataSet_1D.h ../Dimension.h ../FileIO.h ../FileName.h ../MetaData.h ../Parallel.h ../Range.h ../TextFormat.h Centroid.h Centroid_Num.h Cframes.h Metric.h Metric_Torsion.h Node.o : Node.cpp ../AssociatedData.h ../CpptrajFile.h ../DataSet.h ../DataSet_1D.h ../DataSet_float.h ../DataSet_integer.h ../Dimension.h ../FileIO.h ../FileName.h ../MetaData.h ../Parallel.h ../Range.h ../TextFormat.h CentroidArray.h Cframes.h Metric.h MetricArray.h Node.h -Output.o : Output.cpp ../ArrayIterator.h ../CpptrajFile.h ../CpptrajStdio.h ../FileIO.h ../FileName.h ../Matrix.h ../Parallel.h Algorithm.h BestReps.h CentroidArray.h Cframes.h List.h Metric.h MetricArray.h Node.h Output.h +Output.o : Output.cpp ../ArrayIterator.h ../CpptrajFile.h ../CpptrajStdio.h ../FileIO.h ../FileName.h ../Matrix.h ../Parallel.h ../Timer.h Algorithm.h BestReps.h CentroidArray.h Cframes.h List.h Metric.h MetricArray.h Node.h Output.h Results_Coords.o : Results_Coords.cpp ../ActionFrameCounter.h ../ArgList.h ../ArrayIterator.h ../AssociatedData.h ../Atom.h ../AtomMap.h ../AtomMask.h ../AtomType.h ../BaseIOtype.h ../Box.h ../Constants.h ../CoordinateInfo.h ../CpptrajFile.h ../CpptrajStdio.h ../DataSet.h ../DataSetList.h ../DataSet_Coords.h ../DataSet_Coords_REF.h ../Dimension.h ../FileIO.h ../FileName.h ../FileTypes.h ../Frame.h ../Hungarian.h ../MapAtom.h ../MaskToken.h ../Matrix.h ../Matrix_3x3.h ../MetaData.h ../Molecule.h ../NameType.h ../OutputTrajCommon.h ../Parallel.h ../ParameterHolders.h ../ParameterSet.h ../ParameterTypes.h ../Range.h ../ReferenceFrame.h ../ReplicaDimArray.h ../Residue.h ../Segment.h ../StringRoutines.h ../SymbolExporting.h ../SymmetricRmsdCalc.h ../TextFormat.h ../Timer.h ../Topology.h ../TrajectoryFile.h ../Trajout_Single.h ../TypeNameHolder.h ../Unit.h ../Vec3.h CentroidArray.h Cframes.h List.h Metric.h MetricArray.h Metric_DME.h Metric_RMS.h Metric_SRMSD.h Node.h Results.h Results_Coords.h Sieve.o : Sieve.cpp ../AssociatedData.h ../Cluster/Cframes.h ../CpptrajFile.h ../DataSet.h ../DataSet_PairwiseCache.h ../Dimension.h ../FileIO.h ../FileName.h ../MetaData.h ../Parallel.h ../Random.h ../Range.h ../TextFormat.h Cframes.h Sieve.h +Silhouette.o : Silhouette.cpp ../CpptrajFile.h ../CpptrajStdio.h ../FileIO.h ../FileName.h ../Parallel.h CentroidArray.h Cframes.h List.h Metric.h MetricArray.h Node.h Silhouette.h diff --git a/src/Cluster/clusterfiles b/src/Cluster/clusterfiles index fbdd6c0e17..c1f870778a 100644 --- a/src/Cluster/clusterfiles +++ b/src/Cluster/clusterfiles @@ -24,5 +24,5 @@ CLUSTER_SOURCES= \ Node.cpp \ Output.cpp \ Results_Coords.cpp \ - Sieve.cpp - + Sieve.cpp \ + Silhouette.cpp diff --git a/src/Frame.cpp b/src/Frame.cpp index 32dbd0aa50..abd8d91c1d 100644 --- a/src/Frame.cpp +++ b/src/Frame.cpp @@ -896,6 +896,7 @@ int Frame::Divide(Frame const& dividend, double divisor) { */ void Frame::Divide(double divisor) { if (divisor < Constants::SMALL) { + //mprintf("DEBUG: Frame::Divide(divisor): Detected divide by 0.\n"); mprinterr("Error: Frame::Divide(divisor): Detected divide by 0.\n"); return; } diff --git a/src/Version.h b/src/Version.h index e6a033fb95..5ad6d1e7b5 100644 --- a/src/Version.h +++ b/src/Version.h @@ -12,7 +12,7 @@ * Whenever a number that precedes is incremented, all subsequent * numbers should be reset to 0. */ -#define CPPTRAJ_INTERNAL_VERSION "V6.1.1" +#define CPPTRAJ_INTERNAL_VERSION "V6.2.0" /// PYTRAJ relies on this #define CPPTRAJ_VERSION_STRING CPPTRAJ_INTERNAL_VERSION #endif diff --git a/test/Test_Cluster/mysil.frame.dat.save b/test/Test_Cluster/mysil.frame.dat.save index e40f666188..1472223bf5 100644 --- a/test/Test_Cluster/mysil.frame.dat.save +++ b/test/Test_Cluster/mysil.frame.dat.save @@ -48,72 +48,72 @@ 46 0.330269 #C1 Silhouette - 48 0.319039 - 49 0.319785 - 50 0.348833 - 51 0.358286 - 52 0.384205 - 53 0.397178 - 54 0.428314 - 55 0.441785 - 56 0.44628 - 57 0.446432 - 58 0.44892 - 59 0.465936 - 60 0.472731 - 61 0.473003 - 62 0.48261 - 63 0.486779 - 64 0.491815 - 65 0.493443 - 66 0.49616 - 67 0.496202 - 68 0.506839 - 69 0.523232 - 70 0.523974 - 71 0.524305 + 47 0.319039 + 48 0.319785 + 49 0.348833 + 50 0.358286 + 51 0.384205 + 52 0.397178 + 53 0.428314 + 54 0.441785 + 55 0.44628 + 56 0.446432 + 57 0.44892 + 58 0.465936 + 59 0.472731 + 60 0.473003 + 61 0.48261 + 62 0.486779 + 63 0.491815 + 64 0.493443 + 65 0.49616 + 66 0.496202 + 67 0.506839 + 68 0.523232 + 69 0.523974 + 70 0.524305 #C2 Silhouette - 73 -0.0273079 - 74 0.143221 - 75 0.143939 - 76 0.179135 - 77 0.198277 - 78 0.217643 - 79 0.240545 - 80 0.25015 - 81 0.253356 - 82 0.28631 - 83 0.291313 - 84 0.329007 + 71 -0.0273079 + 72 0.143221 + 73 0.143939 + 74 0.179135 + 75 0.198277 + 76 0.217643 + 77 0.240545 + 78 0.25015 + 79 0.253356 + 80 0.28631 + 81 0.291313 + 82 0.329007 #C3 Silhouette - 86 0.181779 - 87 0.183371 - 88 0.259752 - 89 0.264702 - 90 0.287487 - 91 0.32422 + 83 0.181779 + 84 0.183371 + 85 0.259752 + 86 0.264702 + 87 0.287487 + 88 0.32422 #C4 Silhouette - 93 0.0272266 - 94 0.176844 - 95 0.208565 - 96 0.353716 - 97 0.40685 + 89 0.0272266 + 90 0.176844 + 91 0.208565 + 92 0.353716 + 93 0.40685 #C5 Silhouette - 99 0.101416 - 100 0.195411 + 94 0.101416 + 95 0.195411 #C6 Silhouette - 102 0.100473 - 103 0.162116 + 96 0.100473 + 97 0.162116 #C7 Silhouette - 105 0.582066 - 106 0.589914 + 98 0.582066 + 99 0.589914 #C8 Silhouette - 108 1 + 100 1