From e0e94f8722140c9a7e97aa5ebd6c985e37c8f8c8 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Mon, 24 Jan 2022 12:54:32 -0500 Subject: [PATCH 01/23] Start experimental command to compare clusters --- src/Command.cpp | 2 ++ src/Exec_CompareClusters.cpp | 42 ++++++++++++++++++++++++++++++++++++ src/Exec_CompareClusters.h | 14 ++++++++++++ src/cpptrajdepend | 5 +++-- src/cpptrajfiles | 1 + 5 files changed, 62 insertions(+), 2 deletions(-) create mode 100644 src/Exec_CompareClusters.cpp create mode 100644 src/Exec_CompareClusters.h diff --git a/src/Command.cpp b/src/Command.cpp index 4854391fc5..96306da432 100644 --- a/src/Command.cpp +++ b/src/Command.cpp @@ -37,6 +37,7 @@ #include "Exec_Set.h" #include "Exec_Show.h" #include "Exec_Random.h" +#include "Exec_CompareClusters.h" // ----- SYSTEM ---------------------------------------------------------------- #include "Exec_System.h" // ----- COORDS ---------------------------------------------------------------- @@ -213,6 +214,7 @@ void Command::Init() { Command::AddCmd( new Exec_Calc(), Cmd::EXE, 1, "calc" ); Command::AddCmd( new Exec_Clear(), Cmd::EXE, 1, "clear" ); Command::AddCmd( new Exec_ClusterMap(), Cmd::EXE, 1, "clustermap" ); // hidden + Command::AddCmd( new Exec_CompareClusters(), Cmd::EXE, 1, "compareclusters" ); //hidden Command::AddCmd( new Exec_CreateDataFile(), Cmd::EXE, 1, "create" ); Command::AddCmd( new Exec_CreateSet(), Cmd::EXE, 1, "createset" ); Command::AddCmd( new Exec_DataFileCmd(), Cmd::EXE, 1, "datafile" ); diff --git a/src/Exec_CompareClusters.cpp b/src/Exec_CompareClusters.cpp new file mode 100644 index 0000000000..18f5e4178a --- /dev/null +++ b/src/Exec_CompareClusters.cpp @@ -0,0 +1,42 @@ +#include "Exec_CompareClusters.h" +#include "CpptrajStdio.h" + +/** CONSTRUCTOR. */ +Exec_CompareClusters::Exec_CompareClusters() : + Exec(GENERAL) +{ + SetHidden( true ); +} + +// Exec_CompareClusters::Help() +void Exec_CompareClusters::Help() const +{ + +} + +/** Get cluster number vs time data set. */ +DataSet* Exec_CompareClusters::getClusterSet(std::string const& dsname, DataSetList const& DSL) { + DataSet* ds = DSL.GetDataSet( dsname ); + if (ds == 0) { + mprinterr("Error: '%s' does not correspond to a data set.\n", dsname.c_str()); + return 0; + } + if (ds->Group() != DataSet::SCALAR_1D) { + mprinterr("Error: '%s' is not a 1D scalar data set.\n", dsname.c_str()); + return 0; + } + return ds; +} + +// Exec_CompareClusters::Execute() +Exec::RetType Exec_CompareClusters::Execute(CpptrajState& State, ArgList& argIn) +{ + DataSet* c0 = getClusterSet( argIn.GetStringKey("set"), State.DSL() ); + if (c0 == 0) return CpptrajState::ERR; + DataSet* c1 = getClusterSet( argIn.GetStringKey("set"), State.DSL() ); + if (c1 == 0) return CpptrajState::ERR; + mprintf("\tCluster set 0: '%s'\n", c0->legend()); + mprintf("\tCluster set 1: '%s'\n", c1->legend()); + + return CpptrajState::OK; +} diff --git a/src/Exec_CompareClusters.h b/src/Exec_CompareClusters.h new file mode 100644 index 0000000000..29874fcfc1 --- /dev/null +++ b/src/Exec_CompareClusters.h @@ -0,0 +1,14 @@ +#ifndef INC_EXEC_COMPARECLUSTERS_H +#define INC_EXEC_COMPARECLUSTERS_H +#include "Exec.h" +/// Compare two cluster number vs time sets +class Exec_CompareClusters : public Exec { + public: + Exec_CompareClusters(); + void Help() const; + DispatchObject* Alloc() const { return (DispatchObject*)new Exec_CompareClusters(); } + RetType Execute(CpptrajState&, ArgList&); + private: + static DataSet* getClusterSet(std::string const&, DataSetList const&); +}; +#endif diff --git a/src/cpptrajdepend b/src/cpptrajdepend index f32546bd69..3597353744 100644 --- a/src/cpptrajdepend +++ b/src/cpptrajdepend @@ -92,7 +92,7 @@ AnalysisList.o : AnalysisList.cpp ActionState.h Analysis.h AnalysisList.h Analys Analysis_AmdBias.o : Analysis_AmdBias.cpp ActionState.h Analysis.h AnalysisState.h Analysis_AmdBias.h ArgList.h AssociatedData.h Atom.h AtomMask.h AtomType.h BaseIOtype.h Box.h Constants.h CoordinateInfo.h CpptrajFile.h CpptrajStdio.h DataFile.h DataFileList.h DataSet.h DataSetList.h DataSet_1D.h DataSet_Coords.h DataSet_Coords_REF.h DataSet_double.h Dimension.h DispatchObject.h FileIO.h FileName.h FileTypes.h Frame.h MaskToken.h Matrix_3x3.h MetaData.h Molecule.h NameType.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h Range.h ReferenceFrame.h ReplicaDimArray.h Residue.h Segment.h SymbolExporting.h TextFormat.h Timer.h Topology.h TypeNameHolder.h Unit.h Vec3.h Analysis_AutoCorr.o : Analysis_AutoCorr.cpp ActionState.h Analysis.h AnalysisState.h Analysis_AutoCorr.h ArgList.h ArrayIterator.h AssociatedData.h Atom.h AtomMask.h AtomType.h BaseIOtype.h Box.h ComplexArray.h Constants.h CoordinateInfo.h CpptrajFile.h CpptrajStdio.h DataFile.h DataFileList.h DataSet.h DataSetList.h DataSet_1D.h DataSet_Coords.h DataSet_Coords_REF.h DataSet_Vector.h Dimension.h DispatchObject.h FileIO.h FileName.h FileTypes.h Frame.h MaskToken.h Matrix_3x3.h MetaData.h Molecule.h NameType.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h Range.h ReferenceFrame.h ReplicaDimArray.h Residue.h Segment.h SymbolExporting.h TextFormat.h Timer.h Topology.h TypeNameHolder.h Unit.h Vec3.h Analysis_Average.o : Analysis_Average.cpp ActionState.h Analysis.h AnalysisState.h Analysis_Average.h ArgList.h Array1D.h AssociatedData.h Atom.h AtomMask.h AtomType.h BaseIOtype.h Box.h Constants.h CoordinateInfo.h CpptrajFile.h CpptrajStdio.h DataFile.h DataFileList.h DataSet.h DataSetList.h DataSet_1D.h DataSet_Coords.h DataSet_Coords_REF.h Dimension.h DispatchObject.h FileIO.h FileName.h FileTypes.h Frame.h MaskToken.h Matrix_3x3.h MetaData.h Molecule.h NameType.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h Range.h ReferenceFrame.h ReplicaDimArray.h Residue.h Segment.h SymbolExporting.h TextFormat.h Timer.h Topology.h TypeNameHolder.h Unit.h Vec3.h -Analysis_Clustering.o : Analysis_Clustering.cpp ActionState.h Analysis.h AnalysisState.h Analysis_Clustering.h ArgList.h AssociatedData.h Atom.h AtomMask.h AtomType.h BaseIOtype.h Box.h Cluster/Algorithm.h Cluster/BestReps.h Cluster/CentroidArray.h Cluster/Cframes.h Cluster/Control.h Cluster/DrawGraph.h Cluster/List.h Cluster/Metric.h Cluster/MetricArray.h Cluster/Node.h Cluster/Sieve.h Constants.h CoordinateInfo.h CpptrajFile.h CpptrajStdio.h DataFile.h DataFileList.h DataSet.h DataSetList.h DataSet_Coords.h DataSet_Coords_REF.h Dimension.h DispatchObject.h FileIO.h FileName.h FileTypes.h Frame.h MaskToken.h Matrix_3x3.h MetaData.h Molecule.h NameType.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h Range.h ReferenceFrame.h ReplicaDimArray.h Residue.h Segment.h SymbolExporting.h TextFormat.h Timer.h Topology.h TypeNameHolder.h Unit.h Vec3.h +Analysis_Clustering.o : Analysis_Clustering.cpp ActionState.h Analysis.h AnalysisState.h Analysis_Clustering.h ArgList.h AssociatedData.h Atom.h AtomMask.h AtomType.h BaseIOtype.h Box.h Cluster/Algorithm.h Cluster/BestReps.h Cluster/CentroidArray.h Cluster/Cframes.h Cluster/Control.h Cluster/DrawGraph.h Cluster/List.h Cluster/Metric.h Cluster/MetricArray.h Cluster/Node.h Cluster/Sieve.h Cluster/Silhouette.h Constants.h CoordinateInfo.h CpptrajFile.h CpptrajStdio.h DataFile.h DataFileList.h DataSet.h DataSetList.h DataSet_Coords.h DataSet_Coords_REF.h Dimension.h DispatchObject.h FileIO.h FileName.h FileTypes.h Frame.h MaskToken.h Matrix_3x3.h MetaData.h Molecule.h NameType.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h Range.h ReferenceFrame.h ReplicaDimArray.h Residue.h Segment.h SymbolExporting.h TextFormat.h Timer.h Topology.h TypeNameHolder.h Unit.h Vec3.h Analysis_ConstantPHStats.o : Analysis_ConstantPHStats.cpp ActionState.h Analysis.h AnalysisState.h Analysis_ConstantPHStats.h ArgList.h AssociatedData.h Atom.h AtomMask.h AtomType.h BaseIOtype.h Box.h Constants.h CoordinateInfo.h Cph.h CpptrajFile.h CpptrajStdio.h DataFile.h DataFileList.h DataSet.h DataSetList.h DataSet_1D.h DataSet_Coords.h DataSet_Coords_REF.h DataSet_Mesh.h DataSet_pH.h Dimension.h DispatchObject.h FileIO.h FileName.h FileTypes.h Frame.h MaskToken.h Matrix_3x3.h MetaData.h Molecule.h NameType.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h Range.h ReferenceFrame.h ReplicaDimArray.h Residue.h Segment.h Spline.h StringRoutines.h SymbolExporting.h TextFormat.h Timer.h Topology.h TypeNameHolder.h Unit.h Vec3.h Analysis_Corr.o : Analysis_Corr.cpp ActionState.h Analysis.h AnalysisState.h Analysis_Corr.h ArgList.h ArrayIterator.h AssociatedData.h Atom.h AtomMask.h AtomType.h BaseIOtype.h Box.h ComplexArray.h Constants.h CoordinateInfo.h CpptrajFile.h CpptrajStdio.h DataFile.h DataFileList.h DataSet.h DataSetList.h DataSet_1D.h DataSet_Coords.h DataSet_Coords_REF.h DataSet_Vector.h Dimension.h DispatchObject.h FileIO.h FileName.h FileTypes.h Frame.h MaskToken.h Matrix_3x3.h MetaData.h Molecule.h NameType.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h Range.h ReferenceFrame.h ReplicaDimArray.h Residue.h Segment.h SymbolExporting.h TextFormat.h Timer.h Topology.h TypeNameHolder.h Unit.h Vec3.h Analysis_CrankShaft.o : Analysis_CrankShaft.cpp ActionState.h Analysis.h AnalysisState.h Analysis_CrankShaft.h Analysis_Statistics.h ArgList.h Array1D.h AssociatedData.h Atom.h AtomMask.h AtomType.h BaseIOtype.h Box.h Constants.h CoordinateInfo.h CpptrajFile.h CpptrajStdio.h DataFile.h DataFileList.h DataSet.h DataSetList.h DataSet_1D.h DataSet_Coords.h DataSet_Coords_REF.h DataSet_float.h DataSet_integer.h DataSet_string.h Dimension.h DispatchObject.h FileIO.h FileName.h FileTypes.h Frame.h MaskToken.h Matrix_3x3.h MetaData.h Molecule.h NameType.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h Range.h ReferenceFrame.h ReplicaDimArray.h Residue.h Segment.h SymbolExporting.h TextFormat.h Timer.h Topology.h TypeNameHolder.h Unit.h Vec3.h @@ -150,7 +150,7 @@ ClusterMap.o : ClusterMap.cpp AssociatedData.h ClusterMap.h Constants.h CpptrajF Cmd.o : Cmd.cpp Cmd.h DispatchObject.h CmdInput.o : CmdInput.cpp CmdInput.h StringRoutines.h CmdList.o : CmdList.cpp Cmd.h CmdList.h DispatchObject.h -Command.o : Command.cpp Action.h ActionFrameCounter.h ActionList.h ActionState.h ActionTopWriter.h Action_Align.h Action_Angle.h Action_AreaPerMol.h Action_AtomMap.h Action_AtomicCorr.h Action_AtomicFluct.h Action_AutoImage.h Action_Average.h Action_Bounds.h Action_Box.h Action_Center.h Action_Channel.h Action_CheckChirality.h Action_CheckStructure.h Action_Closest.h Action_ClusterDihedral.h Action_Contacts.h Action_CreateCrd.h Action_CreateReservoir.h Action_DNAionTracker.h Action_DSSP.h Action_Density.h Action_Diffusion.h Action_Dihedral.h Action_DihedralRMS.h Action_Dipole.h Action_DistRmsd.h Action_Distance.h Action_Energy.h Action_Esander.h Action_FilterByData.h Action_FixAtomOrder.h Action_FixImagedBonds.h Action_GIST.h Action_Grid.h Action_GridFreeEnergy.h Action_HydrogenBond.h Action_Image.h Action_InfraredSpectrum.h Action_Jcoupling.h Action_LESsplit.h Action_LIE.h Action_LipidOrder.h Action_MakeStructure.h Action_Mask.h Action_Matrix.h Action_MinImage.h Action_Molsurf.h Action_MultiDihedral.h Action_MultiPucker.h Action_MultiVector.h Action_NAstruct.h Action_NMRrst.h Action_NativeContacts.h Action_OrderParameter.h Action_Outtraj.h Action_PairDist.h Action_Pairwise.h Action_Principal.h Action_Projection.h Action_Pucker.h Action_Radgyr.h Action_Radial.h Action_RandomizeIons.h Action_Remap.h Action_ReplicateCell.h Action_Rmsd.h Action_Rotate.h Action_RunningAvg.h Action_STFC_Diffusion.h Action_Scale.h Action_SetVelocity.h Action_Spam.h Action_Strip.h Action_Surf.h Action_SymmetricRmsd.h Action_Temperature.h Action_Time.h Action_Translate.h Action_Unstrip.h Action_Unwrap.h Action_Vector.h Action_VelocityAutoCorr.h Action_Volmap.h Action_Volume.h Action_Watershell.h Action_XtalSymm.h Analysis.h AnalysisList.h AnalysisState.h Analysis_AmdBias.h Analysis_AutoCorr.h Analysis_Average.h Analysis_Clustering.h Analysis_ConstantPHStats.h Analysis_Corr.h Analysis_CrankShaft.h Analysis_CrdFluct.h Analysis_CrossCorr.h Analysis_CurveFit.h Analysis_Divergence.h Analysis_EvalPlateau.h Analysis_FFT.h Analysis_HausdorffDistance.h Analysis_Hist.h Analysis_IRED.h Analysis_Integrate.h Analysis_KDE.h Analysis_Lifetime.h Analysis_LowestCurve.h Analysis_Matrix.h Analysis_MeltCurve.h Analysis_Modes.h Analysis_MultiHist.h Analysis_Multicurve.h Analysis_Overlap.h Analysis_PhiPsi.h Analysis_Regression.h Analysis_RemLog.h Analysis_Rms2d.h Analysis_RmsAvgCorr.h Analysis_Rotdif.h Analysis_RunningAvg.h Analysis_Slope.h Analysis_Spline.h Analysis_State.h Analysis_Statistics.h Analysis_TI.h Analysis_Timecorr.h Analysis_VectorMath.h Analysis_Wavelet.h ArgList.h Array1D.h ArrayIterator.h AssociatedData.h Atom.h AtomMap.h AtomMask.h AtomType.h AxisType.h BaseIOtype.h Box.h BoxArgs.h BufferedLine.h CharMask.h Cluster/Algorithm.h Cluster/BestReps.h Cluster/CentroidArray.h Cluster/Cframes.h Cluster/Control.h Cluster/DrawGraph.h Cluster/List.h Cluster/Metric.h Cluster/MetricArray.h Cluster/Node.h Cluster/Sieve.h ClusterMap.h Cmd.h CmdInput.h CmdList.h Command.h CompactFrameArray.h ComplexArray.h Constants.h Constraints.h ControlBlock.h ControlBlock_For.h CoordinateInfo.h Corr.h Cph.h CpptrajFile.h CpptrajState.h CpptrajStdio.h DataFile.h DataFileList.h DataFilter.h DataSet.h DataSetList.h DataSet_1D.h DataSet_2D.h DataSet_3D.h DataSet_Coords.h DataSet_Coords_CRD.h DataSet_Coords_REF.h DataSet_GridFlt.h DataSet_Mat3x3.h DataSet_MatrixDbl.h DataSet_MatrixFlt.h DataSet_Mesh.h DataSet_Modes.h DataSet_RemLog.h DataSet_Vector.h DataSet_double.h DataSet_float.h DataSet_integer.h DataSet_integer_mem.h DataSet_pH.h DataSet_string.h Deprecated.h DihedralSearch.h Dimension.h DispatchObject.h Energy.h Energy_Sander.h EnsembleIn.h EnsembleOutList.h Ewald.h EwaldOptions.h Ewald_ParticleMesh.h ExclusionArray.h Exec.h Exec_AddMissingRes.h Exec_Analyze.h Exec_Calc.h Exec_CatCrd.h Exec_Change.h Exec_ClusterMap.h Exec_CombineCoords.h Exec_Commands.h Exec_CompareTop.h Exec_CrdAction.h Exec_CrdOut.h Exec_CreateSet.h Exec_DataFile.h Exec_DataFilter.h Exec_DataSetCmd.h Exec_Emin.h Exec_Flatten.h Exec_GenerateAmberRst.h Exec_Graft.h Exec_Help.h Exec_LoadCrd.h Exec_LoadTraj.h Exec_ParallelAnalysis.h Exec_ParmBox.h Exec_ParmSolvent.h Exec_ParmStrip.h Exec_ParmWrite.h Exec_PermuteDihedrals.h Exec_Precision.h Exec_PrepareForLeap.h Exec_PrintData.h Exec_Random.h Exec_ReadData.h Exec_ReadEnsembleData.h Exec_ReadInput.h Exec_RotateDihedral.h Exec_RunAnalysis.h Exec_ScaleDihedralK.h Exec_SequenceAlign.h Exec_Set.h Exec_Show.h Exec_SortEnsembleData.h Exec_SplitCoords.h Exec_System.h Exec_Top.h Exec_Traj.h Exec_UpdateParameters.h Exec_ViewRst.h FileIO.h FileName.h FileTypes.h Frame.h FramePtrArray.h GIST_PME.h Grid.h GridAction.h GridBin.h HistBin.h Hungarian.h ImageOption.h ImageTypes.h InputTrajCommon.h MapAtom.h MaskArray.h MaskToken.h Matrix.h Matrix_3x3.h MetaData.h Molecule.h NameType.h NetcdfFile.h OnlineVarT.h OutputTrajCommon.h PDBfile.h PairList.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h PubFFT.h Pucker.h Pucker_PuckerMask.h Pucker_PuckerSearch.h Pucker_PuckerToken.h RPNcalc.h Random.h Range.h ReferenceAction.h ReferenceFrame.h RemdReservoirNC.h ReplicaDimArray.h ReplicaInfo.h Residue.h Segment.h Spline.h SplineFxnTable.h StructureCheck.h SymbolExporting.h SymmetricRmsdCalc.h TextFormat.h Timer.h Topology.h TrajFrameCounter.h TrajectoryFile.h Trajin.h TrajinList.h TrajoutList.h Trajout_Single.h TypeNameHolder.h Unit.h Vec3.h cuda_kernels/GistCudaSetup.cuh helpme_standalone.h molsurf.h +Command.o : Command.cpp Action.h ActionFrameCounter.h ActionList.h ActionState.h ActionTopWriter.h Action_Align.h Action_Angle.h Action_AreaPerMol.h Action_AtomMap.h Action_AtomicCorr.h Action_AtomicFluct.h Action_AutoImage.h Action_Average.h Action_Bounds.h Action_Box.h Action_Center.h Action_Channel.h Action_CheckChirality.h Action_CheckStructure.h Action_Closest.h Action_ClusterDihedral.h Action_Contacts.h Action_CreateCrd.h Action_CreateReservoir.h Action_DNAionTracker.h Action_DSSP.h Action_Density.h Action_Diffusion.h Action_Dihedral.h Action_DihedralRMS.h Action_Dipole.h Action_DistRmsd.h Action_Distance.h Action_Energy.h Action_Esander.h Action_FilterByData.h Action_FixAtomOrder.h Action_FixImagedBonds.h Action_GIST.h Action_Grid.h Action_GridFreeEnergy.h Action_HydrogenBond.h Action_Image.h Action_InfraredSpectrum.h Action_Jcoupling.h Action_LESsplit.h Action_LIE.h Action_LipidOrder.h Action_MakeStructure.h Action_Mask.h Action_Matrix.h Action_MinImage.h Action_Molsurf.h Action_MultiDihedral.h Action_MultiPucker.h Action_MultiVector.h Action_NAstruct.h Action_NMRrst.h Action_NativeContacts.h Action_OrderParameter.h Action_Outtraj.h Action_PairDist.h Action_Pairwise.h Action_Principal.h Action_Projection.h Action_Pucker.h Action_Radgyr.h Action_Radial.h Action_RandomizeIons.h Action_Remap.h Action_ReplicateCell.h Action_Rmsd.h Action_Rotate.h Action_RunningAvg.h Action_STFC_Diffusion.h Action_Scale.h Action_SetVelocity.h Action_Spam.h Action_Strip.h Action_Surf.h Action_SymmetricRmsd.h Action_Temperature.h Action_Time.h Action_Translate.h Action_Unstrip.h Action_Unwrap.h Action_Vector.h Action_VelocityAutoCorr.h Action_Volmap.h Action_Volume.h Action_Watershell.h Action_XtalSymm.h Analysis.h AnalysisList.h AnalysisState.h Analysis_AmdBias.h Analysis_AutoCorr.h Analysis_Average.h Analysis_Clustering.h Analysis_ConstantPHStats.h Analysis_Corr.h Analysis_CrankShaft.h Analysis_CrdFluct.h Analysis_CrossCorr.h Analysis_CurveFit.h Analysis_Divergence.h Analysis_EvalPlateau.h Analysis_FFT.h Analysis_HausdorffDistance.h Analysis_Hist.h Analysis_IRED.h Analysis_Integrate.h Analysis_KDE.h Analysis_Lifetime.h Analysis_LowestCurve.h Analysis_Matrix.h Analysis_MeltCurve.h Analysis_Modes.h Analysis_MultiHist.h Analysis_Multicurve.h Analysis_Overlap.h Analysis_PhiPsi.h Analysis_Regression.h Analysis_RemLog.h Analysis_Rms2d.h Analysis_RmsAvgCorr.h Analysis_Rotdif.h Analysis_RunningAvg.h Analysis_Slope.h Analysis_Spline.h Analysis_State.h Analysis_Statistics.h Analysis_TI.h Analysis_Timecorr.h Analysis_VectorMath.h Analysis_Wavelet.h ArgList.h Array1D.h ArrayIterator.h AssociatedData.h Atom.h AtomMap.h AtomMask.h AtomType.h AxisType.h BaseIOtype.h Box.h BoxArgs.h BufferedLine.h CharMask.h Cluster/Algorithm.h Cluster/BestReps.h Cluster/CentroidArray.h Cluster/Cframes.h Cluster/Control.h Cluster/DrawGraph.h Cluster/List.h Cluster/Metric.h Cluster/MetricArray.h Cluster/Node.h Cluster/Sieve.h Cluster/Silhouette.h ClusterMap.h Cmd.h CmdInput.h CmdList.h Command.h CompactFrameArray.h ComplexArray.h Constants.h Constraints.h ControlBlock.h ControlBlock_For.h CoordinateInfo.h Corr.h Cph.h CpptrajFile.h CpptrajState.h CpptrajStdio.h DataFile.h DataFileList.h DataFilter.h DataSet.h DataSetList.h DataSet_1D.h DataSet_2D.h DataSet_3D.h DataSet_Coords.h DataSet_Coords_CRD.h DataSet_Coords_REF.h DataSet_GridFlt.h DataSet_Mat3x3.h DataSet_MatrixDbl.h DataSet_MatrixFlt.h DataSet_Mesh.h DataSet_Modes.h DataSet_RemLog.h DataSet_Vector.h DataSet_double.h DataSet_float.h DataSet_integer.h DataSet_integer_mem.h DataSet_pH.h DataSet_string.h Deprecated.h DihedralSearch.h Dimension.h DispatchObject.h Energy.h Energy_Sander.h EnsembleIn.h EnsembleOutList.h Ewald.h EwaldOptions.h Ewald_ParticleMesh.h ExclusionArray.h Exec.h Exec_AddMissingRes.h Exec_Analyze.h Exec_Calc.h Exec_CatCrd.h Exec_Change.h Exec_ClusterMap.h Exec_CombineCoords.h Exec_Commands.h Exec_CompareClusters.h Exec_CompareTop.h Exec_CrdAction.h Exec_CrdOut.h Exec_CreateSet.h Exec_DataFile.h Exec_DataFilter.h Exec_DataSetCmd.h Exec_Emin.h Exec_Flatten.h Exec_GenerateAmberRst.h Exec_Graft.h Exec_Help.h Exec_LoadCrd.h Exec_LoadTraj.h Exec_ParallelAnalysis.h Exec_ParmBox.h Exec_ParmSolvent.h Exec_ParmStrip.h Exec_ParmWrite.h Exec_PermuteDihedrals.h Exec_Precision.h Exec_PrepareForLeap.h Exec_PrintData.h Exec_Random.h Exec_ReadData.h Exec_ReadEnsembleData.h Exec_ReadInput.h Exec_RotateDihedral.h Exec_RunAnalysis.h Exec_ScaleDihedralK.h Exec_SequenceAlign.h Exec_Set.h Exec_Show.h Exec_SortEnsembleData.h Exec_SplitCoords.h Exec_System.h Exec_Top.h Exec_Traj.h Exec_UpdateParameters.h Exec_ViewRst.h FileIO.h FileName.h FileTypes.h Frame.h FramePtrArray.h GIST_PME.h Grid.h GridAction.h GridBin.h HistBin.h Hungarian.h ImageOption.h ImageTypes.h InputTrajCommon.h MapAtom.h MaskArray.h MaskToken.h Matrix.h Matrix_3x3.h MetaData.h Molecule.h NameType.h NetcdfFile.h OnlineVarT.h OutputTrajCommon.h PDBfile.h PairList.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h PubFFT.h Pucker.h Pucker_PuckerMask.h Pucker_PuckerSearch.h Pucker_PuckerToken.h RPNcalc.h Random.h Range.h ReferenceAction.h ReferenceFrame.h RemdReservoirNC.h ReplicaDimArray.h ReplicaInfo.h Residue.h Segment.h Spline.h SplineFxnTable.h StructureCheck.h SymbolExporting.h SymmetricRmsdCalc.h TextFormat.h Timer.h Topology.h TrajFrameCounter.h TrajectoryFile.h Trajin.h TrajinList.h TrajoutList.h Trajout_Single.h TypeNameHolder.h Unit.h Vec3.h cuda_kernels/GistCudaSetup.cuh helpme_standalone.h molsurf.h CompactFrameArray.o : CompactFrameArray.cpp Box.h CompactFrameArray.h CoordinateInfo.h CpptrajStdio.h Matrix_3x3.h Parallel.h ReplicaDimArray.h Vec3.h ComplexArray.o : ComplexArray.cpp ArrayIterator.h ComplexArray.h Constraints.o : Constraints.cpp ArgList.h Atom.h AtomMask.h AtomType.h Box.h CharMask.h Constants.h Constraints.h CoordinateInfo.h CpptrajStdio.h FileName.h Frame.h MaskToken.h Matrix_3x3.h Molecule.h NameType.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h Range.h ReplicaDimArray.h Residue.h Segment.h SymbolExporting.h Topology.h TypeNameHolder.h Unit.h Vec3.h @@ -247,6 +247,7 @@ Exec_Change.o : Exec_Change.cpp Action.h ActionList.h ActionState.h Analysis.h A Exec_ClusterMap.o : Exec_ClusterMap.cpp Action.h ActionList.h ActionState.h Analysis.h AnalysisList.h AnalysisState.h ArgList.h ArrayIterator.h AssociatedData.h Atom.h AtomMask.h AtomType.h BaseIOtype.h Box.h ClusterMap.h Constants.h CoordinateInfo.h CpptrajFile.h CpptrajState.h CpptrajStdio.h DataFile.h DataFileList.h DataSet.h DataSetList.h DataSet_2D.h DataSet_Coords.h DataSet_Coords_REF.h DataSet_MatrixFlt.h Dimension.h DispatchObject.h EnsembleIn.h EnsembleOutList.h Exec.h Exec_ClusterMap.h FileIO.h FileName.h FileTypes.h Frame.h FramePtrArray.h InputTrajCommon.h MaskToken.h Matrix.h Matrix_3x3.h MetaData.h Molecule.h NameType.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h Range.h ReferenceFrame.h ReplicaDimArray.h ReplicaInfo.h Residue.h Segment.h SymbolExporting.h TextFormat.h Timer.h Topology.h TrajFrameCounter.h Trajin.h TrajinList.h TrajoutList.h TypeNameHolder.h Unit.h Vec3.h Exec_CombineCoords.o : Exec_CombineCoords.cpp Action.h ActionList.h ActionState.h Analysis.h AnalysisList.h AnalysisState.h ArgList.h AssociatedData.h Atom.h AtomMask.h AtomType.h BaseIOtype.h Box.h Constants.h CoordinateInfo.h CpptrajFile.h CpptrajState.h CpptrajStdio.h DataFile.h DataFileList.h DataSet.h DataSetList.h DataSet_Coords.h DataSet_Coords_REF.h Dimension.h DispatchObject.h EnsembleIn.h EnsembleOutList.h Exec.h Exec_CombineCoords.h FileIO.h FileName.h FileTypes.h Frame.h FramePtrArray.h InputTrajCommon.h MaskToken.h Matrix_3x3.h MetaData.h Molecule.h NameType.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h Range.h ReferenceFrame.h ReplicaDimArray.h ReplicaInfo.h Residue.h Segment.h SymbolExporting.h TextFormat.h Timer.h Topology.h TrajFrameCounter.h Trajin.h TrajinList.h TrajoutList.h TypeNameHolder.h Unit.h Vec3.h Exec_Commands.o : Exec_Commands.cpp Action.h ActionList.h ActionState.h Analysis.h AnalysisList.h AnalysisState.h ArgList.h AssociatedData.h Atom.h AtomMask.h AtomType.h BaseIOtype.h Box.h Constants.h CoordinateInfo.h CpptrajFile.h CpptrajState.h CpptrajStdio.h DataFile.h DataFileList.h DataSet.h DataSetList.h DataSet_Coords.h DataSet_Coords_REF.h Dimension.h DispatchObject.h EnsembleIn.h EnsembleOutList.h Exec.h Exec_Commands.h FileIO.h FileName.h FileTypes.h Frame.h FramePtrArray.h InputTrajCommon.h MaskToken.h Matrix_3x3.h MetaData.h Molecule.h NameType.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h Range.h ReferenceFrame.h ReplicaDimArray.h ReplicaInfo.h Residue.h Segment.h SymbolExporting.h TextFormat.h Timer.h Topology.h TrajFrameCounter.h Trajin.h TrajinList.h TrajoutList.h TypeNameHolder.h Unit.h Vec3.h +Exec_CompareClusters.o : Exec_CompareClusters.cpp Action.h ActionList.h ActionState.h Analysis.h AnalysisList.h AnalysisState.h ArgList.h AssociatedData.h Atom.h AtomMask.h AtomType.h BaseIOtype.h Box.h Constants.h CoordinateInfo.h CpptrajFile.h CpptrajState.h CpptrajStdio.h DataFile.h DataFileList.h DataSet.h DataSetList.h DataSet_Coords.h DataSet_Coords_REF.h Dimension.h DispatchObject.h EnsembleIn.h EnsembleOutList.h Exec.h Exec_CompareClusters.h FileIO.h FileName.h FileTypes.h Frame.h FramePtrArray.h InputTrajCommon.h MaskToken.h Matrix_3x3.h MetaData.h Molecule.h NameType.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h Range.h ReferenceFrame.h ReplicaDimArray.h ReplicaInfo.h Residue.h Segment.h SymbolExporting.h TextFormat.h Timer.h Topology.h TrajFrameCounter.h Trajin.h TrajinList.h TrajoutList.h TypeNameHolder.h Unit.h Vec3.h Exec_CompareTop.o : Exec_CompareTop.cpp Action.h ActionList.h ActionState.h Analysis.h AnalysisList.h AnalysisState.h ArgList.h AssociatedData.h Atom.h AtomMask.h AtomType.h BaseIOtype.h Box.h Constants.h CoordinateInfo.h CpptrajFile.h CpptrajState.h CpptrajStdio.h DataFile.h DataFileList.h DataSet.h DataSetList.h DataSet_Coords.h DataSet_Coords_REF.h Dimension.h DispatchObject.h EnsembleIn.h EnsembleOutList.h Exec.h Exec_CompareTop.h FileIO.h FileName.h FileTypes.h Frame.h FramePtrArray.h InputTrajCommon.h MaskToken.h Matrix_3x3.h MetaData.h Molecule.h NameType.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h Range.h ReferenceFrame.h ReplicaDimArray.h ReplicaInfo.h Residue.h Segment.h SymbolExporting.h TextFormat.h Timer.h Topology.h TrajFrameCounter.h Trajin.h TrajinList.h TrajoutList.h TypeNameHolder.h Unit.h Vec3.h Exec_CrdAction.o : Exec_CrdAction.cpp Action.h ActionList.h ActionState.h Analysis.h AnalysisList.h AnalysisState.h ArgList.h AssociatedData.h Atom.h AtomMask.h AtomType.h BaseIOtype.h Box.h Cmd.h CmdList.h Command.h CompactFrameArray.h Constants.h CoordinateInfo.h CpptrajFile.h CpptrajState.h CpptrajStdio.h DataFile.h DataFileList.h DataSet.h DataSetList.h DataSet_Coords.h DataSet_Coords_CRD.h DataSet_Coords_REF.h Dimension.h DispatchObject.h EnsembleIn.h EnsembleOutList.h Exec.h Exec_CrdAction.h FileIO.h FileName.h FileTypes.h Frame.h FramePtrArray.h InputTrajCommon.h MaskToken.h Matrix_3x3.h MetaData.h Molecule.h NameType.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h ProgressBar.h Range.h ReferenceFrame.h ReplicaDimArray.h ReplicaInfo.h Residue.h Segment.h SymbolExporting.h TextFormat.h Timer.h Topology.h TrajFrameCounter.h Trajin.h TrajinList.h TrajoutList.h TypeNameHolder.h Unit.h Vec3.h Exec_CrdOut.o : Exec_CrdOut.cpp Action.h ActionFrameCounter.h ActionList.h ActionState.h Analysis.h AnalysisList.h AnalysisState.h ArgList.h AssociatedData.h Atom.h AtomMask.h AtomType.h BaseIOtype.h Box.h Constants.h CoordinateInfo.h CpptrajFile.h CpptrajState.h CpptrajStdio.h DataFile.h DataFileList.h DataSet.h DataSetList.h DataSet_Coords.h DataSet_Coords_REF.h Dimension.h DispatchObject.h EnsembleIn.h EnsembleOutList.h Exec.h Exec_CrdOut.h FileIO.h FileName.h FileTypes.h Frame.h FramePtrArray.h InputTrajCommon.h MaskToken.h Matrix_3x3.h MetaData.h Molecule.h NameType.h OutputTrajCommon.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h ProgressBar.h Range.h ReferenceFrame.h ReplicaDimArray.h ReplicaInfo.h Residue.h Segment.h SymbolExporting.h TextFormat.h Timer.h Topology.h TrajFrameCounter.h TrajectoryFile.h Trajin.h TrajinList.h TrajoutList.h Trajout_Single.h TypeNameHolder.h Unit.h Vec3.h diff --git a/src/cpptrajfiles b/src/cpptrajfiles index e3befea6c9..fa16387fa7 100644 --- a/src/cpptrajfiles +++ b/src/cpptrajfiles @@ -247,6 +247,7 @@ COMMON_SOURCES= \ Exec_ClusterMap.cpp \ Exec_CombineCoords.cpp \ Exec_Commands.cpp \ + Exec_CompareClusters.cpp \ Exec_CompareTop.cpp \ Exec_CrdAction.cpp \ Exec_CrdOut.cpp \ From 9a0d0c008ef227fc03ba70783a077b287fd855a4 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Mon, 24 Jan 2022 13:26:22 -0500 Subject: [PATCH 02/23] Report some stats --- src/Exec_CompareClusters.cpp | 104 +++++++++++++++++++++++++++++++++-- src/Exec_CompareClusters.h | 2 + src/cpptrajdepend | 2 +- 3 files changed, 101 insertions(+), 7 deletions(-) diff --git a/src/Exec_CompareClusters.cpp b/src/Exec_CompareClusters.cpp index 18f5e4178a..8377cf8938 100644 --- a/src/Exec_CompareClusters.cpp +++ b/src/Exec_CompareClusters.cpp @@ -1,5 +1,6 @@ #include "Exec_CompareClusters.h" #include "CpptrajStdio.h" +#include "DataSet_1D.h" /** CONSTRUCTOR. */ Exec_CompareClusters::Exec_CompareClusters() : @@ -8,12 +9,6 @@ Exec_CompareClusters::Exec_CompareClusters() : SetHidden( true ); } -// Exec_CompareClusters::Help() -void Exec_CompareClusters::Help() const -{ - -} - /** Get cluster number vs time data set. */ DataSet* Exec_CompareClusters::getClusterSet(std::string const& dsname, DataSetList const& DSL) { DataSet* ds = DSL.GetDataSet( dsname ); @@ -28,6 +23,101 @@ DataSet* Exec_CompareClusters::getClusterSet(std::string const& dsname, DataSetL return ds; } +/** For tracking same/different frames in clusters. */ +class ClusterStat { + public: + ClusterStat() : n_in_common_(0) {} + /// Both sets had frame assigned to same cluster + void FrameInCommon() { n_in_common_++; } + /// Increment counter for specified set only + void IncrementSet(unsigned int i) { + if (i >= n_in_set_.size()) + n_in_set_.resize(i + 1, 0); + n_in_set_[i]++; + } + /// Header to stdout + static void Header() { + mprintf("%-12s %12s %12s %12s\n", "#Cluster", "Common", "Set0", "Set1"); + } + /// Print to stdout + void Print(unsigned int cnum) { + if (n_in_set_.size() < 2) n_in_set_.resize(2, 0); + mprintf("%12i %12u %12u %12u\n", cnum, n_in_common_, n_in_set_[0], n_in_set_[1]); + } + private: + typedef std::vector Uarray; + + unsigned int n_in_common_; ///< Number of frames in common + Uarray n_in_set_; ///< Number of frames just in specified set +}; + +/** Compare cluster # vs time sets. */ +int Exec_CompareClusters::CompareClusters(DataSet* c0, DataSet* c1) const { + std::vector clusterStats; + + DataSet_1D const& set0 = static_cast( *c0 ); + DataSet_1D const& set1 = static_cast( *c1 ); + + unsigned int n0 = set0.Size(); + unsigned int n1 = set1.Size(); + + if (n0 != n1) { + mprintf("Warning: Size of '%s' (%u) != size of '%s' (%u)\n", + c0->legend(), n0, c1->legend(), n1); + } + + unsigned int noise0 = 0; + unsigned int noise1 = 0; + unsigned int idx0 = 0; + unsigned int idx1 = 0; + while (idx0 < n0 && idx1 < n1) + { + // TODO check for fractional component + int cnum0 = (int)set0.Dval(idx0); + int cnum1 = (int)set1.Dval(idx1); + + if (cnum0 == -1) noise0++; + if (cnum1 == -1) noise1++; + + if (cnum0 != -1 && cnum0 == cnum1) { + // This frame was assigned the same cluster in both sets + if ((unsigned int)cnum0 >= clusterStats.size()) + clusterStats.resize(cnum0+1); + clusterStats[cnum0].FrameInCommon(); + } else { + // This frame was assigned different clusters + if (cnum0 != -1) { + if ((unsigned int)cnum0 >= clusterStats.size()) + clusterStats.resize(cnum0+1); + clusterStats[cnum0].IncrementSet( 0 ); + } + if (cnum1 != -1) { + if ((unsigned int)cnum1 >= clusterStats.size()) + clusterStats.resize(cnum1+1); + clusterStats[cnum1].IncrementSet( 1 ); + } + } + + idx0++; + idx1++; + } + + mprintf("\tSet 0: %u noise frames\n", noise0); + mprintf("\tSet 1: %u noise frames\n", noise1); + + ClusterStat::Header(); + for (std::vector::iterator it = clusterStats.begin(); + it != clusterStats.end(); ++it) + it->Print(it - clusterStats.begin()); + return 0; +} + +// Exec_CompareClusters::Help() +void Exec_CompareClusters::Help() const +{ + mprintf("\tset set \n"); +} + // Exec_CompareClusters::Execute() Exec::RetType Exec_CompareClusters::Execute(CpptrajState& State, ArgList& argIn) { @@ -38,5 +128,7 @@ Exec::RetType Exec_CompareClusters::Execute(CpptrajState& State, ArgList& argIn) mprintf("\tCluster set 0: '%s'\n", c0->legend()); mprintf("\tCluster set 1: '%s'\n", c1->legend()); + if (CompareClusters(c0, c1)) return CpptrajState::ERR; + return CpptrajState::OK; } diff --git a/src/Exec_CompareClusters.h b/src/Exec_CompareClusters.h index 29874fcfc1..7662aca9ee 100644 --- a/src/Exec_CompareClusters.h +++ b/src/Exec_CompareClusters.h @@ -10,5 +10,7 @@ class Exec_CompareClusters : public Exec { RetType Execute(CpptrajState&, ArgList&); private: static DataSet* getClusterSet(std::string const&, DataSetList const&); + + int CompareClusters(DataSet*, DataSet*) const; }; #endif diff --git a/src/cpptrajdepend b/src/cpptrajdepend index 3597353744..17b07b6590 100644 --- a/src/cpptrajdepend +++ b/src/cpptrajdepend @@ -247,7 +247,7 @@ Exec_Change.o : Exec_Change.cpp Action.h ActionList.h ActionState.h Analysis.h A Exec_ClusterMap.o : Exec_ClusterMap.cpp Action.h ActionList.h ActionState.h Analysis.h AnalysisList.h AnalysisState.h ArgList.h ArrayIterator.h AssociatedData.h Atom.h AtomMask.h AtomType.h BaseIOtype.h Box.h ClusterMap.h Constants.h CoordinateInfo.h CpptrajFile.h CpptrajState.h CpptrajStdio.h DataFile.h DataFileList.h DataSet.h DataSetList.h DataSet_2D.h DataSet_Coords.h DataSet_Coords_REF.h DataSet_MatrixFlt.h Dimension.h DispatchObject.h EnsembleIn.h EnsembleOutList.h Exec.h Exec_ClusterMap.h FileIO.h FileName.h FileTypes.h Frame.h FramePtrArray.h InputTrajCommon.h MaskToken.h Matrix.h Matrix_3x3.h MetaData.h Molecule.h NameType.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h Range.h ReferenceFrame.h ReplicaDimArray.h ReplicaInfo.h Residue.h Segment.h SymbolExporting.h TextFormat.h Timer.h Topology.h TrajFrameCounter.h Trajin.h TrajinList.h TrajoutList.h TypeNameHolder.h Unit.h Vec3.h Exec_CombineCoords.o : Exec_CombineCoords.cpp Action.h ActionList.h ActionState.h Analysis.h AnalysisList.h AnalysisState.h ArgList.h AssociatedData.h Atom.h AtomMask.h AtomType.h BaseIOtype.h Box.h Constants.h CoordinateInfo.h CpptrajFile.h CpptrajState.h CpptrajStdio.h DataFile.h DataFileList.h DataSet.h DataSetList.h DataSet_Coords.h DataSet_Coords_REF.h Dimension.h DispatchObject.h EnsembleIn.h EnsembleOutList.h Exec.h Exec_CombineCoords.h FileIO.h FileName.h FileTypes.h Frame.h FramePtrArray.h InputTrajCommon.h MaskToken.h Matrix_3x3.h MetaData.h Molecule.h NameType.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h Range.h ReferenceFrame.h ReplicaDimArray.h ReplicaInfo.h Residue.h Segment.h SymbolExporting.h TextFormat.h Timer.h Topology.h TrajFrameCounter.h Trajin.h TrajinList.h TrajoutList.h TypeNameHolder.h Unit.h Vec3.h Exec_Commands.o : Exec_Commands.cpp Action.h ActionList.h ActionState.h Analysis.h AnalysisList.h AnalysisState.h ArgList.h AssociatedData.h Atom.h AtomMask.h AtomType.h BaseIOtype.h Box.h Constants.h CoordinateInfo.h CpptrajFile.h CpptrajState.h CpptrajStdio.h DataFile.h DataFileList.h DataSet.h DataSetList.h DataSet_Coords.h DataSet_Coords_REF.h Dimension.h DispatchObject.h EnsembleIn.h EnsembleOutList.h Exec.h Exec_Commands.h FileIO.h FileName.h FileTypes.h Frame.h FramePtrArray.h InputTrajCommon.h MaskToken.h Matrix_3x3.h MetaData.h Molecule.h NameType.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h Range.h ReferenceFrame.h ReplicaDimArray.h ReplicaInfo.h Residue.h Segment.h SymbolExporting.h TextFormat.h Timer.h Topology.h TrajFrameCounter.h Trajin.h TrajinList.h TrajoutList.h TypeNameHolder.h Unit.h Vec3.h -Exec_CompareClusters.o : Exec_CompareClusters.cpp Action.h ActionList.h ActionState.h Analysis.h AnalysisList.h AnalysisState.h ArgList.h AssociatedData.h Atom.h AtomMask.h AtomType.h BaseIOtype.h Box.h Constants.h CoordinateInfo.h CpptrajFile.h CpptrajState.h CpptrajStdio.h DataFile.h DataFileList.h DataSet.h DataSetList.h DataSet_Coords.h DataSet_Coords_REF.h Dimension.h DispatchObject.h EnsembleIn.h EnsembleOutList.h Exec.h Exec_CompareClusters.h FileIO.h FileName.h FileTypes.h Frame.h FramePtrArray.h InputTrajCommon.h MaskToken.h Matrix_3x3.h MetaData.h Molecule.h NameType.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h Range.h ReferenceFrame.h ReplicaDimArray.h ReplicaInfo.h Residue.h Segment.h SymbolExporting.h TextFormat.h Timer.h Topology.h TrajFrameCounter.h Trajin.h TrajinList.h TrajoutList.h TypeNameHolder.h Unit.h Vec3.h +Exec_CompareClusters.o : Exec_CompareClusters.cpp Action.h ActionList.h ActionState.h Analysis.h AnalysisList.h AnalysisState.h ArgList.h AssociatedData.h Atom.h AtomMask.h AtomType.h BaseIOtype.h Box.h Constants.h CoordinateInfo.h CpptrajFile.h CpptrajState.h CpptrajStdio.h DataFile.h DataFileList.h DataSet.h DataSetList.h DataSet_1D.h DataSet_Coords.h DataSet_Coords_REF.h Dimension.h DispatchObject.h EnsembleIn.h EnsembleOutList.h Exec.h Exec_CompareClusters.h FileIO.h FileName.h FileTypes.h Frame.h FramePtrArray.h InputTrajCommon.h MaskToken.h Matrix_3x3.h MetaData.h Molecule.h NameType.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h Range.h ReferenceFrame.h ReplicaDimArray.h ReplicaInfo.h Residue.h Segment.h SymbolExporting.h TextFormat.h Timer.h Topology.h TrajFrameCounter.h Trajin.h TrajinList.h TrajoutList.h TypeNameHolder.h Unit.h Vec3.h Exec_CompareTop.o : Exec_CompareTop.cpp Action.h ActionList.h ActionState.h Analysis.h AnalysisList.h AnalysisState.h ArgList.h AssociatedData.h Atom.h AtomMask.h AtomType.h BaseIOtype.h Box.h Constants.h CoordinateInfo.h CpptrajFile.h CpptrajState.h CpptrajStdio.h DataFile.h DataFileList.h DataSet.h DataSetList.h DataSet_Coords.h DataSet_Coords_REF.h Dimension.h DispatchObject.h EnsembleIn.h EnsembleOutList.h Exec.h Exec_CompareTop.h FileIO.h FileName.h FileTypes.h Frame.h FramePtrArray.h InputTrajCommon.h MaskToken.h Matrix_3x3.h MetaData.h Molecule.h NameType.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h Range.h ReferenceFrame.h ReplicaDimArray.h ReplicaInfo.h Residue.h Segment.h SymbolExporting.h TextFormat.h Timer.h Topology.h TrajFrameCounter.h Trajin.h TrajinList.h TrajoutList.h TypeNameHolder.h Unit.h Vec3.h Exec_CrdAction.o : Exec_CrdAction.cpp Action.h ActionList.h ActionState.h Analysis.h AnalysisList.h AnalysisState.h ArgList.h AssociatedData.h Atom.h AtomMask.h AtomType.h BaseIOtype.h Box.h Cmd.h CmdList.h Command.h CompactFrameArray.h Constants.h CoordinateInfo.h CpptrajFile.h CpptrajState.h CpptrajStdio.h DataFile.h DataFileList.h DataSet.h DataSetList.h DataSet_Coords.h DataSet_Coords_CRD.h DataSet_Coords_REF.h Dimension.h DispatchObject.h EnsembleIn.h EnsembleOutList.h Exec.h Exec_CrdAction.h FileIO.h FileName.h FileTypes.h Frame.h FramePtrArray.h InputTrajCommon.h MaskToken.h Matrix_3x3.h MetaData.h Molecule.h NameType.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h ProgressBar.h Range.h ReferenceFrame.h ReplicaDimArray.h ReplicaInfo.h Residue.h Segment.h SymbolExporting.h TextFormat.h Timer.h Topology.h TrajFrameCounter.h Trajin.h TrajinList.h TrajoutList.h TypeNameHolder.h Unit.h Vec3.h Exec_CrdOut.o : Exec_CrdOut.cpp Action.h ActionFrameCounter.h ActionList.h ActionState.h Analysis.h AnalysisList.h AnalysisState.h ArgList.h AssociatedData.h Atom.h AtomMask.h AtomType.h BaseIOtype.h Box.h Constants.h CoordinateInfo.h CpptrajFile.h CpptrajState.h CpptrajStdio.h DataFile.h DataFileList.h DataSet.h DataSetList.h DataSet_Coords.h DataSet_Coords_REF.h Dimension.h DispatchObject.h EnsembleIn.h EnsembleOutList.h Exec.h Exec_CrdOut.h FileIO.h FileName.h FileTypes.h Frame.h FramePtrArray.h InputTrajCommon.h MaskToken.h Matrix_3x3.h MetaData.h Molecule.h NameType.h OutputTrajCommon.h Parallel.h ParameterHolders.h ParameterSet.h ParameterTypes.h ProgressBar.h Range.h ReferenceFrame.h ReplicaDimArray.h ReplicaInfo.h Residue.h Segment.h SymbolExporting.h TextFormat.h Timer.h Topology.h TrajFrameCounter.h TrajectoryFile.h Trajin.h TrajinList.h TrajoutList.h Trajout_Single.h TypeNameHolder.h Unit.h Vec3.h From dbbfc7b1a9cb9a84c87799e4becc59df3aede7eb Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Tue, 25 Jan 2022 16:03:40 -0500 Subject: [PATCH 03/23] Use time instead of wall time to seed RNG. Protect against int overflow of seed --- src/RNG.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/RNG.cpp b/src/RNG.cpp index 601b005b69..83e6d219e9 100644 --- a/src/RNG.cpp +++ b/src/RNG.cpp @@ -9,9 +9,15 @@ Cpptraj::RNG::RNG() : iseed_(-1) {} int Cpptraj::RNG::Set_Seed(int iseedIn) { // If iseed <= 0 set the seed based on wallclock time if (iseedIn <= 0 ) { - clock_t wallclock = clock(); - iseed_ = (int)wallclock; - mprintf("Random_Number: seed is <= 0, using wallclock time as seed (%i)\n", iseed_); + //clock_t wallclock = clock(); + //iseed_ = (int)wallclock; + time_t currentTime = time(0); + iseed_ = (int)currentTime; + // Since time_t/clock_t are typically something like unsigned long, + // need to protect against overflows. + if (iseed_ < 0) iseed_ = -iseed_; + //mprintf("Random_Number: seed is <= 0, using wallclock time as seed (%i)\n", iseed_); + mprintf("Random_Number: seed is <= 0, using time as seed (%i)\n", iseed_); } else iseed_ = iseedIn; // Set up specific to RNG From b1b3b0e1f582260fe5e4e4be90841e5bf74a0bf1 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Wed, 26 Jan 2022 08:59:14 -0500 Subject: [PATCH 04/23] Use wallclock time again, finer grained so less likelyhood of rapid runs in succession using the same seed I hope --- src/RNG.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/RNG.cpp b/src/RNG.cpp index 83e6d219e9..46b0d3755a 100644 --- a/src/RNG.cpp +++ b/src/RNG.cpp @@ -9,15 +9,15 @@ Cpptraj::RNG::RNG() : iseed_(-1) {} int Cpptraj::RNG::Set_Seed(int iseedIn) { // If iseed <= 0 set the seed based on wallclock time if (iseedIn <= 0 ) { - //clock_t wallclock = clock(); - //iseed_ = (int)wallclock; - time_t currentTime = time(0); - iseed_ = (int)currentTime; + clock_t wallclock = clock(); + iseed_ = (int)wallclock; + //time_t currentTime = time(0); + //iseed_ = (int)currentTime; // Since time_t/clock_t are typically something like unsigned long, // need to protect against overflows. if (iseed_ < 0) iseed_ = -iseed_; - //mprintf("Random_Number: seed is <= 0, using wallclock time as seed (%i)\n", iseed_); - mprintf("Random_Number: seed is <= 0, using time as seed (%i)\n", iseed_); + mprintf("Random_Number: seed is <= 0, using wallclock time as seed ( %i )\n", iseed_); + //mprintf("Random_Number: seed is <= 0, using time as seed (%i)\n", iseed_); } else iseed_ = iseedIn; // Set up specific to RNG From 2e6fd7c4d53e3c5b8d3e5cd43350341d5b4b8c9d Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Wed, 26 Jan 2022 09:37:40 -0500 Subject: [PATCH 05/23] Add missing letter --- doc/cpptraj.lyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/cpptraj.lyx b/doc/cpptraj.lyx index 116b5e5559..fac2fb66b5 100644 --- a/doc/cpptraj.lyx +++ b/doc/cpptraj.lyx @@ -41861,7 +41861,7 @@ The K-dist plot will be named ..dat, with the default prefix being The K-dist plot usually looks like a curve with an initially steep slope that gradually decreases. Around where the initial part of the curve starts to flatten out (indicating - an increas in density) is around where epsilon should be set; minpoints + an increase in density) is around where epsilon should be set; minpoints is set to whatever was. It has been suggested that the shape of the K-dist curve doesn't change too much after Kdist=4, but users are encouraged to experiment. From 993234a80895cc097c61560834c6a84ba5ac0035 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Thu, 27 Jan 2022 08:34:16 -0500 Subject: [PATCH 06/23] Put DBI in separate file --- src/Cluster/DBI.cpp | 46 +++++++++++++++++++++++++++++++++++++++++++++ src/Cluster/DBI.h | 11 +++++++++++ 2 files changed, 57 insertions(+) create mode 100644 src/Cluster/DBI.cpp create mode 100644 src/Cluster/DBI.h diff --git a/src/Cluster/DBI.cpp b/src/Cluster/DBI.cpp new file mode 100644 index 0000000000..e935f7231f --- /dev/null +++ b/src/Cluster/DBI.cpp @@ -0,0 +1,46 @@ +#include +#include +#include +#include "DBI.h" +#include "List.h" +#include "Node.h" +#include "MetricArray.h" + +/** The Davies-Bouldin Index (DBI) measures the average similarity between each + * cluster and its most similar one; the smaller the DBI, the better. The DBI + * is defined as the average, for all clusters X, of fred, where fred(X) = max, + * across other clusters Y, of (Cx + Cy)/dXY. Here Cx is the average distance + * from points in X to the centroid, similarly Cy, and dXY is the distance + * between cluster centroids. + * NOTE: To use this, cluster centroids should be fully up-to-date. + */ +double Cpptraj::Cluster::ComputeDBI(List const& clusters, std::vector& averageDist, MetricArray& metricIn) +{ + averageDist.clear(); + averageDist.reserve( clusters.Nclusters() ); + for (List::cluster_iterator C1 = clusters.begincluster(); C1 != clusters.endcluster(); ++C1) { + // Calculate average distance to centroid for this cluster + averageDist.push_back( C1->CalcAvgToCentroid( metricIn ) ); + //if (outfile.IsOpen()) + // outfile.Printf("#Cluster %i has average-distance-to-centroid %f\n", + // C1->Num(), averageDist.back()); + } + double DBITotal = 0.0; + unsigned int nc1 = 0; + for (List::cluster_iterator c1 = clusters.begincluster(); c1 != clusters.endcluster(); ++c1, ++nc1) { + double MaxFred = 0; + unsigned int nc2 = 0; + for (List::cluster_iterator c2 = clusters.begincluster(); c2 != clusters.endcluster(); ++c2, ++nc2) { + if (c1 != c2) { + double Fred = averageDist[nc1] + averageDist[nc2]; + Fred /= metricIn.CentroidDist( c1->Cent(), c2->Cent() ); + if (Fred > MaxFred) + MaxFred = Fred; + } + } + DBITotal += MaxFred; + } + DBITotal /= (double)clusters.Nclusters(); + //if (outfile.IsOpen()) outfile.Printf("#DBI: %f\n", DBITotal); + return DBITotal; +} diff --git a/src/Cluster/DBI.h b/src/Cluster/DBI.h new file mode 100644 index 0000000000..72539fd541 --- /dev/null +++ b/src/Cluster/DBI.h @@ -0,0 +1,11 @@ +#ifndef INC_CLUSTER_DBI_H +#define INC_CLUSTER_DBI_H +namespace Cpptraj { +namespace Cluster { +class List; +class MetricArray; +/// Calculate the Davies-Bouldin index and average distance to centroid for each cluster +double ComputeDBI(List const&, std::vector&, MetricArray&); +} +} +#endif From 60c00984edb799d12e750f59d8cc4887344af9d9 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Thu, 27 Jan 2022 08:40:28 -0500 Subject: [PATCH 07/23] Put pseudo F calc in separate file --- src/Cluster/PseudoF.cpp | 81 +++++++++++++++++++++++++++++++++++++++++ src/Cluster/PseudoF.h | 11 ++++++ 2 files changed, 92 insertions(+) create mode 100644 src/Cluster/PseudoF.cpp create mode 100644 src/Cluster/PseudoF.h diff --git a/src/Cluster/PseudoF.cpp b/src/Cluster/PseudoF.cpp new file mode 100644 index 0000000000..6faf7e4d19 --- /dev/null +++ b/src/Cluster/PseudoF.cpp @@ -0,0 +1,81 @@ +#include +#include "PseudoF.h" +#include "List.h" +#include "MetricArray.h" +#include "Node.h" +#include "../Constants.h" +#include "../CpptrajStdio.h" + +/** The pseudo-F statistic is another measure of clustering goodness. It is + * intended to capture the 'tightness' of clusters, and is in essence a ratio + * of the mean sum of squares between groups to the mean sum of squares within + * group (Lattin et al., 2003: 291) High values are good. Generally, one + * selects a cluster-count that gives a peak in the pseudo-f statistic (or + * pSF, for short). + * Formula: A/B, where A = (T - P)/(G-1), and B = P / (n-G). Here n is the + * number of points, G is the number of clusters, T is the total distance from + * the all-data centroid, and P is the sum (for all clusters) of the distances + * from the cluster centroid. + * NOTE: To use this, cluster centroids should be fully up-to-date. + * NOTE: This calc differs slightly from PTRAJ in that real centroids are used + * instead of representative structures. + */ +double Cpptraj::Cluster::ComputePseudoF(List const& clusters, double& SSRSST, MetricArray& metricIn, int debugIn) +{ + // Calculation makes no sense with fewer than 2 clusters. + if (clusters.Nclusters() < 2) { + mprintf("Warning: Fewer than 2 clusters. Not calculating pseudo-F.\n"); + return 0.0; + } + + // Form a cluster with all points to get a centroid. Use only frames that + // are in clusters, i.e. ignore noise. Assumes all cluster centroids are + // up to date. + Node c_all; + for (List::cluster_iterator C1 = clusters.begincluster(); C1 != clusters.endcluster(); ++C1) + { + for (Node::frame_iterator f1 = C1->beginframe(); f1 != C1->endframe(); ++f1) + c_all.AddFrameToCluster( *f1 ); + } + // Pseudo-F makes no sense if # clusters == # frames + if (clusters.Nclusters() == c_all.Nframes()) { + mprintf("Warning: Each frame is in a separate cluster. Not calculating pseudo-F.\n"); + return 0.0; + } + c_all.SortFrameList(); + c_all.CalculateCentroid( metricIn ); + + // Loop over all clusters + double gss = 0.0; // between-group sum of squares + double wss = 0.0; // within-group sum of squares + for (List::cluster_iterator C1 = clusters.begincluster(); C1 != clusters.endcluster(); ++C1) + { + for (Node::frame_iterator f1 = C1->beginframe(); f1 != C1->endframe(); ++f1) + { + double dist = metricIn.FrameCentroidDist(*f1, c_all.Cent()); + gss += (dist * dist); + dist = metricIn.FrameCentroidDist(*f1, C1->Cent()); + wss += (dist * dist); + } + } + double d_nclusters = (double)clusters.Nclusters(); + double d_ntotal = (double)c_all.Nframes(); + double num = (gss - wss) / (d_nclusters - 1.0); + double den = wss / (d_ntotal - d_nclusters); + if (den < Constants::SMALL) + den = Constants::SMALL; + double pseudof = num / den; + if (debugIn > 0) + mprintf("Pseudo-f: Total distance to centroid is %.4f\n" + "Pseudo-f: Cluster distance to centroid is %.4f\n" + "Pseudo-f: Numerator %.4f over denominator %.4f gives %.4f\n", + gss, wss, num, den, pseudof); + //if (outfile.IsOpen()) { + // outfile.Printf("#pSF: %f\n", pseudof); + // This calculation taken directly from ptraj + SSRSST = pseudof*(d_nclusters-1)/(d_ntotal-d_nclusters+pseudof*(d_nclusters-1)); + // outfile.Printf("#SSR/SST: %f\n", SSRSST); + //} + + return pseudof; +} diff --git a/src/Cluster/PseudoF.h b/src/Cluster/PseudoF.h new file mode 100644 index 0000000000..d7249f1a3d --- /dev/null +++ b/src/Cluster/PseudoF.h @@ -0,0 +1,11 @@ +#ifndef INC_CLUSTER_PSEUDOF_H +#define INC_CLUSTER_PSEUDOF_H +namespace Cpptraj { +namespace Cluster { +class List; +class MetricArray; +/// Calculate pseudo-F +double ComputePseudoF(List const&, double&, MetricArray&, int); +} +} +#endif From 03d50ea43e836c37fa8cb20f9f0b6e2cb044babb Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Thu, 27 Jan 2022 09:20:12 -0500 Subject: [PATCH 08/23] Take DBI/pseudo-F calc out of list and store those metrics in Control, have them calculated at the end of the run instead of just when info happens. This way they can eventually be put into datasets. --- src/Cluster/CMakeLists.txt | 2 + src/Cluster/Control.cpp | 17 +++++- src/Cluster/Control.h | 6 ++ src/Cluster/List.cpp | 116 ------------------------------------- src/Cluster/List.h | 6 -- src/Cluster/Output.cpp | 12 ++-- src/Cluster/Output.h | 3 +- src/Cluster/PseudoF.cpp | 8 +++ src/Cluster/clusterdepend | 6 +- src/Cluster/clusterfiles | 2 + 10 files changed, 46 insertions(+), 132 deletions(-) diff --git a/src/Cluster/CMakeLists.txt b/src/Cluster/CMakeLists.txt index 75edebfd73..5d84ddfba3 100644 --- a/src/Cluster/CMakeLists.txt +++ b/src/Cluster/CMakeLists.txt @@ -12,6 +12,7 @@ set(CLUSTER_SOURCES Cmatrix_Binary.cpp Cmatrix_NC.cpp Control.cpp + DBI.cpp DrawGraph.cpp DynamicMatrix.cpp List.cpp @@ -23,6 +24,7 @@ set(CLUSTER_SOURCES Metric_Torsion.cpp Node.cpp Output.cpp + PseudoF.cpp Results_Coords.cpp Sieve.cpp Silhouette.cpp diff --git a/src/Cluster/Control.cpp b/src/Cluster/Control.cpp index d17003aef4..9fcaa0b26e 100644 --- a/src/Cluster/Control.cpp +++ b/src/Cluster/Control.cpp @@ -1,6 +1,8 @@ #include #include "Control.h" +#include "DBI.h" #include "Output.h" +#include "PseudoF.h" #include "../ArgList.h" #include "../BufferedLine.h" // For loading info file #include "../CpptrajStdio.h" @@ -44,7 +46,10 @@ Cpptraj::Cluster::Control::Control() : draw_tol_(0), draw_maxit_(0), debug_(0), - metricContribFile_(0) + metricContribFile_(0), + DBITotal_(0), + pseudoF_(0), + SSRSST_(0) {} /** DESTRUCTOR */ @@ -803,7 +808,13 @@ int Cpptraj::Cluster::Control::Run() { clusters_.PrintClusters(); } + // Clustering metrics. Centroids should be up to date. + DBITotal_ = ComputeDBI(clusters_, averageDist_, metrics_); + if (clusters_.Nclusters() > 1) + pseudoF_ = ComputePseudoF(clusters_, SSRSST_, metrics_, debug_); + // TODO assign reference names + timer_post_.Stop(); } timer_run_.Stop(); return 0; @@ -822,12 +833,14 @@ int Cpptraj::Cluster::Control::Output(DataSetList& DSL) { } // Info + if (!suppressInfo_) { CpptrajFile outfile; if (outfile.OpenWrite( clusterinfo_ )) return 1; timer_output_info_.Start(); Output::PrintClustersToFile(outfile, clusters_, *algorithm_, metrics_, - frameSieve_.SieveValue(), frameSieve_.FramesToCluster()); + frameSieve_.SieveValue(), frameSieve_.FramesToCluster(), + DBITotal_, averageDist_, pseudoF_, SSRSST_); timer_output_info_.Stop(); outfile.CloseFile(); } diff --git a/src/Cluster/Control.h b/src/Cluster/Control.h index dc364f5475..58cad50a9a 100644 --- a/src/Cluster/Control.h +++ b/src/Cluster/Control.h @@ -113,6 +113,12 @@ class Control { CpptrajFile* metricContribFile_; ///< If not null, determine how much each metric contributes to total distance + // Various metrics of clustering "goodness" + double DBITotal_; ///< Davies-Bouldin index (DBI) + std::vector averageDist_; ///< Average distance of each cluster to each other cluster (DBI) + double pseudoF_; ///< Pseudo-F (pSF) + double SSRSST_; ///< Sum of squares regression over total sum of squares. + // Timers Timer timer_setup_; ///< Run - metric, frames to cluster setup Timer timer_pairwise_; ///< Run - pairwise caching diff --git a/src/Cluster/List.cpp b/src/Cluster/List.cpp index 0a199a6eba..fd511af22d 100644 --- a/src/Cluster/List.cpp +++ b/src/Cluster/List.cpp @@ -6,7 +6,6 @@ #include "../CpptrajStdio.h" #include "../DataSet_integer.h" #include "../ProgressBar.h" -#include "../Constants.h" // SMALL TODO use limits? #ifdef _OPENMP # include #endif @@ -294,118 +293,3 @@ void Cpptraj::Cluster::List::AddFramesByCentroid(Cframes const& framesIn, Metric mprintf("\t%i of %i sieved frames were discarded as noise.\n", n_sieved_noise, Nsieved); } - -// ----------------------------------------------------------------------------- -/** The Davies-Bouldin Index (DBI) measures the average similarity between each - * cluster and its most similar one; the smaller the DBI, the better. The DBI - * is defined as the average, for all clusters X, of fred, where fred(X) = max, - * across other clusters Y, of (Cx + Cy)/dXY. Here Cx is the average distance - * from points in X to the centroid, similarly Cy, and dXY is the distance - * between cluster centroids. - * NOTE: To use this, cluster centroids should be fully up-to-date. - */ -double Cpptraj::Cluster::List::ComputeDBI(std::vector& averageDist, MetricArray& metricIn) -const -{ - averageDist.clear(); - averageDist.reserve( clusters_.size() ); - for (cluster_iterator C1 = begincluster(); C1 != endcluster(); ++C1) { - // Calculate average distance to centroid for this cluster - averageDist.push_back( C1->CalcAvgToCentroid( metricIn ) ); - //if (outfile.IsOpen()) - // outfile.Printf("#Cluster %i has average-distance-to-centroid %f\n", - // C1->Num(), averageDist.back()); - } - double DBITotal = 0.0; - unsigned int nc1 = 0; - for (cluster_iterator c1 = begincluster(); c1 != endcluster(); ++c1, ++nc1) { - double MaxFred = 0; - unsigned int nc2 = 0; - for (cluster_iterator c2 = begincluster(); c2 != endcluster(); ++c2, ++nc2) { - if (c1 != c2) { - double Fred = averageDist[nc1] + averageDist[nc2]; - Fred /= metricIn.CentroidDist( c1->Cent(), c2->Cent() ); - if (Fred > MaxFred) - MaxFred = Fred; - } - } - DBITotal += MaxFred; - } - DBITotal /= (double)clusters_.size(); - //if (outfile.IsOpen()) outfile.Printf("#DBI: %f\n", DBITotal); - return DBITotal; -} - -/** The pseudo-F statistic is another measure of clustering goodness. It is - * intended to capture the 'tightness' of clusters, and is in essence a ratio - * of the mean sum of squares between groups to the mean sum of squares within - * group (Lattin et al., 2003: 291) High values are good. Generally, one - * selects a cluster-count that gives a peak in the pseudo-f statistic (or - * pSF, for short). - * Formula: A/B, where A = (T - P)/(G-1), and B = P / (n-G). Here n is the - * number of points, G is the number of clusters, T is the total distance from - * the all-data centroid, and P is the sum (for all clusters) of the distances - * from the cluster centroid. - * NOTE: To use this, cluster centroids should be fully up-to-date. - * NOTE: This calc differs slightly from PTRAJ in that real centroids are used - * instead of representative structures. - */ -double Cpptraj::Cluster::List::ComputePseudoF(double& SSRSST, MetricArray& metricIn) const -{ - // Calculation makes no sense with fewer than 2 clusters. - if (Nclusters() < 2) { - mprintf("Warning: Fewer than 2 clusters. Not calculating pseudo-F.\n"); - return 0.0; - } - - // Form a cluster with all points to get a centroid. Use only frames that - // are in clusters, i.e. ignore noise. Assumes all cluster centroids are - // up to date. - Node c_all; - for (cluster_iterator C1 = begincluster(); C1 != endcluster(); ++C1) - { - for (Node::frame_iterator f1 = C1->beginframe(); f1 != C1->endframe(); ++f1) - c_all.AddFrameToCluster( *f1 ); - } - // Pseudo-F makes no sense if # clusters == # frames - if (Nclusters() == c_all.Nframes()) { - mprintf("Warning: Each frame is in a separate cluster. Not calculating pseudo-F.\n"); - return 0.0; - } - c_all.SortFrameList(); - c_all.CalculateCentroid( metricIn ); - - // Loop over all clusters - double gss = 0.0; // between-group sum of squares - double wss = 0.0; // within-group sum of squares - for (cluster_iterator C1 = begincluster(); C1 != endcluster(); ++C1) - { - for (Node::frame_iterator f1 = C1->beginframe(); f1 != C1->endframe(); ++f1) - { - double dist = metricIn.FrameCentroidDist(*f1, c_all.Cent()); - gss += (dist * dist); - dist = metricIn.FrameCentroidDist(*f1, C1->Cent()); - wss += (dist * dist); - } - } - double d_nclusters = (double)Nclusters(); - double d_ntotal = (double)c_all.Nframes(); - double num = (gss - wss) / (d_nclusters - 1.0); - double den = wss / (d_ntotal - d_nclusters); - if (den < Constants::SMALL) - den = Constants::SMALL; - double pseudof = num / den; - if (debug_ > 0) - mprintf("Pseudo-f: Total distance to centroid is %.4f\n" - "Pseudo-f: Cluster distance to centroid is %.4f\n" - "Pseudo-f: Numerator %.4f over denominator %.4f gives %.4f\n", - gss, wss, num, den, pseudof); - //if (outfile.IsOpen()) { - // outfile.Printf("#pSF: %f\n", pseudof); - // This calculation taken directly from ptraj - SSRSST = pseudof*(d_nclusters-1)/(d_ntotal-d_nclusters+pseudof*(d_nclusters-1)); - // outfile.Printf("#SSR/SST: %f\n", SSRSST); - //} - - return pseudof; -} diff --git a/src/Cluster/List.h b/src/Cluster/List.h index 31018e1ee4..dfc1d4e376 100644 --- a/src/Cluster/List.h +++ b/src/Cluster/List.h @@ -1,6 +1,5 @@ #ifndef INC_CLUSTER_LIST_H #define INC_CLUSTER_LIST_H -//#include #include "Cframes.h" class DataSet_integer; namespace Cpptraj { @@ -66,11 +65,6 @@ class List { int CreateCnumVsTime(DataSet_integer&, unsigned int, int, int) const; /// Generate number unique clusters vs time data set int NclustersObserved(DataSet_integer&, unsigned int, int) const; - - /// Calculate the Davies-Bouldin index. - double ComputeDBI(std::vector&, MetricArray&) const; - /// Calculate pseudo-F - double ComputePseudoF(double&, MetricArray&) const; private: typedef std::list Narray; Narray clusters_; ///< Hold all clusters. diff --git a/src/Cluster/Output.cpp b/src/Cluster/Output.cpp index b084204f30..8d9dc46755 100644 --- a/src/Cluster/Output.cpp +++ b/src/Cluster/Output.cpp @@ -30,7 +30,9 @@ void Cpptraj::Cluster::Output::PrintClustersToFile(CpptrajFile& outfile, List const& clusters, Algorithm const& algorithmIn, MetricArray& metricIn, int sieve, - Cframes const& sievedFrames) + Cframes const& sievedFrames, + double DBITotal, std::vector const& averageDist, + double pseudof, double SSRSST) { //CpptrajFile outfile; std::string buffer; @@ -43,8 +45,8 @@ void Cpptraj::Cluster::Output::PrintClustersToFile(CpptrajFile& outfile, outfile.Printf("#Clustering: %i clusters %u frames\n", clusters.Nclusters(), metricIn.Ntotal()); // DBI - std::vector averageDist; - double DBITotal = clusters.ComputeDBI( averageDist, metricIn ); + //std::vector averageDist; + //double DBITotal = clusters.ComputeDBI( averageDist, metricIn ); std::vector::const_iterator avgd = averageDist.begin(); for (List::cluster_iterator C = clusters.begincluster(); C != clusters.endcluster(); @@ -53,8 +55,8 @@ void Cpptraj::Cluster::Output::PrintClustersToFile(CpptrajFile& outfile, outfile.Printf("#DBI: %f\n", DBITotal); // Pseudo-F if (clusters.Nclusters() > 1) { - double SSRSST = 0.0; - double pseudof = clusters.ComputePseudoF( SSRSST, metricIn ); + //double SSRSST = 0.0; + //double pseudof = clusters.ComputePseudoF( SSRSST, metricIn ); outfile.Printf("#pSF: %f\n", pseudof); outfile.Printf("#SSR/SST: %f\n", SSRSST); } else diff --git a/src/Cluster/Output.h b/src/Cluster/Output.h index 033e212f53..0ec1c91702 100644 --- a/src/Cluster/Output.h +++ b/src/Cluster/Output.h @@ -12,7 +12,8 @@ class BestReps; class Output { public: static void PrintClustersToFile(CpptrajFile&, List const&, Algorithm const&, MetricArray&, - int, Cframes const&); + int, Cframes const&, double, std::vector const&, + double, double); static int Summary(CpptrajFile&, List const&, Algorithm const&, MetricArray&, bool, bool, std::vector const&); static void Summary_Part(CpptrajFile&, unsigned int, Cframes const&, List const&, diff --git a/src/Cluster/PseudoF.cpp b/src/Cluster/PseudoF.cpp index 6faf7e4d19..abc91c7628 100644 --- a/src/Cluster/PseudoF.cpp +++ b/src/Cluster/PseudoF.cpp @@ -16,9 +16,17 @@ * number of points, G is the number of clusters, T is the total distance from * the all-data centroid, and P is the sum (for all clusters) of the distances * from the cluster centroid. + * This will also calculate SSR (sum of squares regression, between sum of + * squares) over the total sum of squares (SST). The SSR/SST ratio value lies + * between 0 and 1 and gives the percentage of explained variance by the data, + * and is similar to the R 2 value in regression analysis. As the ratio + * inherently rises with cluster count, one looks for an “elbow” in the curve + * where adding another cluster does not add much new information. * NOTE: To use this, cluster centroids should be fully up-to-date. * NOTE: This calc differs slightly from PTRAJ in that real centroids are used * instead of representative structures. + * \param clusters List of clusters + * \param SSRSST sum of squares regression (SSR or between sum of squares) over total sum of squares (SST). */ double Cpptraj::Cluster::ComputePseudoF(List const& clusters, double& SSRSST, MetricArray& metricIn, int debugIn) { diff --git a/src/Cluster/clusterdepend b/src/Cluster/clusterdepend index 909eb6cec8..1fafe4d710 100644 --- a/src/Cluster/clusterdepend +++ b/src/Cluster/clusterdepend @@ -9,10 +9,11 @@ Centroid_Coord.o : Centroid_Coord.cpp ../Atom.h ../AtomMask.h ../Box.h ../Coordi Cframes.o : Cframes.cpp Cframes.h Cmatrix_Binary.o : Cmatrix_Binary.cpp ../CpptrajFile.h ../CpptrajStdio.h ../FileIO.h ../FileName.h ../Parallel.h Cframes.h Cmatrix_Binary.h Cmatrix_NC.o : Cmatrix_NC.cpp ../CpptrajStdio.h ../FileName.h ../NC_Routines.h Cframes.h Cmatrix_NC.h -Control.o : Control.cpp ../ArgList.h ../ArrayIterator.h ../AssociatedData.h ../Atom.h ../AtomMask.h ../AtomType.h ../BaseIOtype.h ../Box.h ../BufferedLine.h ../Cluster/Cframes.h ../Constants.h ../CoordinateInfo.h ../CpptrajFile.h ../CpptrajStdio.h ../DataFile.h ../DataFileList.h ../DataSet.h ../DataSetList.h ../DataSet_1D.h ../DataSet_Coords.h ../DataSet_Coords_REF.h ../DataSet_PairwiseCache.h ../DataSet_float.h ../DataSet_integer.h ../Dimension.h ../FileIO.h ../FileName.h ../FileTypes.h ../Frame.h ../MaskToken.h ../Matrix.h ../Matrix_3x3.h ../MetaData.h ../Molecule.h ../NameType.h ../Parallel.h ../ParameterHolders.h ../ParameterSet.h ../ParameterTypes.h ../Random.h ../Range.h ../ReferenceFrame.h ../ReplicaDimArray.h ../Residue.h ../Segment.h ../SymbolExporting.h ../TextFormat.h ../Timer.h ../Topology.h ../TrajectoryFile.h ../TypeNameHolder.h ../Unit.h ../Vec3.h Algorithm.h Algorithm_DBscan.h Algorithm_DPeaks.h Algorithm_HierAgglo.h Algorithm_Kmeans.h BestReps.h CentroidArray.h Cframes.h Control.h DrawGraph.h DynamicMatrix.h List.h Metric.h MetricArray.h Node.h Output.h Results.h Results_Coords.h Sieve.h Silhouette.h +Control.o : Control.cpp ../ArgList.h ../ArrayIterator.h ../AssociatedData.h ../Atom.h ../AtomMask.h ../AtomType.h ../BaseIOtype.h ../Box.h ../BufferedLine.h ../Cluster/Cframes.h ../Constants.h ../CoordinateInfo.h ../CpptrajFile.h ../CpptrajStdio.h ../DataFile.h ../DataFileList.h ../DataSet.h ../DataSetList.h ../DataSet_1D.h ../DataSet_Coords.h ../DataSet_Coords_REF.h ../DataSet_PairwiseCache.h ../DataSet_float.h ../DataSet_integer.h ../Dimension.h ../FileIO.h ../FileName.h ../FileTypes.h ../Frame.h ../MaskToken.h ../Matrix.h ../Matrix_3x3.h ../MetaData.h ../Molecule.h ../NameType.h ../Parallel.h ../ParameterHolders.h ../ParameterSet.h ../ParameterTypes.h ../Random.h ../Range.h ../ReferenceFrame.h ../ReplicaDimArray.h ../Residue.h ../Segment.h ../SymbolExporting.h ../TextFormat.h ../Timer.h ../Topology.h ../TrajectoryFile.h ../TypeNameHolder.h ../Unit.h ../Vec3.h Algorithm.h Algorithm_DBscan.h Algorithm_DPeaks.h Algorithm_HierAgglo.h Algorithm_Kmeans.h BestReps.h CentroidArray.h Cframes.h Control.h DBI.h DrawGraph.h DynamicMatrix.h List.h Metric.h MetricArray.h Node.h Output.h PseudoF.h Results.h Results_Coords.h Sieve.h Silhouette.h +DBI.o : DBI.cpp CentroidArray.h Cframes.h DBI.h List.h Metric.h MetricArray.h Node.h DrawGraph.o : DrawGraph.cpp ../AssociatedData.h ../Atom.h ../Constants.h ../CpptrajFile.h ../CpptrajStdio.h ../DataSet.h ../DataSet_1D.h ../Dimension.h ../FileIO.h ../FileName.h ../MetaData.h ../NameType.h ../PDBfile.h ../Parallel.h ../Range.h ../Residue.h ../SymbolExporting.h ../TextFormat.h ../Vec3.h Cframes.h DrawGraph.h Metric.h MetricArray.h DynamicMatrix.o : DynamicMatrix.cpp ../ArrayIterator.h ../CpptrajStdio.h ../Matrix.h DynamicMatrix.h -List.o : List.cpp ../AssociatedData.h ../Constants.h ../CpptrajFile.h ../CpptrajStdio.h ../DataSet.h ../DataSet_1D.h ../DataSet_integer.h ../Dimension.h ../FileIO.h ../FileName.h ../MetaData.h ../Parallel.h ../ProgressBar.h ../Range.h ../TextFormat.h CentroidArray.h Cframes.h List.h Metric.h MetricArray.h Node.h +List.o : List.cpp ../AssociatedData.h ../CpptrajFile.h ../CpptrajStdio.h ../DataSet.h ../DataSet_1D.h ../DataSet_integer.h ../Dimension.h ../FileIO.h ../FileName.h ../MetaData.h ../Parallel.h ../ProgressBar.h ../Range.h ../TextFormat.h CentroidArray.h Cframes.h List.h Metric.h MetricArray.h Node.h MetricArray.o : MetricArray.cpp ../ArgList.h ../ArrayIterator.h ../AssociatedData.h ../Atom.h ../AtomMap.h ../AtomMask.h ../AtomType.h ../BaseIOtype.h ../Box.h ../Cluster/Cframes.h ../Constants.h ../CoordinateInfo.h ../CpptrajFile.h ../CpptrajStdio.h ../DataFile.h ../DataFileList.h ../DataSet.h ../DataSetList.h ../DataSet_Coords.h ../DataSet_Coords_REF.h ../DataSet_PairwiseCache.h ../Dimension.h ../FileIO.h ../FileName.h ../FileTypes.h ../Frame.h ../Hungarian.h ../MapAtom.h ../MaskToken.h ../Matrix.h ../Matrix_3x3.h ../MetaData.h ../Molecule.h ../NameType.h ../OnlineVarT.h ../Parallel.h ../ParameterHolders.h ../ParameterSet.h ../ParameterTypes.h ../ProgressBar.h ../Range.h ../ReferenceFrame.h ../ReplicaDimArray.h ../Residue.h ../Segment.h ../StringRoutines.h ../SymbolExporting.h ../SymmetricRmsdCalc.h ../TextFormat.h ../Timer.h ../Topology.h ../TypeNameHolder.h ../Unit.h ../Vec3.h CentroidArray.h Metric.h MetricArray.h Metric_DME.h Metric_RMS.h Metric_SRMSD.h Metric_Scalar.h Metric_Torsion.h Metric_DME.o : Metric_DME.cpp ../AssociatedData.h ../Atom.h ../AtomMask.h ../AtomType.h ../Box.h ../Constants.h ../CoordinateInfo.h ../CpptrajFile.h ../CpptrajStdio.h ../DataSet.h ../DataSet_Coords.h ../Dimension.h ../FileIO.h ../FileName.h ../Frame.h ../MaskToken.h ../Matrix_3x3.h ../MetaData.h ../Molecule.h ../NameType.h ../Parallel.h ../ParameterHolders.h ../ParameterSet.h ../ParameterTypes.h ../Range.h ../ReplicaDimArray.h ../Residue.h ../Segment.h ../SymbolExporting.h ../TextFormat.h ../Topology.h ../TypeNameHolder.h ../Unit.h ../Vec3.h Centroid.h Centroid_Coord.h Cframes.h Metric.h Metric_DME.h Metric_RMS.o : Metric_RMS.cpp ../AssociatedData.h ../Atom.h ../AtomMask.h ../AtomType.h ../Box.h ../Constants.h ../CoordinateInfo.h ../CpptrajFile.h ../CpptrajStdio.h ../DataSet.h ../DataSet_Coords.h ../Dimension.h ../FileIO.h ../FileName.h ../Frame.h ../MaskToken.h ../Matrix_3x3.h ../MetaData.h ../Molecule.h ../NameType.h ../Parallel.h ../ParameterHolders.h ../ParameterSet.h ../ParameterTypes.h ../Range.h ../ReplicaDimArray.h ../Residue.h ../Segment.h ../SymbolExporting.h ../TextFormat.h ../Topology.h ../TypeNameHolder.h ../Unit.h ../Vec3.h Centroid.h Centroid_Coord.h Cframes.h Metric.h Metric_RMS.h @@ -21,6 +22,7 @@ Metric_Scalar.o : Metric_Scalar.cpp ../AssociatedData.h ../CpptrajFile.h ../Cppt Metric_Torsion.o : Metric_Torsion.cpp ../AssociatedData.h ../Constants.h ../CpptrajFile.h ../CpptrajStdio.h ../DataSet.h ../DataSet_1D.h ../Dimension.h ../FileIO.h ../FileName.h ../MetaData.h ../Parallel.h ../Range.h ../TextFormat.h Centroid.h Centroid_Num.h Cframes.h Metric.h Metric_Torsion.h Node.o : Node.cpp ../AssociatedData.h ../CpptrajFile.h ../DataSet.h ../DataSet_1D.h ../DataSet_float.h ../DataSet_integer.h ../Dimension.h ../FileIO.h ../FileName.h ../MetaData.h ../Parallel.h ../Range.h ../TextFormat.h CentroidArray.h Cframes.h Metric.h MetricArray.h Node.h Output.o : Output.cpp ../ArrayIterator.h ../CpptrajFile.h ../CpptrajStdio.h ../FileIO.h ../FileName.h ../Matrix.h ../Parallel.h ../Timer.h Algorithm.h BestReps.h CentroidArray.h Cframes.h List.h Metric.h MetricArray.h Node.h Output.h +PseudoF.o : PseudoF.cpp ../Constants.h ../CpptrajStdio.h CentroidArray.h Cframes.h List.h Metric.h MetricArray.h Node.h PseudoF.h Results_Coords.o : Results_Coords.cpp ../ActionFrameCounter.h ../ArgList.h ../ArrayIterator.h ../AssociatedData.h ../Atom.h ../AtomMap.h ../AtomMask.h ../AtomType.h ../BaseIOtype.h ../Box.h ../Constants.h ../CoordinateInfo.h ../CpptrajFile.h ../CpptrajStdio.h ../DataSet.h ../DataSetList.h ../DataSet_Coords.h ../DataSet_Coords_REF.h ../Dimension.h ../FileIO.h ../FileName.h ../FileTypes.h ../Frame.h ../Hungarian.h ../MapAtom.h ../MaskToken.h ../Matrix.h ../Matrix_3x3.h ../MetaData.h ../Molecule.h ../NameType.h ../OutputTrajCommon.h ../Parallel.h ../ParameterHolders.h ../ParameterSet.h ../ParameterTypes.h ../Range.h ../ReferenceFrame.h ../ReplicaDimArray.h ../Residue.h ../Segment.h ../StringRoutines.h ../SymbolExporting.h ../SymmetricRmsdCalc.h ../TextFormat.h ../Timer.h ../Topology.h ../TrajectoryFile.h ../Trajout_Single.h ../TypeNameHolder.h ../Unit.h ../Vec3.h CentroidArray.h Cframes.h List.h Metric.h MetricArray.h Metric_DME.h Metric_RMS.h Metric_SRMSD.h Node.h Results.h Results_Coords.h Sieve.o : Sieve.cpp ../AssociatedData.h ../Cluster/Cframes.h ../CpptrajFile.h ../DataSet.h ../DataSet_PairwiseCache.h ../Dimension.h ../FileIO.h ../FileName.h ../MetaData.h ../Parallel.h ../Random.h ../Range.h ../TextFormat.h Cframes.h Sieve.h Silhouette.o : Silhouette.cpp ../CpptrajFile.h ../CpptrajStdio.h ../FileIO.h ../FileName.h ../Parallel.h CentroidArray.h Cframes.h List.h Metric.h MetricArray.h Node.h Silhouette.h diff --git a/src/Cluster/clusterfiles b/src/Cluster/clusterfiles index c1f870778a..293bad8a4c 100644 --- a/src/Cluster/clusterfiles +++ b/src/Cluster/clusterfiles @@ -12,6 +12,7 @@ CLUSTER_SOURCES= \ Cmatrix_Binary.cpp \ Cmatrix_NC.cpp \ Control.cpp \ + DBI.cpp \ DrawGraph.cpp \ DynamicMatrix.cpp \ List.cpp \ @@ -23,6 +24,7 @@ CLUSTER_SOURCES= \ Metric_Torsion.cpp \ Node.cpp \ Output.cpp \ + PseudoF.cpp \ Results_Coords.cpp \ Sieve.cpp \ Silhouette.cpp From a1481051f0cdcd525920ee2db4bffb4c975945cb Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Thu, 27 Jan 2022 09:55:25 -0500 Subject: [PATCH 09/23] Allow cpptraj to be set via env var --- devtools/benchmark/cluster/cluster.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/devtools/benchmark/cluster/cluster.sh b/devtools/benchmark/cluster/cluster.sh index d310814bb3..8b8a9d9c5d 100755 --- a/devtools/benchmark/cluster/cluster.sh +++ b/devtools/benchmark/cluster/cluster.sh @@ -1,8 +1,10 @@ #!/bin/bash -CPPTRAJ=`which cpptraj` -#export OMP_NUM_THREADS=12 -#CPPTRAJ=`which cpptraj.OMP` +if [ -z "$CPPTRAJ" ] ; then + CPPTRAJ=`which cpptraj` + #export OMP_NUM_THREADS=12 + #CPPTRAJ=`which cpptraj.OMP` +fi TOP=~/Cpptraj/Cpptraj-ExtendedTests/ChainA_1-268_NAD_TCL-gaff.tip3p.parm7 TRJ=~/Cpptraj/Cpptraj-ExtendedTests/run9.nc # 2000 frames From fbc13e9abda4f3da67188469f1212cddb3169c81 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Thu, 27 Jan 2022 10:26:28 -0500 Subject: [PATCH 10/23] Add data sets for DBI and pseudo-F --- src/Cluster/Control.cpp | 16 ++++++++++++++-- src/Cluster/Control.h | 2 ++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/Cluster/Control.cpp b/src/Cluster/Control.cpp index 9fcaa0b26e..d6142f9a0f 100644 --- a/src/Cluster/Control.cpp +++ b/src/Cluster/Control.cpp @@ -49,7 +49,9 @@ Cpptraj::Cluster::Control::Control() : metricContribFile_(0), DBITotal_(0), pseudoF_(0), - SSRSST_(0) + SSRSST_(0), + dbi_set_(0), + psf_set_(0) {} /** DESTRUCTOR */ @@ -488,6 +490,12 @@ int Cpptraj::Cluster::Control::SetupClustering(DataSetList const& setsToCluster, if (cnumvtime_ == 0) return 1; if (cnumvtimefile != 0) cnumvtimefile->AddDataSet( cnumvtime_ ); + // DBI and pSF data sets + dbi_set_ = DSL.AddSet(DataSet::DOUBLE, MetaData(dsname_, "DBI")); + if (dbi_set_ == 0) return 1; + psf_set_ = DSL.AddSet(DataSet::DOUBLE, MetaData(dsname_, "PSF")); + if (psf_set_ == 0) return 1; + // If cache was allocated, add to the DataSetList so it is after cnumvtime for pytraj if (metrics_.CacheWasAllocated()) DSL.AddSet( metrics_.CachePtr() ); @@ -810,8 +818,12 @@ int Cpptraj::Cluster::Control::Run() { // Clustering metrics. Centroids should be up to date. DBITotal_ = ComputeDBI(clusters_, averageDist_, metrics_); - if (clusters_.Nclusters() > 1) + mprintf("DEBUG: DBI %f\n", DBITotal_); + dbi_set_->Add(0, &DBITotal_); + if (clusters_.Nclusters() > 1) { pseudoF_ = ComputePseudoF(clusters_, SSRSST_, metrics_, debug_); + psf_set_->Add(0, &pseudoF_); + } // TODO assign reference names timer_post_.Stop(); diff --git a/src/Cluster/Control.h b/src/Cluster/Control.h index 58cad50a9a..193a0e062f 100644 --- a/src/Cluster/Control.h +++ b/src/Cluster/Control.h @@ -118,6 +118,8 @@ class Control { std::vector averageDist_; ///< Average distance of each cluster to each other cluster (DBI) double pseudoF_; ///< Pseudo-F (pSF) double SSRSST_; ///< Sum of squares regression over total sum of squares. + DataSet* dbi_set_; ///< DataSet to store DBI. + DataSet* psf_set_; ///< DataSet to store pseudo-F. // Timers Timer timer_setup_; ///< Run - metric, frames to cluster setup From effc938da7d16fc0c1804d4375ad622fe7a7cf05 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Thu, 27 Jan 2022 10:30:37 -0500 Subject: [PATCH 11/23] Store SSR/SST value in set. Update manual. --- doc/cpptraj.lyx | 12 ++++++++++++ src/Cluster/Control.cpp | 6 +++++- src/Cluster/Control.h | 1 + 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/doc/cpptraj.lyx b/doc/cpptraj.lyx index fac2fb66b5..31cbc379e9 100644 --- a/doc/cpptraj.lyx +++ b/doc/cpptraj.lyx @@ -41638,6 +41638,18 @@ gracecolor specified). \end_layout +\begin_layout Description +[DBI] Hold final Davies-Bouldin index. +\end_layout + +\begin_layout Description +[PSF] Hold final pseudo-F value. +\end_layout + +\begin_layout Description +[SSRSST] Hold final SSR/SST value. +\end_layout + \begin_layout Description [NCVT] ( \series bold diff --git a/src/Cluster/Control.cpp b/src/Cluster/Control.cpp index d6142f9a0f..95330438fc 100644 --- a/src/Cluster/Control.cpp +++ b/src/Cluster/Control.cpp @@ -51,7 +51,8 @@ Cpptraj::Cluster::Control::Control() : pseudoF_(0), SSRSST_(0), dbi_set_(0), - psf_set_(0) + psf_set_(0), + ssrsst_set_(0) {} /** DESTRUCTOR */ @@ -495,6 +496,8 @@ int Cpptraj::Cluster::Control::SetupClustering(DataSetList const& setsToCluster, if (dbi_set_ == 0) return 1; psf_set_ = DSL.AddSet(DataSet::DOUBLE, MetaData(dsname_, "PSF")); if (psf_set_ == 0) return 1; + ssrsst_set_ = DSL.AddSet(DataSet::DOUBLE, MetaData(dsname_, "SSRSST")); + if (ssrsst_set_ == 0) return 1; // If cache was allocated, add to the DataSetList so it is after cnumvtime for pytraj if (metrics_.CacheWasAllocated()) @@ -823,6 +826,7 @@ int Cpptraj::Cluster::Control::Run() { if (clusters_.Nclusters() > 1) { pseudoF_ = ComputePseudoF(clusters_, SSRSST_, metrics_, debug_); psf_set_->Add(0, &pseudoF_); + ssrsst_set_->Add(0, &SSRSST_); } // TODO assign reference names diff --git a/src/Cluster/Control.h b/src/Cluster/Control.h index 193a0e062f..557da5a7e1 100644 --- a/src/Cluster/Control.h +++ b/src/Cluster/Control.h @@ -120,6 +120,7 @@ class Control { double SSRSST_; ///< Sum of squares regression over total sum of squares. DataSet* dbi_set_; ///< DataSet to store DBI. DataSet* psf_set_; ///< DataSet to store pseudo-F. + DataSet* ssrsst_set_; ///< DataSet to store SSR/SST. // Timers Timer timer_setup_; ///< Run - metric, frames to cluster setup From 813a3e98f51fede1bfc0f09c31efc102ade508b4 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Thu, 27 Jan 2022 14:15:51 -0500 Subject: [PATCH 12/23] Fix TIMER compile --- src/DataSetList.cpp | 2 ++ src/DataSetList.h | 7 ++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/DataSetList.cpp b/src/DataSetList.cpp index af0b2f162f..9486970992 100644 --- a/src/DataSetList.cpp +++ b/src/DataSetList.cpp @@ -435,7 +435,9 @@ DataSet* DataSetList::AddSet( DataSet::DataType inType, MetaData const& metaIn, * \return pointer to successfully set-up DataSet or 0 if error. */ DataSet* DataSetList::AllocateSet(DataSet::DataType inType, MetaData const& metaIn) +#ifndef TIMER const +#endif { // TODO Always generate default name if empty? # ifdef TIMER time_total_.Start(); diff --git a/src/DataSetList.h b/src/DataSetList.h index 8fb262f986..c13d63fe66 100644 --- a/src/DataSetList.h +++ b/src/DataSetList.h @@ -108,8 +108,13 @@ class DataSetList { DataSet* AddSet_NoCheck(DataSet::DataType, MetaData const&); /// Add an already set up DataSet to list; memory for DataSet will be freed. int AddSet( DataSet* ); - /// Allocate DataSet but do not add to the list +# ifdef TIMER + /// Allocate DataSet but do not add to the list. Not const so that Timer can be used. + DataSet* AllocateSet(DataSet::DataType, MetaData const&); +# else + /// Allocate DataSet but do not add to the list. DataSet* AllocateSet(DataSet::DataType, MetaData const&) const; +# endif /// Add new sets or append to existing ones. int AddOrAppendSets(std::string const&, Darray const&, DataListType const&); /// Add a copy of the DataSet to the list; memory for DataSet will not be freed. From 6aad51075b443a951f0cb512ff1706b0fd475e28 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Fri, 28 Jan 2022 10:59:40 -0500 Subject: [PATCH 13/23] Start exploring a different way of speeding up finding cluster to cluster min distance by keeping track of closest cluster to each cluster and updating as needed. --- src/Cluster/Algorithm_HierAgglo.cpp | 14 +++- src/Cluster/DynamicMatrix.cpp | 86 ++++++++++++++++++++ src/Cluster/DynamicMatrix.h | 119 +++++++++++++++++++++++++++- 3 files changed, 212 insertions(+), 7 deletions(-) diff --git a/src/Cluster/Algorithm_HierAgglo.cpp b/src/Cluster/Algorithm_HierAgglo.cpp index 39defa273f..b247f4285e 100644 --- a/src/Cluster/Algorithm_HierAgglo.cpp +++ b/src/Cluster/Algorithm_HierAgglo.cpp @@ -69,9 +69,9 @@ void Cpptraj::Cluster::Algorithm_HierAgglo::Results(CpptrajFile& outfile) const /** Print clustering timing. */ void Cpptraj::Cluster::Algorithm_HierAgglo::Timing(double total) const { # ifdef TIMER - time_findMin_.WriteTiming(2, "Find min distance", total); - time_mergeFrames_.WriteTiming(2, "Merge cluster frames", total); - time_calcLinkage_.WriteTiming(2, "Calculate new linkage", total); + time_findMin_.WriteTiming(2, "Find min distance :", total); + time_mergeFrames_.WriteTiming(2, "Merge cluster frames :", total); + time_calcLinkage_.WriteTiming(2, "Calculate new linkage :", total); # endif } @@ -132,6 +132,9 @@ int Cpptraj::Cluster::Algorithm_HierAgglo::DoClustering(List& clusters, ClusterDistances_.SetCdist( C1_it->Num(), C2_it->Num(), dval ); } } + // DEBUG print closest indices + ClusterDistances_.PrintElementsSquare(); + ClusterDistances_.PrintClosest(); //InitializeClusterDistances(); // DEBUG - print initial clusters if (debug_ > 1) @@ -148,7 +151,10 @@ int Cpptraj::Cluster::Algorithm_HierAgglo::DoClustering(List& clusters, break; } if (clusters.Nclusters() == 1) clusteringComplete = true; // Sanity check - cluster_progress.Update( iterations++ ); +// cluster_progress.Update( iterations++ ); // FIXME reenable + // DEBUG print closest indices + ClusterDistances_.PrintElementsSquare(); + ClusterDistances_.PrintClosest(); } mprintf("\tCompleted after %i iterations, %u clusters.\n",iterations, clusters.Nclusters()); diff --git a/src/Cluster/DynamicMatrix.cpp b/src/Cluster/DynamicMatrix.cpp index 49245ca751..788f045359 100644 --- a/src/Cluster/DynamicMatrix.cpp +++ b/src/Cluster/DynamicMatrix.cpp @@ -61,6 +61,34 @@ double Cpptraj::Cluster::DynamicMatrix::FindMin(int& iOut, int& jOut) const { } } } + mprintf("DEBUG: Min found at row %i col %i (%f)\n", iOut, jOut, min); + + + float currentMin = std::numeric_limits::max(); + int minRow = -1; + int minCol = -1; + for (unsigned int col = 0; col != closestIdx_.size(); col++) + { + int row = closestIdx_[col]; + if (!ignore_[col] && !ignore_[row]) { + float mval = Mat_.element(col, row); + if (mval < currentMin) { + currentMin = mval; + minRow = (int)row; + minCol = (int)col; + } + } + } + if (minCol < minRow) { //FIXME i think only needed for testing + int itmp = minCol; + minCol = minRow; + minRow = itmp; + } + mprintf("DEBUG: Second try: Min found at row %i col %i (%f)\n", minRow, minCol, currentMin); + + if (iOut != minRow || jOut != minCol) + mprintf("Error: Column/row mismatch.\n"); + return (double)min; } #endif @@ -80,10 +108,68 @@ void Cpptraj::Cluster::DynamicMatrix::PrintElements() const { } } +/** Print elements like a square. */ +void Cpptraj::Cluster::DynamicMatrix::PrintElementsSquare() const { +/* mprintf("%4s", ""); + for (unsigned int col = 0; col < Mat_.Ncols(); col++) + mprintf(" %12u", col); + mprintf("\n"); + for (unsigned int row = 0; row < Mat_.Nrows(); row++) { + mprintf("%4u", row); + for (unsigned int col = 0; col < Mat_.Ncols(); col++) { + if (row <= col) + mprintf(" %12s", "X"); + else if (ignore_[row] || ignore_[col]) + mprintf(" %12s", "i"); + else + mprintf(" %12.4f", Mat_.element(col, row)); + } + mprintf("\n"); + }*/ + for (unsigned int row = 0; row < Mat_.Nrows(); row++) { + if (!ignore_[row]) { + mprintf("%4u :", row); + for (unsigned int col = 0; col < Mat_.Ncols(); col++) { + if (col != row && !ignore_[col]) + mprintf(" %4u=%8.3f", col, Mat_.element(col, row)); + } + mprintf("\n"); + } + } +} + +/** Print closest indices to STDOUT. */ +void Cpptraj::Cluster::DynamicMatrix::PrintClosest() const { + for (unsigned int idx = 0; idx != closestIdx_.size(); idx++) { + if (!ignore_[idx]) { + mprintf("DEBUG: Closest cluster to %u is %i (%f)\n", idx, closestIdx_[idx], Mat_.element(idx, closestIdx_[idx])); + // Check that this is actually the case + int cidx = -1; + float cdist = 0; + for (unsigned int jdx = 0; jdx != closestIdx_.size(); jdx++) { + if (idx != jdx && !ignore_[jdx]) { + float fdist = Mat_.element(idx, jdx); + if (cidx == -1) { + cidx = jdx; + cdist = fdist; + } else if (fdist < cdist) { + cidx = jdx; + cdist = fdist; + } + } + } + if (cidx != closestIdx_[idx]) { + mprintf("Error: ACTUAL closest idx to %u is %i (%f)\n", idx, cidx, cdist); + } + } + } +} + // DynamicMatrix::SetupMatrix() int Cpptraj::Cluster::DynamicMatrix::SetupMatrix(size_t sizeIn) { if (Mat_.resize( 0L, sizeIn )) return 1; ignore_.assign( sizeIn, false ); + closestIdx_.assign( sizeIn, -1 ); # ifdef _OPENMP int n_threads = 0; # pragma omp parallel diff --git a/src/Cluster/DynamicMatrix.h b/src/Cluster/DynamicMatrix.h index bcfee8be0c..45244cf9a6 100644 --- a/src/Cluster/DynamicMatrix.h +++ b/src/Cluster/DynamicMatrix.h @@ -2,6 +2,7 @@ #define INC_CLUSTER_DYNAMICMATRIX_H #include #include "../Matrix.h" +#include "../CpptrajStdio.h" // DEBUG namespace Cpptraj { namespace Cluster { @@ -10,7 +11,8 @@ class DynamicMatrix { public: DynamicMatrix() {} /// Indicate given row/col should be ignored. - void Ignore(int row) { ignore_[row] = true; } + //void Ignore(int row) { ignore_[row] = true; } + inline void Ignore(int); /// \return true if given row/col has been ignored. bool IgnoringRow(int row) const { return ignore_[row]; } /// \return Original number of rows in matrix @@ -25,15 +27,23 @@ class DynamicMatrix { inline double GetCdist(int c, int r) const { return Mat_.element(c,r); } /// Print all matrix elements to STDOUT void PrintElements() const; + /// Print matrix elements to STDOUT as a square + void PrintElementsSquare() const; + /// Print closest index to each col to STDOUT + void PrintClosest() const; /// Add given element to matrix. - int AddCdist(double d) { return Mat_.addElement((float)d); } + //int AddCdist(double d) { return Mat_.addElement((float)d); } /// Set element at column/row to given value - void SetCdist(int col, int row, double val) { Mat_.setElement(col, row, val); } + //void SetCdist(int col, int row, double val) { Mat_.setElement(col, row, val); } + inline void SetCdist(int, int, float); /// Set up matrix for given number of rows int SetupMatrix(size_t); private: + inline void updateClosestIdx(unsigned int); + Matrix Mat_; ///< Upper-triangle matrix holding cluster distances. std::vector ignore_; ///< If true, ignore the row/col when printing/searching etc. + std::vector closestIdx_; ///< For each cluster, index of the cluster closest to it # ifdef _OPENMP std::vector minRow_; std::vector minCol_; @@ -41,6 +51,109 @@ class DynamicMatrix { # endif }; +/** Update closest to idx */ +void DynamicMatrix::updateClosestIdx(unsigned int idx) { + closestIdx_[idx] = -1; + float current_closest = 0; + for (unsigned int jdx = 0; jdx != closestIdx_.size(); jdx++) { + if (!ignore_[jdx] && idx != jdx) { + if (closestIdx_[idx] == -1) { + closestIdx_[idx] = jdx; + current_closest = Mat_.element(idx, jdx); + } else { + float fdist = Mat_.element(idx, jdx); + if (fdist < current_closest) { + closestIdx_[idx] = jdx; + current_closest = fdist; + } + } + } + } // END loop jdx + mprintf("DEBUG: New closest to %u is %i\n", idx, closestIdx_[idx]); +} + +/** Set cluster distance. Record closest distance for each col. */ +void DynamicMatrix::SetCdist(int col, int row, float val) { + mprintf("DEBUG: Setting cluster %i to cluster %i distance= %f\n", col, row, val); + + int update_col = -1; + int update_row = -1; + + // Check col->row distance + if (closestIdx_[col] < 0) { + mprintf("DEBUG: This is the initial distance for col.\n"); + closestIdx_[col] = row; + } else { + float closest_dist_to_col = Mat_.element(col, closestIdx_[col]); + mprintf("DEBUG: Current closest to %i is %i (%f)\n", col, closestIdx_[col], closest_dist_to_col); + if (val < closest_dist_to_col) { + mprintf("DEBUG: Updated col.\n"); + closestIdx_[col] = row; + } else if (row == closestIdx_[col] && val > closest_dist_to_col) { + // We are increasing what was previously the closest distance. May need to find new closest distance to col. + update_col = col; + } + } + + // Check row->col distance + if (closestIdx_[row] < 0) { + mprintf("DEBUG: This is the initial distance for row.\n"); + closestIdx_[row] = col; + } else { + float closest_dist_to_row = Mat_.element(row, closestIdx_[row]); + mprintf("DEBUG: Current closest to %i is %i (%f)\n", row, closestIdx_[row], closest_dist_to_row); + if (val < closest_dist_to_row) { + mprintf("DEBUG: Updated row.\n"); + closestIdx_[row] = col; + } else if (col == closestIdx_[row] && val > closest_dist_to_row) { + // We are increasing what was previously the closest distance. May need to find new closest distance to row. + update_row = row; + } + } + + // Actually update the element + Mat_.setElement(col, row, val); + + // Update closest if necessary + if (update_col != -1) { + mprintf("DEBUG: Closest to column %i increased. Need to update.\n", update_col); + updateClosestIdx(update_col); + } + if (update_row != -1) { + mprintf("DEBUG: Closest to row %i increased. Need to update.\n", update_row); + updateClosestIdx(update_row); + } +} + +/** Specify given row should be ignored. Update closest distances if necessary. */ +void DynamicMatrix::Ignore(int row) { + ignore_[row] = true; + // If any remaining indices have the ignored row that needs to be updated + for (unsigned int idx = 0; idx != closestIdx_.size(); idx++) { + if (!ignore_[idx] && closestIdx_[idx] == row) { + mprintf("DEBUG: closest to %u was just ignored; must be updated.\n", idx); + updateClosestIdx(idx); +/* closestIdx_[idx] = -1; + float current_closest = 0; + for (unsigned int jdx = 0; jdx != closestIdx_.size(); jdx++) { + if (!ignore_[jdx] && idx != jdx) { + if (closestIdx_[idx] == -1) { + closestIdx_[idx] = jdx; + current_closest = Mat_.element(idx, jdx); + } else { + float fdist = Mat_.element(idx, jdx); + if (fdist < current_closest) { + closestIdx_[idx] = jdx; + current_closest = fdist; + } + } + } + } // END loop jdx + mprintf("DEBUG: New closest to %u is %i\n", idx, closestIdx_[idx]);*/ + } + } // END loop idx +} + } } #endif From 2dd9db60ae70940fc135458e5a60e4bd118cdfda Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Fri, 28 Jan 2022 11:05:11 -0500 Subject: [PATCH 14/23] FindMin needs to ensure iOut < jOut --- src/Cluster/DynamicMatrix.cpp | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/Cluster/DynamicMatrix.cpp b/src/Cluster/DynamicMatrix.cpp index 788f045359..022890980a 100644 --- a/src/Cluster/DynamicMatrix.cpp +++ b/src/Cluster/DynamicMatrix.cpp @@ -47,6 +47,7 @@ double Cpptraj::Cluster::DynamicMatrix::FindMin(int& iOut, int& jOut) { } #else double Cpptraj::Cluster::DynamicMatrix::FindMin(int& iOut, int& jOut) const { +/* float min = std::numeric_limits::max(); for (unsigned int row = 0; row != Mat_.Nrows(); row++) { if (!ignore_[row]) { @@ -61,7 +62,7 @@ double Cpptraj::Cluster::DynamicMatrix::FindMin(int& iOut, int& jOut) const { } } } - mprintf("DEBUG: Min found at row %i col %i (%f)\n", iOut, jOut, min); + mprintf("DEBUG: Min found at row %i col %i (%f)\n", iOut, jOut, min);*/ float currentMin = std::numeric_limits::max(); @@ -79,6 +80,16 @@ double Cpptraj::Cluster::DynamicMatrix::FindMin(int& iOut, int& jOut) const { } } } + // Ensure iOut < jOut + if (minRow < minCol) { + iOut = minRow; + jOut = minCol; + } else { + iOut = minCol; + jOut = minRow; + } + return (double)currentMin; +/* if (minCol < minRow) { //FIXME i think only needed for testing int itmp = minCol; minCol = minRow; @@ -89,7 +100,7 @@ double Cpptraj::Cluster::DynamicMatrix::FindMin(int& iOut, int& jOut) const { if (iOut != minRow || jOut != minCol) mprintf("Error: Column/row mismatch.\n"); - return (double)min; + return (double)min;*/ } #endif From f2be4d35ce024bb705eb4af71bdad00df247989f Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Fri, 28 Jan 2022 11:10:52 -0500 Subject: [PATCH 15/23] Disable some debug info --- src/Cluster/Algorithm_HierAgglo.cpp | 14 +++++++++----- src/Cluster/DynamicMatrix.cpp | 5 +++-- src/Cluster/DynamicMatrix.h | 24 ++++++++++++------------ 3 files changed, 24 insertions(+), 19 deletions(-) diff --git a/src/Cluster/Algorithm_HierAgglo.cpp b/src/Cluster/Algorithm_HierAgglo.cpp index b247f4285e..3e86d1597e 100644 --- a/src/Cluster/Algorithm_HierAgglo.cpp +++ b/src/Cluster/Algorithm_HierAgglo.cpp @@ -133,8 +133,10 @@ int Cpptraj::Cluster::Algorithm_HierAgglo::DoClustering(List& clusters, } } // DEBUG print closest indices - ClusterDistances_.PrintElementsSquare(); - ClusterDistances_.PrintClosest(); + if (debug_ > 0) { + if (debug_ > 1) ClusterDistances_.PrintElementsSquare(); + ClusterDistances_.PrintClosest(); + } //InitializeClusterDistances(); // DEBUG - print initial clusters if (debug_ > 1) @@ -151,10 +153,12 @@ int Cpptraj::Cluster::Algorithm_HierAgglo::DoClustering(List& clusters, break; } if (clusters.Nclusters() == 1) clusteringComplete = true; // Sanity check -// cluster_progress.Update( iterations++ ); // FIXME reenable + cluster_progress.Update( iterations++ ); // DEBUG print closest indices - ClusterDistances_.PrintElementsSquare(); - ClusterDistances_.PrintClosest(); + if (debug_ > 0) { + if (debug_ > 1) ClusterDistances_.PrintElementsSquare(); + ClusterDistances_.PrintClosest(); + } } mprintf("\tCompleted after %i iterations, %u clusters.\n",iterations, clusters.Nclusters()); diff --git a/src/Cluster/DynamicMatrix.cpp b/src/Cluster/DynamicMatrix.cpp index 022890980a..c445a39e56 100644 --- a/src/Cluster/DynamicMatrix.cpp +++ b/src/Cluster/DynamicMatrix.cpp @@ -104,6 +104,7 @@ double Cpptraj::Cluster::DynamicMatrix::FindMin(int& iOut, int& jOut) const { } #endif +/** For DEBUG. Print all non-ignored matrix elements. */ void Cpptraj::Cluster::DynamicMatrix::PrintElements() const { unsigned int iVal = 0; unsigned int jVal = 1; @@ -119,7 +120,7 @@ void Cpptraj::Cluster::DynamicMatrix::PrintElements() const { } } -/** Print elements like a square. */ +/** For DEBUG. Print elements like a square. */ void Cpptraj::Cluster::DynamicMatrix::PrintElementsSquare() const { /* mprintf("%4s", ""); for (unsigned int col = 0; col < Mat_.Ncols(); col++) @@ -149,7 +150,7 @@ void Cpptraj::Cluster::DynamicMatrix::PrintElementsSquare() const { } } -/** Print closest indices to STDOUT. */ +/** For DEBUG. Print closest indices to STDOUT. */ void Cpptraj::Cluster::DynamicMatrix::PrintClosest() const { for (unsigned int idx = 0; idx != closestIdx_.size(); idx++) { if (!ignore_[idx]) { diff --git a/src/Cluster/DynamicMatrix.h b/src/Cluster/DynamicMatrix.h index 45244cf9a6..e65c3a237c 100644 --- a/src/Cluster/DynamicMatrix.h +++ b/src/Cluster/DynamicMatrix.h @@ -2,7 +2,7 @@ #define INC_CLUSTER_DYNAMICMATRIX_H #include #include "../Matrix.h" -#include "../CpptrajStdio.h" // DEBUG +//#incl ude "../CpptrajStdio.h" // DEBUG namespace Cpptraj { namespace Cluster { @@ -69,25 +69,25 @@ void DynamicMatrix::updateClosestIdx(unsigned int idx) { } } } // END loop jdx - mprintf("DEBUG: New closest to %u is %i\n", idx, closestIdx_[idx]); +// mprintf("DEBUG: New closest to %u is %i\n", idx, closestIdx_[idx]); } /** Set cluster distance. Record closest distance for each col. */ void DynamicMatrix::SetCdist(int col, int row, float val) { - mprintf("DEBUG: Setting cluster %i to cluster %i distance= %f\n", col, row, val); +// mprintf("DEBUG: Setting cluster %i to cluster %i distance= %f\n", col, row, val); int update_col = -1; int update_row = -1; // Check col->row distance if (closestIdx_[col] < 0) { - mprintf("DEBUG: This is the initial distance for col.\n"); +// mprintf("DEBUG: This is the initial distance for col.\n"); closestIdx_[col] = row; } else { float closest_dist_to_col = Mat_.element(col, closestIdx_[col]); - mprintf("DEBUG: Current closest to %i is %i (%f)\n", col, closestIdx_[col], closest_dist_to_col); +// mprintf("DEBUG: Current closest to %i is %i (%f)\n", col, closestIdx_[col], closest_dist_to_col); if (val < closest_dist_to_col) { - mprintf("DEBUG: Updated col.\n"); +// mprintf("DEBUG: Updated col.\n"); closestIdx_[col] = row; } else if (row == closestIdx_[col] && val > closest_dist_to_col) { // We are increasing what was previously the closest distance. May need to find new closest distance to col. @@ -97,13 +97,13 @@ void DynamicMatrix::SetCdist(int col, int row, float val) { // Check row->col distance if (closestIdx_[row] < 0) { - mprintf("DEBUG: This is the initial distance for row.\n"); +// mprintf("DEBUG: This is the initial distance for row.\n"); closestIdx_[row] = col; } else { float closest_dist_to_row = Mat_.element(row, closestIdx_[row]); - mprintf("DEBUG: Current closest to %i is %i (%f)\n", row, closestIdx_[row], closest_dist_to_row); +// mprintf("DEBUG: Current closest to %i is %i (%f)\n", row, closestIdx_[row], closest_dist_to_row); if (val < closest_dist_to_row) { - mprintf("DEBUG: Updated row.\n"); +// mprintf("DEBUG: Updated row.\n"); closestIdx_[row] = col; } else if (col == closestIdx_[row] && val > closest_dist_to_row) { // We are increasing what was previously the closest distance. May need to find new closest distance to row. @@ -116,11 +116,11 @@ void DynamicMatrix::SetCdist(int col, int row, float val) { // Update closest if necessary if (update_col != -1) { - mprintf("DEBUG: Closest to column %i increased. Need to update.\n", update_col); +// mprintf("DEBUG: Closest to column %i increased. Need to update.\n", update_col); updateClosestIdx(update_col); } if (update_row != -1) { - mprintf("DEBUG: Closest to row %i increased. Need to update.\n", update_row); +// mprintf("DEBUG: Closest to row %i increased. Need to update.\n", update_row); updateClosestIdx(update_row); } } @@ -131,7 +131,7 @@ void DynamicMatrix::Ignore(int row) { // If any remaining indices have the ignored row that needs to be updated for (unsigned int idx = 0; idx != closestIdx_.size(); idx++) { if (!ignore_[idx] && closestIdx_[idx] == row) { - mprintf("DEBUG: closest to %u was just ignored; must be updated.\n", idx); +// mprintf("DEBUG: closest to %u was just ignored; must be updated.\n", idx); updateClosestIdx(idx); /* closestIdx_[idx] = -1; float current_closest = 0; From 44073b6070a3d5cade91fac87ef8d31070b8546f Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Fri, 28 Jan 2022 12:33:36 -0500 Subject: [PATCH 16/23] Bench hieragglo --- devtools/benchmark/cluster/cluster.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/devtools/benchmark/cluster/cluster.sh b/devtools/benchmark/cluster/cluster.sh index 8b8a9d9c5d..5ce6d4b337 100755 --- a/devtools/benchmark/cluster/cluster.sh +++ b/devtools/benchmark/cluster/cluster.sh @@ -54,10 +54,11 @@ EOF ((SPLIT3 = $NFRAMES * 3)) echo "Total $TOTAL, Each section $NFRAMES splits $SPLIT1 $SPLIT2 $SPLIT3" +# kmeans clusters 5 randompoint kseed 42 \ cat >> cpptraj.in < Date: Fri, 28 Jan 2022 12:45:08 -0500 Subject: [PATCH 17/23] Remove old code and openmp code. Finding min is no longer the bottleneck --- src/Cluster/DynamicMatrix.cpp | 84 ----------------------------------- src/Cluster/DynamicMatrix.h | 69 +++++++++------------------- 2 files changed, 20 insertions(+), 133 deletions(-) diff --git a/src/Cluster/DynamicMatrix.cpp b/src/Cluster/DynamicMatrix.cpp index c445a39e56..6de54041bb 100644 --- a/src/Cluster/DynamicMatrix.cpp +++ b/src/Cluster/DynamicMatrix.cpp @@ -1,70 +1,10 @@ #include // float max -#ifdef _OPENMP -#include -#endif #include "DynamicMatrix.h" #include "../CpptrajStdio.h" // DynamicMatrix::FindMin() /** Find the minimum not being ignored; set corresponding row and column. */ -#ifdef _OPENMP -double Cpptraj::Cluster::DynamicMatrix::FindMin(int& iOut, int& jOut) { - int row, mythread; - int nrows = (int)Mat_.Nrows(); -# pragma omp parallel private(row, mythread) - { - mythread = omp_get_thread_num(); - - minVal_[mythread] = std::numeric_limits::max(); -# pragma omp for schedule(dynamic) - for (row = 0; row < nrows; row++) { - if (!ignore_[row]) { - int col = row + 1; - unsigned int idx = Mat_.CalcIndex(col, row); /// idx is start of this row - for (; col != nrows; col++, idx++) { - if (!ignore_[col]) { - if (Mat_[idx] < minVal_[mythread]) { - minVal_[mythread] = Mat_[idx]; - minRow_[mythread] = row; - minCol_[mythread] = col; - } - } - } - } - } - } /* END pragma omp parallel */ - float min = minVal_[0]; - iOut = minRow_[0]; - jOut = minCol_[0]; - for (unsigned int idx = 1; idx != minVal_.size(); idx++) { - if (minVal_[idx] < min) { - min = minVal_[idx]; - iOut = minRow_[idx]; - jOut = minCol_[idx]; - } - } - return (double)min; -} -#else double Cpptraj::Cluster::DynamicMatrix::FindMin(int& iOut, int& jOut) const { -/* - float min = std::numeric_limits::max(); - for (unsigned int row = 0; row != Mat_.Nrows(); row++) { - if (!ignore_[row]) { - unsigned int col = row + 1; - unsigned int idx = Mat_.CalcIndex(col, row); // idx is start of this row - for (; col != Mat_.Ncols(); col++, idx++) { - if (!ignore_[col] && Mat_[idx] < min) { - min = Mat_[idx]; - iOut = (int)row; - jOut = (int)col; - } - } - } - } - mprintf("DEBUG: Min found at row %i col %i (%f)\n", iOut, jOut, min);*/ - - float currentMin = std::numeric_limits::max(); int minRow = -1; int minCol = -1; @@ -89,20 +29,7 @@ double Cpptraj::Cluster::DynamicMatrix::FindMin(int& iOut, int& jOut) const { jOut = minRow; } return (double)currentMin; -/* - if (minCol < minRow) { //FIXME i think only needed for testing - int itmp = minCol; - minCol = minRow; - minRow = itmp; - } - mprintf("DEBUG: Second try: Min found at row %i col %i (%f)\n", minRow, minCol, currentMin); - - if (iOut != minRow || jOut != minCol) - mprintf("Error: Column/row mismatch.\n"); - - return (double)min;*/ } -#endif /** For DEBUG. Print all non-ignored matrix elements. */ void Cpptraj::Cluster::DynamicMatrix::PrintElements() const { @@ -182,16 +109,5 @@ int Cpptraj::Cluster::DynamicMatrix::SetupMatrix(size_t sizeIn) { if (Mat_.resize( 0L, sizeIn )) return 1; ignore_.assign( sizeIn, false ); closestIdx_.assign( sizeIn, -1 ); -# ifdef _OPENMP - int n_threads = 0; -# pragma omp parallel - { - if (omp_get_thread_num() == 0) - n_threads = omp_get_num_threads(); - } - minRow_.resize( n_threads ); - minCol_.resize( n_threads ); - minVal_.resize( n_threads ); -# endif return 0; } diff --git a/src/Cluster/DynamicMatrix.h b/src/Cluster/DynamicMatrix.h index e65c3a237c..7f495ad63e 100644 --- a/src/Cluster/DynamicMatrix.h +++ b/src/Cluster/DynamicMatrix.h @@ -11,18 +11,13 @@ class DynamicMatrix { public: DynamicMatrix() {} /// Indicate given row/col should be ignored. - //void Ignore(int row) { ignore_[row] = true; } inline void Ignore(int); /// \return true if given row/col has been ignored. bool IgnoringRow(int row) const { return ignore_[row]; } /// \return Original number of rows in matrix size_t Nrows() const { return Mat_.Nrows(); } /// Set the row and column of the smallest element not being ignored. -# ifdef _OPENMP - double FindMin(int&, int&); -# else double FindMin(int&, int&) const; -# endif /// \return an element. inline double GetCdist(int c, int r) const { return Mat_.element(c,r); } /// Print all matrix elements to STDOUT @@ -31,45 +26,38 @@ class DynamicMatrix { void PrintElementsSquare() const; /// Print closest index to each col to STDOUT void PrintClosest() const; - /// Add given element to matrix. - //int AddCdist(double d) { return Mat_.addElement((float)d); } /// Set element at column/row to given value - //void SetCdist(int col, int row, double val) { Mat_.setElement(col, row, val); } inline void SetCdist(int, int, float); /// Set up matrix for given number of rows int SetupMatrix(size_t); private: + /// Find the closest cluster to the given cluster inline void updateClosestIdx(unsigned int); - Matrix Mat_; ///< Upper-triangle matrix holding cluster distances. - std::vector ignore_; ///< If true, ignore the row/col when printing/searching etc. + Matrix Mat_; ///< Upper-triangle matrix holding cluster distances. + std::vector ignore_; ///< If true, ignore the row/col when printing/searching etc. std::vector closestIdx_; ///< For each cluster, index of the cluster closest to it -# ifdef _OPENMP - std::vector minRow_; - std::vector minCol_; - std::vector minVal_; -# endif }; -/** Update closest to idx */ +/** Update closest to given idx */ void DynamicMatrix::updateClosestIdx(unsigned int idx) { - closestIdx_[idx] = -1; - float current_closest = 0; - for (unsigned int jdx = 0; jdx != closestIdx_.size(); jdx++) { - if (!ignore_[jdx] && idx != jdx) { - if (closestIdx_[idx] == -1) { - closestIdx_[idx] = jdx; - current_closest = Mat_.element(idx, jdx); - } else { - float fdist = Mat_.element(idx, jdx); - if (fdist < current_closest) { - closestIdx_[idx] = jdx; - current_closest = fdist; - } - } + closestIdx_[idx] = -1; + float current_closest = 0; + for (unsigned int jdx = 0; jdx != closestIdx_.size(); jdx++) { + if (!ignore_[jdx] && idx != jdx) { + if (closestIdx_[idx] == -1) { + closestIdx_[idx] = jdx; + current_closest = Mat_.element(idx, jdx); + } else { + float fdist = Mat_.element(idx, jdx); + if (fdist < current_closest) { + closestIdx_[idx] = jdx; + current_closest = fdist; } - } // END loop jdx -// mprintf("DEBUG: New closest to %u is %i\n", idx, closestIdx_[idx]); + } + } + } // END loop jdx +//mprintf("DEBUG: New closest to %u is %i\n", idx, closestIdx_[idx]); } /** Set cluster distance. Record closest distance for each col. */ @@ -133,23 +121,6 @@ void DynamicMatrix::Ignore(int row) { if (!ignore_[idx] && closestIdx_[idx] == row) { // mprintf("DEBUG: closest to %u was just ignored; must be updated.\n", idx); updateClosestIdx(idx); -/* closestIdx_[idx] = -1; - float current_closest = 0; - for (unsigned int jdx = 0; jdx != closestIdx_.size(); jdx++) { - if (!ignore_[jdx] && idx != jdx) { - if (closestIdx_[idx] == -1) { - closestIdx_[idx] = jdx; - current_closest = Mat_.element(idx, jdx); - } else { - float fdist = Mat_.element(idx, jdx); - if (fdist < current_closest) { - closestIdx_[idx] = jdx; - current_closest = fdist; - } - } - } - } // END loop jdx - mprintf("DEBUG: New closest to %u is %i\n", idx, closestIdx_[idx]);*/ } } // END loop idx } From f411deaa5136f26aaaa8622c2c206d48a52131da Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Fri, 28 Jan 2022 15:38:32 -0500 Subject: [PATCH 18/23] Check openmp --- devtools/benchmark/cluster/cluster.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/devtools/benchmark/cluster/cluster.sh b/devtools/benchmark/cluster/cluster.sh index 5ce6d4b337..25e01accf3 100755 --- a/devtools/benchmark/cluster/cluster.sh +++ b/devtools/benchmark/cluster/cluster.sh @@ -1,9 +1,9 @@ #!/bin/bash if [ -z "$CPPTRAJ" ] ; then - CPPTRAJ=`which cpptraj` - #export OMP_NUM_THREADS=12 - #CPPTRAJ=`which cpptraj.OMP` + #CPPTRAJ=`which cpptraj` + export OMP_NUM_THREADS=4 + CPPTRAJ=`which cpptraj.OMP` fi TOP=~/Cpptraj/Cpptraj-ExtendedTests/ChainA_1-268_NAD_TCL-gaff.tip3p.parm7 From abb8c015760d3d88d976dc152d92b02429ce8d9e Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Mon, 31 Jan 2022 09:58:41 -0500 Subject: [PATCH 19/23] Remove debug info --- src/Cluster/Control.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Cluster/Control.cpp b/src/Cluster/Control.cpp index 95330438fc..b8c4d84926 100644 --- a/src/Cluster/Control.cpp +++ b/src/Cluster/Control.cpp @@ -821,7 +821,6 @@ int Cpptraj::Cluster::Control::Run() { // Clustering metrics. Centroids should be up to date. DBITotal_ = ComputeDBI(clusters_, averageDist_, metrics_); - mprintf("DEBUG: DBI %f\n", DBITotal_); dbi_set_->Add(0, &DBITotal_); if (clusters_.Nclusters() > 1) { pseudoF_ = ComputePseudoF(clusters_, SSRSST_, metrics_, debug_); From 33399a4c458b8aecf4dcbbbc56da436f66a2d88c Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Mon, 31 Jan 2022 10:13:28 -0500 Subject: [PATCH 20/23] Update copyright year. Add Franz and Klaus to contributor list. --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 4bcac2c0fc..b2f55e0c39 100644 --- a/README.md +++ b/README.md @@ -46,7 +46,7 @@ CPPTRAJ see the following publication: Disclaimer and Copyright ======================== -CPPTRAJ is Copyright (c) 2010-2021 Daniel R. Roe. +CPPTRAJ is Copyright (c) 2010-2022 Daniel R. Roe. The terms for using, copying, modifying, and distributing CPPTRAJ are specified in the file LICENSE. @@ -202,6 +202,9 @@ Fixes and improvements to nucleic acid dihedral angle definitions (DihedralSearc * David S. Cerutti (Rutgers University, Piscataway, NJ, USA) Original code for the 'xtalsymm' Action. +* Franz Waibl & Klaus R. Liedl (Department of General, Inorganic, and Theoretical Chemistry, University of Innsbruck) +Improvements and enhancements for GIST. + #### Various Contributions * David A. Case (Rutgers University, Piscataway, NJ, USA) * Hai Nguyen (Rutgers University, Piscataway, NJ, USA) From 53a7926502249ef6991bd47f0df9b0b5e3969ae2 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Mon, 31 Jan 2022 10:14:02 -0500 Subject: [PATCH 21/23] Version 6.2.3. Revision bump for HA cluster speed improvement and DBI/pSF/SSRSST DataSet output. --- src/Version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Version.h b/src/Version.h index 7ca66d77d1..e7e284e3bc 100644 --- a/src/Version.h +++ b/src/Version.h @@ -12,7 +12,7 @@ * Whenever a number that precedes is incremented, all subsequent * numbers should be reset to 0. */ -#define CPPTRAJ_INTERNAL_VERSION "V6.2.2" +#define CPPTRAJ_INTERNAL_VERSION "V6.2.3" /// PYTRAJ relies on this #define CPPTRAJ_VERSION_STRING CPPTRAJ_INTERNAL_VERSION #endif From 5946e22326c5963fcd104559567d5643a4a64bf1 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Mon, 31 Jan 2022 12:52:59 -0500 Subject: [PATCH 22/23] Test cluster metrics write out --- test/Test_Cluster/RunTest.sh | 6 +++++- test/Test_Cluster/c1.cmetrics.dat.save | 2 ++ 2 files changed, 7 insertions(+), 1 deletion(-) create mode 100644 test/Test_Cluster/c1.cmetrics.dat.save diff --git a/test/Test_Cluster/RunTest.sh b/test/Test_Cluster/RunTest.sh index d422b8cf34..bf6ea5bfd5 100755 --- a/test/Test_Cluster/RunTest.sh +++ b/test/Test_Cluster/RunTest.sh @@ -7,7 +7,8 @@ CleanFiles cluster.in cnumvtime.dat avg.summary.dat summary.dat CpptrajPairDist \ cpop.agr summary2.dat Cmatrix.nccmatrix Cmatrix.cmatrix summary3.dat \ normpop.agr normframe.agr cascii.dat.save cascii.dat pw.dat \ - cinfo.dat mysil.cluster.dat mysil.frame.dat cascii?.info + cinfo.dat mysil.cluster.dat mysil.frame.dat cascii?.info \ + c1.cmetrics.dat TESTNAME='Hierarchical agglomerative clustering tests' #Requires netcdf @@ -19,6 +20,8 @@ parm ../tz2.parm7 trajin ../tz2.crd cluster C1 :2-10 clusters 3 epsilon 4.0 out cnumvtime.dat info cinfo.dat sil mysil summary avg.summary.dat nofit savepairdist cpopvtime cpop.agr pairdist Cmatrix.cmatrix cluster crd1 :2-10 clusters 3 epsilon 4.0 summary summary.dat complete nofit loadpairdist pairdist Cmatrix.cmatrix +run +writedata c1.cmetrics.dat C1[DBI] C1[PSF] C1[SSRSST] EOF RunCpptraj "Cluster command test, in-memory pairwise distances." DoTest cnumvtime.dat.save cnumvtime.dat @@ -26,6 +29,7 @@ DoTest cinfo.dat.save cinfo.dat DoTest mysil.cluster.dat.save mysil.cluster.dat DoTest mysil.frame.dat.save mysil.frame.dat DoTest avg.summary.dat.save avg.summary.dat +DoTest c1.cmetrics.dat.save c1.cmetrics.dat DoTest summary.dat.save summary.dat DoTest cpop.agr.save cpop.agr diff --git a/test/Test_Cluster/c1.cmetrics.dat.save b/test/Test_Cluster/c1.cmetrics.dat.save new file mode 100644 index 0000000000..7b25ab2b68 --- /dev/null +++ b/test/Test_Cluster/c1.cmetrics.dat.save @@ -0,0 +1,2 @@ +#Frame C1[DBI] C1[PSF] C1[SSRSST] + 1 1.0542 31.6034 0.7332 From c58e00336c2ce75478a63a65ecf759a9aa8af1bf Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Mon, 31 Jan 2022 12:53:11 -0500 Subject: [PATCH 23/23] Allocate cluster metric sets last to try to fix pytraj --- src/Cluster/Control.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/Cluster/Control.cpp b/src/Cluster/Control.cpp index b8c4d84926..a2957c7eb2 100644 --- a/src/Cluster/Control.cpp +++ b/src/Cluster/Control.cpp @@ -491,14 +491,6 @@ int Cpptraj::Cluster::Control::SetupClustering(DataSetList const& setsToCluster, if (cnumvtime_ == 0) return 1; if (cnumvtimefile != 0) cnumvtimefile->AddDataSet( cnumvtime_ ); - // DBI and pSF data sets - dbi_set_ = DSL.AddSet(DataSet::DOUBLE, MetaData(dsname_, "DBI")); - if (dbi_set_ == 0) return 1; - psf_set_ = DSL.AddSet(DataSet::DOUBLE, MetaData(dsname_, "PSF")); - if (psf_set_ == 0) return 1; - ssrsst_set_ = DSL.AddSet(DataSet::DOUBLE, MetaData(dsname_, "SSRSST")); - if (ssrsst_set_ == 0) return 1; - // If cache was allocated, add to the DataSetList so it is after cnumvtime for pytraj if (metrics_.CacheWasAllocated()) DSL.AddSet( metrics_.CachePtr() ); @@ -514,6 +506,14 @@ int Cpptraj::Cluster::Control::SetupClustering(DataSetList const& setsToCluster, clustersvtimefile->AddDataSet( clustersVtime_ ); } + // DBI and pSF data sets + dbi_set_ = DSL.AddSet(DataSet::DOUBLE, MetaData(dsname_, "DBI")); + if (dbi_set_ == 0) return 1; + psf_set_ = DSL.AddSet(DataSet::DOUBLE, MetaData(dsname_, "PSF")); + if (psf_set_ == 0) return 1; + ssrsst_set_ = DSL.AddSet(DataSet::DOUBLE, MetaData(dsname_, "SSRSST")); + if (ssrsst_set_ == 0) return 1; + return 0; }