diff --git a/.travis.yml b/.travis.yml
index d63a5af3f34..23a7a93bb20 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -40,7 +40,7 @@ before_install:
- conda config --add channels MDAnalysis
- conda update --yes conda
install:
- - if [[ $SETUP == 'full' ]]; then conda create --yes -q -n pyenv python=$PYTHON_VERSION numpy scipy nose=1.3.7 sphinx=1.3 griddataformats six; fi
+ - if [[ $SETUP == 'full' ]]; then conda create --yes -q -n pyenv python=$PYTHON_VERSION numpy scipy nose=1.3.7 sphinx=1.3 griddataformats six scikit-learn; fi
- if [[ $SETUP == 'minimal' ]]; then conda create --yes -q -n pyenv python=$PYTHON_VERSION numpy nose=1.3.7 sphinx=1.3 griddataformats six; fi
- source activate pyenv
- |
diff --git a/package/MDAnalysis/analysis/encore/__init__.py b/package/MDAnalysis/analysis/encore/__init__.py
new file mode 100644
index 00000000000..33ff62513d5
--- /dev/null
+++ b/package/MDAnalysis/analysis/encore/__init__.py
@@ -0,0 +1,36 @@
+# __init__.py
+# Copyright (C) 2014 Wouter Boomsma, Matteo Tiberti
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+
+from __future__ import absolute_import
+
+__all__ = [
+ 'covariance',
+ 'similarity',
+ 'confdistmatrix',
+ 'clustering'
+]
+
+from .similarity import hes, ces, dres, \
+ ces_convergence, dres_convergence
+
+from .clustering.ClusterCollection import ClusterCollection, Cluster
+from .clustering.ClusteringMethod import *
+from .clustering.cluster import cluster
+from .dimensionality_reduction.DimensionalityReductionMethod import *
+from .dimensionality_reduction.reduce_dimensionality import (
+ reduce_dimensionality)
+from .confdistmatrix import get_distance_matrix
+from .utils import merge_universes
diff --git a/package/MDAnalysis/analysis/encore/bootstrap.py b/package/MDAnalysis/analysis/encore/bootstrap.py
new file mode 100644
index 00000000000..91841f76dde
--- /dev/null
+++ b/package/MDAnalysis/analysis/encore/bootstrap.py
@@ -0,0 +1,153 @@
+# bootstrap.py --- Bootstrap analysis for ensembles and distance matrices
+# Copyright (C) 2014 Wouter Boomsma, Matteo Tiberti
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+
+"""
+bootstrap procedures --- :mod:`MDAnalysis.analysis.ensemble.bootstrap`
+=====================================================================
+
+
+The module contains functions for bootstrapping either ensembles (Universe
+objects) or distance matrices, by resampling with replacement.
+
+:Author: Matteo Tiberti, Wouter Boomsma, Tone Bengtsen
+:Year: 2015--2016
+:Copyright: GNU Public License v3
+:Mantainer: Matteo Tiberti , mtiberti on github
+
+.. versionadded:: 0.16.0
+
+"""
+
+import numpy as np
+import logging
+import MDAnalysis as mda
+from .utils import TriangularMatrix, ParallelCalculation
+
+
+def bootstrapped_matrix(matrix, ensemble_assignment):
+ """
+ Bootstrap an input square matrix. The resulting matrix will have the same
+ shape as the original one, but the order of its elements will be drawn
+ (with repetition). Separately bootstraps each ensemble.
+
+ Parameters
+ ----------
+
+ matrix : encore.utils.TriangularMatrix
+ similarity/dissimilarity matrix
+
+ ensemble_assignment: numpy.array
+ array of ensemble assignments. This array must be matrix.size long.
+
+ Returns
+ -------
+
+ this_m : encore.utils.TriangularMatrix
+ bootstrapped similarity/dissimilarity matrix
+ """
+ ensemble_identifiers = np.unique(ensemble_assignment)
+ this_m = TriangularMatrix(size=matrix.size)
+ indexes = []
+ for ens in ensemble_identifiers:
+ old_indexes = np.where(ensemble_assignment == ens)[0]
+ indexes.append(np.random.randint(low=np.min(old_indexes),
+ high=np.max(old_indexes) + 1,
+ size=old_indexes.shape[0]))
+
+ indexes = np.hstack(indexes)
+ for j in range(this_m.size):
+ for k in range(j):
+ this_m[j, k] = matrix[indexes[j], indexes[k]]
+
+ logging.info("Matrix bootstrapped.")
+ return this_m
+
+
+def get_distance_matrix_bootstrap_samples(distance_matrix,
+ ensemble_assignment,
+ samples=100,
+ ncores=1):
+ """
+ Calculates distance matrices corresponding to bootstrapped ensembles, by
+ resampling with replacement.
+
+ Parameters
+ ----------
+
+ distance_matrix : encore.utils.TriangularMatrix
+ Conformational distance matrix
+
+ ensemble_assignment : str
+ Mapping from frames to which ensemble they are from (necessary because
+ ensembles are bootstrapped independently)
+
+ samples : int, optional
+ How many bootstrap samples to create.
+
+ ncores : int, optional
+ Maximum number of cores to be used (default is 1)
+
+ Returns
+ -------
+
+ confdistmatrix : list of encore.utils.TriangularMatrix
+ """
+
+ bs_args = \
+ [([distance_matrix, ensemble_assignment]) for i in range(samples)]
+
+ pc = ParallelCalculation(ncores, bootstrapped_matrix, bs_args)
+
+ pc_results = pc.run()
+
+ bootstrap_matrices = zip(*pc_results)[1]
+
+ return bootstrap_matrices
+
+
+def get_ensemble_bootstrap_samples(ensemble,
+ samples=100):
+ """
+ Generates a bootstrapped ensemble by resampling with replacement.
+
+ Parameters
+ ----------
+
+ ensemble : MDAnalysis.Universe
+ Conformational distance matrix
+
+ samples : int, optional
+ How many bootstrap samples to create.
+
+ Returns
+ -------
+
+ list of MDAnalysis.Universe objects
+ """
+
+ ensemble.transfer_to_memory()
+
+ ensembles = []
+ for i in range(samples):
+ indices = np.random.randint(
+ low=0,
+ high=ensemble.trajectory.timeseries().shape[1],
+ size=ensemble.trajectory.timeseries().shape[1])
+ ensembles.append(
+ mda.Universe(ensemble.filename,
+ ensemble.trajectory.timeseries(format='afc')[:,indices,:],
+ format=mda.coordinates.memory.MemoryReader))
+ return ensembles
\ No newline at end of file
diff --git a/package/MDAnalysis/analysis/encore/clustering/ClusterCollection.py b/package/MDAnalysis/analysis/encore/clustering/ClusterCollection.py
new file mode 100644
index 00000000000..4e2f3c00271
--- /dev/null
+++ b/package/MDAnalysis/analysis/encore/clustering/ClusterCollection.py
@@ -0,0 +1,252 @@
+# Cluster.py --- classes to handle results of clustering runs
+# Copyright (C) 2014 Wouter Boomsma, Matteo Tiberti
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+
+"""
+Cluster representation --- :mod:`MDAnalysis.analysis.encore.clustering.ClusterCollection`
+=====================================================================
+
+The module contains the Cluster and ClusterCollection classes which are
+designed to store results from clustering algorithms.
+
+:Author: Matteo Tiberti, Wouter Boomsma, Tone Bengtsen
+:Year: 2015--2016
+:Copyright: GNU Public License v3
+:Mantainer: Matteo Tiberti , mtiberti on github
+
+.. versionadded:: 0.16.0
+
+"""
+
+import numpy as np
+import six
+
+
+class Cluster(object):
+ """
+ Generic Cluster class for clusters with centroids.
+
+ Attributes
+ ----------
+
+ id : int
+ Cluster ID number. Useful for the ClustersCollection class
+
+ metadata : iterable
+ dict of lists or numpy.array, containing metadata for the cluster
+ elements. The iterable must return the same number of elements as
+ those that belong to the cluster.
+
+ size : int
+ number of elements.
+
+ centroid : element object
+ cluster centroid.
+
+ elements : numpy.array
+ array containing the cluster elements.
+ """
+
+ def __init__(self, elem_list=None, centroid=None, idn=None, metadata=None):
+ """Class constructor. If elem_list is None, an empty cluster is created
+ and the remaining arguments ignored.
+
+ Parameters
+ ----------
+
+ elem_list : numpy.array or None
+ numpy array of cluster elements
+
+ centroid : None or element object
+ centroid
+
+ idn : int
+ cluster ID
+
+ metadata : iterable
+ metadata, one value for each cluster element. The iterable
+ must have the same length as the elements array.
+
+ """
+
+ self.id = idn
+
+ if elem_list is None:
+ self.size = 0
+ self.elements = np.array([])
+ self.centroid = None
+ self.metadata = {}
+ return
+
+ self.metadata = {}
+ self.elements = elem_list
+ if centroid not in self.elements:
+ raise LookupError("Centroid of cluster not found in the element list")
+
+ self.centroid = centroid
+ self.size = self.elements.shape[0]
+ if metadata:
+ for name, data in six.iteritems(metadata):
+ if len(data) != self.size:
+ raise TypeError("Size of metadata having label \"{0}\"\
+is not equal to the number of cluster elmements".format(name))
+ self.add_metadata(name, data)
+
+ def __iter__(self):
+ """
+ Iterate over elements in cluster
+ """
+ return iter(self.elements)
+
+ def __len__(self):
+ """
+ Size of cluster
+ """
+ return len(self.elements)
+
+ def add_metadata(self, name, data):
+ if len(data) != self.size:
+ raise TypeError("Size of metadata is not equal to the number of\
+ cluster elmements")
+ self.metadata[name] = np.array(data)
+
+ def __repr__(self):
+ """
+ Textual representation
+ """
+ out = repr(self.elements)
+ return out
+
+
+class ClusterCollection(object):
+ """Clusters collection class; this class represents the results of a full
+ clustering run. It stores a group of clusters defined as
+ encore.clustering.Cluster objects.
+
+ Attributes
+ ----------
+
+ clusters : list
+ list of of Cluster objects which are part of the Cluster collection
+
+"""
+
+ def __init__(self, elements=None, metadata=None):
+ """Class constructor. If elements is None, an empty cluster collection
+ will be created. Otherwise, the constructor takes as input an
+ iterable of ints, for instance:
+
+ [ a, a, a, a, b, b, b, c, c, ... , z, z ]
+
+ the variables a,b,c,...,z are cluster centroids, here as cluster
+ element numbers (i.e. 3 means the 4th element of the ordered input
+ for clustering). The array maps a correspondence between
+ cluster elements (which are implicitly associated with the
+ position in the array) with centroids, i. e. defines clusters.
+ For instance:
+
+ [ 1, 1, 1, 4, 4, 5 ]
+
+ means that elements 0, 1, 2 form a cluster which has 1 as centroid,
+ elements 3 and 4 form a cluster which has 4 as centroid, and
+ element 5 has its own cluster.
+
+
+ Arguments
+ ---------
+
+ elements : iterable of ints or None
+ clustering results. See the previous description for details
+
+ metadata : {str:list, str:list,...} or None
+ metadata for the data elements. The list must be of the same
+ size as the elements array, with one value per element.
+
+ """
+ idn = 0
+ if elements is None:
+ self.clusters = None
+ return
+
+ if not len(set(map(type, elements))) == 1:
+ raise TypeError("all the elements must have the same type")
+ self.clusters = []
+ elements_array = np.array(elements)
+ centroids = np.unique(elements_array)
+ for i in centroids:
+ if elements[i] != i:
+ raise ValueError("element {0}, which is a centroid, doesn't \
+belong to its own cluster".format(elements[i]))
+ for c in centroids:
+ this_metadata = {}
+ this_array = np.where(elements_array == c)
+ if metadata:
+ for k, v in six.iteritems(metadata):
+ this_metadata[k] = np.asarray(v)[this_array]
+ self.clusters.append(
+ Cluster(elem_list=this_array[0], idn=idn, centroid=c,
+ metadata=this_metadata))
+
+ idn += 1
+
+ def get_ids(self):
+ """
+ Get the ID numbers of the clusters
+
+ Returns
+ -------
+
+ ids : list of int
+ list of cluster ids
+ """
+ return [v.idn for v in self.clusters]
+
+ def get_centroids(self):
+ """
+ Get the centroids of the clusters
+
+ Returns
+ -------
+
+ centroids : list of cluster element objects
+ list of cluster centroids
+ """
+
+ return [v.centroid for v in self.clusters]
+
+ def __iter__(self):
+ """
+ Iterate over clusters
+
+ """
+ return iter(self.clusters)
+
+ def __len__(self):
+ """
+ Length of clustering collection
+ """
+ return len(self.clusters)
+
+ def __repr__(self):
+ """
+ Textual representation
+ """
+ out = ""
+ for cluster in self.clusters:
+ out += "{0} (size:{1},centroid:{2}): {3}\n".format(cluster.id,
+ len(cluster),
+ cluster.centroid,
+ repr(cluster))
+ return out
diff --git a/package/MDAnalysis/analysis/encore/clustering/ClusteringMethod.py b/package/MDAnalysis/analysis/encore/clustering/ClusteringMethod.py
new file mode 100644
index 00000000000..300b91e1b50
--- /dev/null
+++ b/package/MDAnalysis/analysis/encore/clustering/ClusteringMethod.py
@@ -0,0 +1,425 @@
+# ClusteringMethod.py --- Interface classes to various clustering algorithms
+# Copyright (C) 2014 Wouter Boomsma, Matteo Tiberti
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+
+"""
+clustering frontend --- :mod:`MDAnalysis.analysis.encore.clustering.ClusteringMethod`
+=====================================================================
+
+The module defines classes for interfacing to various clustering algorithms.
+One has been implemented natively, and will always be available, while
+others are available only if scikit-learn is installed
+
+:Author: Matteo Tiberti, Wouter Boomsma, Tone Bengtsen
+:Year: 2015--2016
+:Copyright: GNU Public License v3
+:Mantainer: Matteo Tiberti , mtiberti on github
+
+.. versionadded:: 0.16.0
+
+"""
+
+import numpy as np
+import warnings
+import logging
+
+# Import native affinity propagation implementation
+from . import affinityprop
+
+# Attempt to import scikit-learn clustering algorithms
+try:
+ import sklearn.cluster
+except ImportError:
+ sklearn = None
+ msg = "sklearn.cluster could not be imported: some functionality will " \
+ "not be available in encore.fit_clusters()"
+ warnings.warn(msg, category=ImportWarning)
+ logging.warn(msg)
+ del msg
+
+
+def encode_centroid_info(clusters, cluster_centers_indices):
+ """
+ Adjust cluster indices to include centroid information
+ as described in documentation for ClusterCollection
+ """
+ values, indices = np.unique(clusters, return_inverse=True)
+ for c_center in cluster_centers_indices:
+ if clusters[c_center] != c_center:
+ values[indices[c_center]] = c_center
+ return values[indices]
+
+
+class ClusteringMethod (object):
+ """
+ Base class for any Clustering Method
+ """
+
+ # Whether the method accepts a distance matrix
+ accepts_distance_matrix=True
+
+ def __call__(self, x):
+ """
+ Parameters
+ ----------
+
+ x
+ either trajectory coordinate data (np.array) or an
+ encore.utils.TriangularMatrix, encoding the conformational
+ distance matrix
+
+ Returns
+ -------
+ numpy.array
+ list of cluster indices
+ """
+ raise NotImplementedError("Class {0} doesn't implement __call__()"
+ .format(self.__class__.__name__))
+
+
+class AffinityPropagationNative(ClusteringMethod):
+ """
+ Interface to the natively implemented Affinity propagation procedure.
+ """
+ def __init__(self,
+ damping=0.9, preference=-1.0,
+ max_iter=500, convergence_iter=50,
+ add_noise=True):
+ """
+ Parameters
+ ----------
+
+ damping : float, optional
+ Damping factor (default is 0.9). Parameter for the Affinity
+ Propagation for clustering.
+
+ preference : float, optional
+ Preference parameter used in the Affinity Propagation algorithm for
+ clustering (default -1.0). A high preference value results in
+ many clusters, a low preference will result in fewer numbers of
+ clusters.
+
+ max_iter : int, optional
+ Maximum number of iterations for affinity propagation (default is
+ 500).
+
+ convergence_iter : int, optional
+ Minimum number of unchanging iterations to achieve convergence
+ (default is 50). Parameter in the Affinity Propagation for
+ clustering.
+
+ add_noise : bool, optional
+ Apply noise to similarity matrix before running clustering
+ (default is True)
+
+ """
+ self.damping = damping
+ self.preference = preference
+ self.max_iter = max_iter
+ self.convergence_iter = convergence_iter
+ self.add_noise = add_noise
+
+ def __call__(self, distance_matrix):
+ """
+ Parameters
+ ----------
+
+ distance_matrix : encore.utils.TriangularMatrix
+ conformational distance matrix
+
+
+ Returns
+ -------
+ numpy.array
+ list of cluster indices
+ """
+ clusters = affinityprop.AffinityPropagation().run(
+ s=distance_matrix * -1., # invert sign
+ preference=self.preference,
+ lam=self.damping,
+ max_iterations = self.max_iter,
+ convergence = self.convergence_iter,
+ noise=int(self.add_noise))
+ details = {}
+ return clusters, details
+
+if sklearn:
+
+ class AffinityPropagation(ClusteringMethod):
+ """
+ Interface to the Affinity propagation clustering procedure implemented
+ in sklearn.
+ """
+
+ def __init__(self,
+ damping=0.9, preference=-1.0,
+ max_iter=500, convergence_iter=50,
+ **kwargs):
+ """
+ Parameters
+ ----------
+
+ damping : float, optional
+ Damping factor (default is 0.9). Parameter for the Affinity
+ Propagation for clustering.
+
+ preference : float, optional
+ Preference parameter used in the Affinity Propagation algorithm
+ for clustering (default -1.0). A high preference value results
+ in many clusters, a low preference will result in fewer numbers
+ of clusters.
+
+ max_iter : int, optional
+ Maximum number of iterations for affinity propagation (default
+ is 500).
+
+ convergence_iter : int, optional
+ Minimum number of unchanging iterations to achieve convergence
+ (default is 50). Parameter in the Affinity Propagation for
+ clustering.
+
+ """
+ self.ap = \
+ sklearn.cluster.AffinityPropagation(
+ damping=damping,
+ preference=preference,
+ max_iter=max_iter,
+ convergence_iter=convergence_iter,
+ affinity="precomputed",
+ **kwargs)
+
+ def __call__(self, distance_matrix):
+ """
+ Parameters
+ ----------
+
+ distance_matrix : encore.utils.TriangularMatrix
+ conformational distance matrix
+
+ Returns
+ -------
+ numpy.array
+ list of cluster indices
+
+ """
+ logging.info("Starting Affinity Propagation: {0}".format
+ (self.ap.get_params()))
+
+ # Convert from distance matrix to similarity matrix
+ similarity_matrix = distance_matrix.as_array() * -1
+ clusters = self.ap.fit_predict(similarity_matrix)
+ clusters = encode_centroid_info(clusters,
+ self.ap.cluster_centers_indices_)
+ details = {}
+ return clusters, details
+
+
+ class DBSCAN(ClusteringMethod):
+ """
+ Interface to the DBSCAN clustering procedure implemented in sklearn.
+ """
+ def __init__(self,
+ eps=0.5,
+ min_samples=5,
+ algorithm="auto",
+ leaf_size=30,
+ **kwargs):
+ """
+ Parameters
+ ----------
+
+ eps : float, optional (default = 0.5)
+ The maximum distance between two samples for them to be
+ considered as in the same neighborhood.
+
+ min_samples : int, optional (default = 5)
+ The number of samples (or total weight) in a neighborhood for
+ a point to be considered as a core point. This includes the
+ point itself.
+
+ algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
+ The algorithm to be used by the NearestNeighbors module
+ to compute pointwise distances and find nearest neighbors.
+ See NearestNeighbors module documentation for details.
+
+ leaf_size : int, optional (default = 30)
+ Leaf size passed to BallTree or cKDTree. This can affect the
+ speed of the construction and query, as well as the memory
+ required to store the tree. The optimal value depends
+ on the nature of the problem.
+
+ sample_weight : array, shape (n_samples,), optional
+ Weight of each sample, such that a sample with a weight of at
+ least ``min_samples`` is by itself a core sample; a sample with
+ negative weight may inhibit its eps-neighbor from being core.
+ Note that weights are absolute, and default to 1.
+
+ """
+
+ self.dbscan = sklearn.cluster.DBSCAN(eps=eps,
+ min_samples = min_samples,
+ algorithm=algorithm,
+ leaf_size = leaf_size,
+ metric="precomputed",
+ **kwargs)
+
+ def __call__(self, distance_matrix):
+ """
+ Parameters
+ ----------
+
+ distance_matrix : encore.utils.TriangularMatrix
+ conformational distance matrix
+
+
+ Returns
+ -------
+ numpy.array
+ list of cluster indices
+
+ """
+ logging.info("Starting DBSCAN: {0}".format(
+ self.dbscan.get_params()))
+ clusters = self.dbscan.fit_predict(distance_matrix.as_array())
+ if np.min(clusters == -1):
+ clusters += 1
+ # No centroid information is provided by DBSCAN, so we just
+ # pick random members
+ cluster_representatives = np.unique(clusters, return_index=True)[1]
+ clusters = encode_centroid_info(clusters,
+ cluster_representatives)
+ details = {}
+ return clusters, details
+
+ class KMeans(ClusteringMethod):
+
+ # Whether the method accepts a distance matrix
+ accepts_distance_matrix = False
+
+ """
+ Interface to the KMeans clustering procedure implemented in sklearn.
+ """
+ def __init__(self,
+ n_clusters,
+ max_iter = 300,
+ n_init = 10,
+ init = 'k-means++',
+ algorithm="auto",
+ tol = 1e-4,
+ verbose=False,
+ random_state=None,
+ copy_x=True,
+ n_jobs=1,
+ **kwargs):
+ """
+ Parameters
+ ----------
+ n_clusters : int
+ The number of clusters to form as well as the number of
+ centroids to generate.
+
+ max_iter : int, optional (default 300)
+ Maximum number of iterations of the k-means algorithm to run.
+
+ n_init : int, optional (default 10)
+ Number of time the k-means algorithm will be run with different
+ centroid seeds. The final results will be the best output of
+ n_init consecutive runs in terms of inertia.
+
+ init : {'k-means++', 'random', or ndarray, or a callable}, optional
+ Method for initialization, default to 'k-means++':
+ 'k-means++' : selects initial cluster centers for k-mean
+ clustering in a smart way to speed up convergence. See section
+ Notes in k_init for more details.
+ 'random': generate k centroids from a Gaussian with mean and
+ variance estimated from the data.
+ If an ndarray is passed, it should be of shape
+ (n_clusters, n_features) and gives the initial centers.
+ If a callable is passed, it should take arguments X, k and
+ and a ranndom state and return an initialization.
+
+ precompute_distances : {'auto', True, False}
+ Precompute distances (faster but takes more memory).
+ 'auto' : do not precompute distances if
+ n_samples * n_clusters > 12 million. This corresponds to about
+ 100MB overhead per job using double precision.
+ True : always precompute distances
+ False : never precompute distances
+
+ tol : float, optional (default 1e-4)
+ The relative increment in the results before declaring
+ convergence.
+
+ verbose : boolean, optional (default False)
+ Verbosity mode.
+
+ random_state : integer or numpy.RandomState, optional
+ The generator used to initialize the centers. If an integer is
+ given, it fixes the seed. Defaults to the global numpy random
+ number generator.
+
+ copy_x : boolean, optional
+ When pre-computing distances it is more numerically accurate to
+ center the data first. If copy_x is True, then the original
+ data is not modified. If False, the original data is modified,
+ and put back before the function returns, but small numerical
+ differences may be introduced by subtracting and then adding
+ the data mean.
+
+ n_jobs : int
+ The number of jobs to use for the computation. This works by
+ computing each of the n_init runs in parallel. If -1 all CPUs
+ are used. If 1 is given, no parallel computing code is used at
+ all, which is useful for debugging. For n_jobs below -1,
+ (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs
+ but one are used.
+
+ """
+ self.kmeans = sklearn.cluster.KMeans(n_clusters = n_clusters,
+ max_iter = max_iter,
+ n_init = n_init,
+ init = init,
+ precompute_distances='auto',
+ tol = tol,
+ verbose=verbose,
+ random_state=random_state,
+ copy_x=copy_x,
+ n_jobs=n_jobs,
+ **kwargs)
+
+ def __call__(self, coordinates):
+ """
+ Parameters
+ ----------
+
+ coordinates : np.array
+ trajectory atom coordinates
+
+
+ Returns
+ -------
+ numpy.array
+ list of cluster indices
+ """
+ logging.info("Starting Kmeans: {0}".format(
+ (self.kmeans.get_params())))
+ clusters = self.kmeans.fit_predict(coordinates)
+ distances = self.kmeans.transform(coordinates)
+ cluster_center_indices = np.argmin(distances, axis=0)
+ clusters = encode_centroid_info(clusters,
+ cluster_center_indices)
+ details = {}
+ return clusters, details
+
diff --git a/package/MDAnalysis/analysis/encore/clustering/__init__.py b/package/MDAnalysis/analysis/encore/clustering/__init__.py
new file mode 100644
index 00000000000..f90ef9293dc
--- /dev/null
+++ b/package/MDAnalysis/analysis/encore/clustering/__init__.py
@@ -0,0 +1,10 @@
+from . import ClusteringMethod
+from . import ClusterCollection
+
+__all__ = [
+ 'ClusterCollection.ClusterCollection',
+ 'ClusterCollection.Cluster',
+ 'ClusteringMethod.AffinityPropagationNative'
+ 'ClusteringMethod.AffinityPropagation'
+ 'ClusteringMethod.DBSCAN']
+
diff --git a/package/MDAnalysis/analysis/encore/clustering/affinityprop.pyx b/package/MDAnalysis/analysis/encore/clustering/affinityprop.pyx
new file mode 100644
index 00000000000..80bbc53b2a5
--- /dev/null
+++ b/package/MDAnalysis/analysis/encore/clustering/affinityprop.pyx
@@ -0,0 +1,140 @@
+#cython embedsignature=True
+# affinityprop.pyx --- Cython wrapper for the affinity propagation C library
+# Copyright (C) 2014 Wouter Boomsma, Matteo Tiberti
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+
+"""
+Cython wrapper for the C implementation of the Affinity Perturbation clustering algorithm.
+
+:Author: Matteo Tiberti, Wouter Boomsma, Tone Bengtsen
+:Year: 2015--2016
+:Copyright: GNU Public License v3
+:Mantainer: Matteo Tiberti , mtiberti on github
+
+"""
+from ..utils import TriangularMatrix
+import logging
+import numpy
+cimport numpy
+cimport caffinityprop
+cimport cython
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+
+cdef class AffinityPropagation(object):
+ """
+ Affinity propagation clustering algorithm. This class is a Cython wrapper around the Affinity propagation algorithm, which is implement as a C library (see ap.c). The implemented algorithm is described in the paper:
+
+ Clustering by Passing Messages Between Data Points.
+ Brendan J. Frey and Delbert Dueck, University of Toronto
+ Science 315, 972–976, February 2007
+
+ """
+
+ def run(self, s, preference, float lam, int max_iterations, int convergence, int noise=1):
+ """
+ Run the clustering algorithm.
+
+ Parameters:
+ ---------
+
+ s : encore.utils.TriangularMatrix object
+ Triangular matrix containing the similarity values for each pair of clustering elements. Notice that the current implementation does not allow for asymmetric values (i.e. similarity(a,b) is assumed to be equal to similarity(b,a))
+
+ preference : numpy.array of floats or float
+ Preference values, which the determine the number of clusters. If a single value is given, all the preference values are set to that. Otherwise, the list is used to set the preference values (one value per element, so the list must be of the same size as the number of elements)
+
+ lam : float
+ Floating point value that defines how much damping is applied to the solution at each iteration. Must be ]0,1]
+
+ max_iterations : int
+ Maximum number of iterations
+
+ convergence : int
+ Number of iterations in which the cluster centers must remain the same in order to reach convergence
+
+ noise : int
+ Whether to apply noise to the input s matrix, such there are no equal values. 1 is for yes, 0 is for no.
+
+
+ Returns:
+ ---------
+ elements : list of int or None
+ List of cluster-assigned elements, which can be used by encore.utils.ClustersCollection to generate Cluster objects. See these classes for more details.
+
+ """
+ cdef int cn = s.size
+ cdef float cpreference = preference
+
+ # Assign preference values to diagonal
+ try:
+ for i in xrange(s.size):
+ s[i,i] = preference[i]
+ except:
+ pass
+
+ if type(preference) == float:
+ for i in xrange(s.size):
+ s[i,i] = preference
+ else:
+ raise TypeError ("Preference should be of type float")
+
+ logging.info("Preference %3.2f: starting Affinity Propagation" % (preference))
+
+ # Prepare input and ouput arrays
+ cdef numpy.ndarray[numpy.float32_t, ndim=1] matndarray = numpy.ascontiguousarray(s._elements, dtype=numpy.float32)
+ cdef numpy.ndarray[long, ndim=1] clusters = numpy.zeros((s.size),dtype=long)
+
+ # run C module Affinity Propagation
+ iterations = caffinityprop.CAffinityPropagation( matndarray.data, cn, lam, max_iterations, convergence, noise, clusters.data)
+
+ # Provide warning in case of lack of convergence
+ if iterations == 0:
+ logging.info("Preference %3.2f: could not converge in %d iterations" % (preference, -iterations))
+ import warnings
+ warnings.warn("Clustering with preference {0:3.2f} did not fully converge in {1:d} iterations".format(preference, -iterations))
+
+ # Find centroids
+ centroids = numpy.unique(clusters)
+ for k in numpy.arange(centroids.shape[0]):
+ ii = numpy.where(clusters == centroids[k])[0]
+ small_mat = numpy.zeros((ii.shape[0], ii.shape[0]))
+ for ii1 in numpy.arange(ii.shape[0]):
+ for ii2 in numpy.arange(ii.shape[0]):
+ small_mat[ii1,ii2] = s[ ii[ii1], ii[ii2] ]
+ j = numpy.argmax(numpy.sum(small_mat, axis=0))
+
+ centroids[k] = ii[j]
+
+ # Similarity to centroids
+ S_centroids = numpy.zeros((s.size, centroids.shape[0]))
+ for line in numpy.arange(s.size):
+ for c in numpy.arange(centroids.shape[0]):
+ S_centroids[line,c] = s[line, centroids[c]]
+
+ # Center values for each observation
+ c = numpy.argmax(S_centroids, axis=1)
+
+ # Centroids should point to themselves
+ c[centroids] = numpy.arange(centroids.shape[0])
+
+ # Assign centroid indices to all observables
+ clusters = centroids[c]
+
+ logging.info("Preference %3.2f: converged in %d iterations" % (preference, iterations))
+
+ return clusters
+
diff --git a/package/MDAnalysis/analysis/encore/clustering/caffinityprop.pxd b/package/MDAnalysis/analysis/encore/clustering/caffinityprop.pxd
new file mode 100644
index 00000000000..830172994fd
--- /dev/null
+++ b/package/MDAnalysis/analysis/encore/clustering/caffinityprop.pxd
@@ -0,0 +1,37 @@
+# caffinityprop.pxd --- pxd file for affinityprop.pyx
+# Copyright (C) 2014 Wouter Boomsma, Matteo Tiberti
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+
+cdef extern from *:
+ ctypedef char const_char "const char"
+
+cdef extern from "stdio.h":
+ #extern int printf (__const char *__restrict __format, ...);
+ int printf(const_char *, ...)
+
+cdef extern from "stdlib.h":
+ void *calloc (size_t COUNT, size_t ELTSIZE)
+
+cdef extern from "float.h":
+ enum: FLT_MAX
+
+cdef extern from "ap.h":
+ int trmIndex(int, int)
+ int sqmIndex(int, int, int)
+ float pwmax(float, float)
+ float pwmin(float, float)
+ float min(float*, int)
+ float max(float*, int)
+ int CAffinityPropagation(float*, int, float, int, int, bint, long*)
diff --git a/package/MDAnalysis/analysis/encore/clustering/cluster.py b/package/MDAnalysis/analysis/encore/clustering/cluster.py
new file mode 100644
index 00000000000..8b90b4eff86
--- /dev/null
+++ b/package/MDAnalysis/analysis/encore/clustering/cluster.py
@@ -0,0 +1,224 @@
+# cluster.py --- Common function for calling clustering algorithms
+# Copyright (C) 2014 Wouter Boomsma, Matteo Tiberti
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+
+"""
+clustering frontend --- :mod:`MDAnalysis.analysis.encore.clustering.cluster`
+============================================================================
+
+The module defines a function serving as front-end for various clustering
+algorithms, wrapping them to allow them to be used interchangeably.
+
+:Author: Matteo Tiberti, Wouter Boomsma, Tone Bengtsen
+:Year: 2015--2016
+:Copyright: GNU Public License v3
+:Maintainer: Matteo Tiberti , mtiberti on github
+
+.. versionadded:: 0.16.0
+
+"""
+
+import numpy as np
+from ..utils import ParallelCalculation, merge_universes
+from .ClusterCollection import ClusterCollection
+from ..confdistmatrix import get_distance_matrix
+from . import ClusteringMethod
+
+
+def cluster(ensembles,
+ method = ClusteringMethod.AffinityPropagationNative(),
+ selection="name CA",
+ distance_matrix=None,
+ allow_collapsed_result=True,
+ ncores=1,
+ **kwargs):
+ """
+ Cluster frames from one or more ensembles, using one or more
+ clustering methods. The function optionally takes pre-calculated distances
+ matrices as an argument. Note that not all clustering procedures can work
+ directly on distance matrices, so the distance matrices might be ignored
+ for particular choices of method.
+
+
+ Parameters
+ ----------
+
+ ensembles : MDAnalysis.Universe, or list, or list of list thereof
+ The function takes either a single Universe object, a list of Universe
+ objects or a list of lists of Universe objects. If given a single
+ universe, it simply clusters the conformations in the trajectory. If
+ given a list of ensembles, it will merge them and cluster them together,
+ keeping track of the ensemble to which each of the conformations belong.
+ Finally, if passed a list of list of ensembles, the function will just
+ repeat the functionality just described - merging ensembles for each
+ ensemble in the outer loop.
+
+ method: encore.ClusteringMethod or list thereof, optional
+ A single or a list of instances of the Clustering classes from
+ the clustering module. A separate analysis will be run for each
+ method. Note that different parameters for the same clustering method
+ can be explored by adding different instances of the same clustering
+ class.
+
+ selection : str, optional
+ Atom selection string in the MDAnalysis format. Default is "name CA"
+
+ distance_matrix : encore.utils.TriangularMatrix or list thereof, optional
+ Distance matrix used for clustering. If this parameter
+ is not supplied the matrix will be calculated on the fly.
+ If several distance matrices are supplied, an analysis will be done
+ for each of them. The number of provided distance matrices should
+ match the number of provided ensembles.
+
+ allow_collapsed_result: bool, optional
+ Whether a return value of a list of one value should be collapsed
+ into just the value.
+
+ ncores : int, optional
+ Maximum number of cores to be used (default is 1).
+
+
+ Returns
+ -------
+
+ list of ClustersCollection objects (or potentially a single
+ ClusteringCollection object if allow_collapsed_result is set to True)
+
+
+ Example
+ -------
+ Two ensembles are created as Universe object using a topology file and
+ two trajectories. The topology- and trajectory files used are obtained
+ from the MDAnalysis test suite for two different simulations of the protein
+ AdK. To run the examples see the module `Examples`_ for how to import the
+ files.
+ Here, we cluster two ensembles ::
+ >>> ens1 = Universe(PSF, DCD)
+ >>> ens2 = Universe(PSF, DCD2)
+ >>> cluster_collection = encore.cluster([ens1,ens2])
+ >>> print cluster_collection
+
+ You can change the parameters of the clustering method by explicitly
+ specifying the method ::
+
+ >>> cluster_collection = \
+ encore.cluster( \
+ [ens1,ens2], \
+ method=encore.AffinityPropagationNative(preference=-2.))
+
+ Here is an illustration using DBSCAN algorithm, instead
+ of the default clustering method ::
+
+ >>> cluster_collection = \
+ encore.cluster( \
+ [ens1,ens2], \
+ method=encore.DBSCAN())
+
+ You can also combine multiple methods in one call ::
+
+ >>> cluster_collection = \
+ encore.cluster( \
+ [ens1,ens2], \
+ method=[encore.AffinityPropagationNative(preference=-1.), \
+ encore.AffinityPropagationNative(preference=-2.)])
+
+ """
+
+ # Internally, ensembles are always transformed to a list of lists
+ if ensembles is not None:
+ if not hasattr(ensembles, '__iter__'):
+ ensembles = [ensembles]
+
+ ensembles_list = ensembles
+ if not hasattr(ensembles[0], '__iter__'):
+ ensembles_list = [ensembles]
+
+ # Calculate merged ensembles and transfer to memory
+ merged_ensembles = []
+ for ensembles in ensembles_list:
+ # Transfer ensembles to memory
+ for ensemble in ensembles:
+ ensemble.transfer_to_memory()
+ merged_ensembles.append(merge_universes(ensembles))
+
+ methods = method
+ if not hasattr(method, '__iter__'):
+ methods = [method]
+
+ # Check whether any of the clustering methods can make use of a distance
+ # matrix
+ any_method_accept_distance_matrix = \
+ np.any([_method.accepts_distance_matrix for _method in methods])
+
+ # If distance matrices are provided, check that it matches the number
+ # of ensembles
+ if distance_matrix:
+ if not hasattr(distance_matrix, '__iter__'):
+ distance_matrix = [distance_matrix]
+ if ensembles is not None and \
+ len(distance_matrix) != len(merged_ensembles):
+ raise ValueError("Dimensions of provided list of distance matrices "
+ "does not match that of provided list of "
+ "ensembles: {0} vs {1}"
+ .format(len(distance_matrix),
+ len(merged_ensembles)))
+
+ else:
+ # Calculate distance matrices for all merged ensembles - if not provided
+ if any_method_accept_distance_matrix:
+ distance_matrix = []
+ for merged_ensemble in merged_ensembles:
+ distance_matrix.append(get_distance_matrix(merged_ensemble,
+ selection=selection,
+ **kwargs))
+
+ args = []
+ for method in methods:
+ if method.accepts_distance_matrix:
+ args += [(d,) for d in distance_matrix]
+ else:
+ for merged_ensemble in merged_ensembles:
+ coordinates = merged_ensemble.trajectory.timeseries(format="fac")
+
+ # Flatten coordinate matrix into n_frame x n_coordinates
+ coordinates = np.reshape(coordinates,
+ (coordinates.shape[0], -1))
+
+ args.append((coordinates,))
+
+ # Execute clustering procedure
+ pc = ParallelCalculation(ncores, methods, args)
+
+ # Run parallel calculation
+ results = pc.run()
+
+ # Keep track of which sample belongs to which ensembles
+ metadata = None
+ if ensembles is not None:
+ ensemble_assignment = []
+ for i, ensemble in enumerate(ensembles):
+ ensemble_assignment += [i+1]*len(ensemble.trajectory)
+ ensemble_assignment = np.array(ensemble_assignment)
+ metadata = {'ensemble_membership': ensemble_assignment}
+
+ # Create clusters collections from clustering results,
+ # one for each cluster. None if clustering didn't work.
+ ccs = [ClusterCollection(clusters[1][0],
+ metadata=metadata) for clusters in results]
+
+ if allow_collapsed_result and len(ccs) == 1:
+ ccs = ccs[0]
+
+ return ccs
\ No newline at end of file
diff --git a/package/MDAnalysis/analysis/encore/clustering/include/ap.h b/package/MDAnalysis/analysis/encore/clustering/include/ap.h
new file mode 100644
index 00000000000..e05bd02c71c
--- /dev/null
+++ b/package/MDAnalysis/analysis/encore/clustering/include/ap.h
@@ -0,0 +1,27 @@
+/*
+ap.h --- headers for ap.c
+Copyright (C) 2014 Wouter Boomsma, Matteo Tiberti
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see .
+*/
+
+int trmIndex(int, int);
+
+int sqmIndex(int, int, int);
+
+float min(float*, int);
+
+float max(float*, int);
+
+int CAffinityPropagation(float*, int, float, int, int, int, float*);
diff --git a/package/MDAnalysis/analysis/encore/clustering/src/ap.c b/package/MDAnalysis/analysis/encore/clustering/src/ap.c
new file mode 100644
index 00000000000..10814f56de1
--- /dev/null
+++ b/package/MDAnalysis/analysis/encore/clustering/src/ap.c
@@ -0,0 +1,273 @@
+/*
+ap.c --- C implementation of the Affinity Propagation clustering algorithm
+Copyright (C) 2014 Wouter Boomsma, Matteo Tiberti
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+*/
+
+#include
+#include
+#include
+
+/* Helper functions */
+
+inline int trmIndex(int row, int col) { // array index for triangular matrix
+ return (row) > (col) ? ((row)+1)*(row)/2+(col) : ((col)+1)*(col)/2+(row);
+}
+
+inline int sqmIndex(int colsn, int row, int col) { // array index for square matrix
+ return row*colsn + col;
+}
+
+float min(float * values, int length) { //array min
+ float min = values[0];
+ for (int i=1;i max) {
+ max = values[i];
+ }
+ }
+ return max;
+}
+
+void printarray(float* array, int lenarray) { //print an array, for debug purposes
+ for (int i=0;i max1) {
+ max2 = max1;
+ max1 = tmp;
+ }
+ else if (tmp > max2) {
+ max2 = tmp;
+ }
+ }
+ for (int k=0;k 0 of column k
+ tmp = r[sqmIndex(n,j,k)];
+ if (j!=k) {
+ if (tmp > 0.0)
+ tmpsum = tmpsum + tmp; // if j!=k (r(j,k)): if > 0, sum it
+ }
+ else
+ tmpsum = tmpsum + tmp; // if j == k (r(k,k)): always sum it.
+ // we will have to remove the j==i (r(i,k)) case, for every i.
+ }
+ //printf("tempsum %1.2f\n",tmpsum);
+ for (int i=0;i 0.0)
+ this_tmpsum = this_tmpsum - tmp; //subtract r(i,k)
+ //printf("tmpsum2 %1.2f\n",tmpsum);
+ if (this_tmpsum < 0.0)
+ a[sqm_idx] = lambda*a[sqm_idx] + lamprev*this_tmpsum;
+ else
+ a[sqm_idx] = lambda*a[sqm_idx];
+ }
+ else // we don't need to remove the r(i,k) case, BUT we need to remove the r(k,k) case
+ a[sqm_idx] = lambda*a[sqm_idx] + lamprev*(this_tmpsum - r[sqmIndex(n,k,k)]);
+ }
+ }
+
+ //printf("%s","a\n");
+ //printsqmatrix(a,n);
+
+ //Check for convergence
+
+ int* tmp_exemplars = old_exemplars; // current exemplars become old, before calculating the new ones. Pointer swap - fast and convenient
+ old_exemplars = exemplars;
+ exemplars = tmp_exemplars;
+
+ has_cluster = 0;
+ for (int i=0;i 0.0) {
+ exemplars[i] = 1;
+ has_cluster = 1;
+ }
+ else
+ exemplars[i] = 0;
+ }
+
+ if (has_cluster != 0) {
+ conv_count++;
+ for (j=0;j 0) {
+ exemplars[n_clusters] = i;
+ n_clusters++;
+ }
+ }
+
+ for (int i=0;i= maxsim) {
+ clusters[i] = k;
+ maxsim = tmpsum;
+ }
+ }
+ }
+
+ for (int i=0;i.
+
+"""
+Distance Matrix calculation --- :mod:`MDAnalysis.analysis.encore.confdistmatrix`
+=====================================================================
+
+
+The module contains a base class to easily compute, using parallelization and
+shared memory, matrices of conformational distance between the structures
+stored in an Ensemble. A class to compute an RMSD matrix in such a way is also
+available.
+
+:Author: Matteo Tiberti, Wouter Boomsma, Tone Bengtsen
+:Year: 2015--2016
+:Copyright: GNU Public License v3
+:Maintainer: Matteo Tiberti , mtiberti on github
+
+.. versionadded:: 0.16.0
+
+"""
+
+import numpy as np
+from multiprocessing import Process, Array, RawValue
+from ctypes import c_float
+from getpass import getuser
+from socket import gethostname
+from datetime import datetime
+from time import sleep
+import logging
+
+from ...core.AtomGroup import Universe
+
+from ..align import rotation_matrix
+
+from .cutils import PureRMSD
+from .utils import TriangularMatrix, trm_indeces, \
+ AnimatedProgressBar
+
+
+
+def conformational_distance_matrix(ensemble,
+ conf_dist_function, selection="",
+ superimposition_selection="", ncores=1, pairwise_align=True,
+ mass_weighted=True, metadata=True, *args, **kwargs):
+ """
+ Run the conformational distance matrix calculation.
+ args and kwargs are passed to conf_dist_function.
+
+ Parameters
+ ----------
+
+ ensemble : encore.Ensemble.Ensemble object
+ Ensemble object for which the conformational distance matrix will
+ be computed.
+
+ conf_dist_function : function object
+ Function that fills the matrix with conformational distance
+ values. See set_rmsd_matrix_elements for an example.
+
+ pairwise_align : bool
+ Whether to perform pairwise alignment between conformations.
+ Default is True (do the superimposition)
+
+ mass_weighted : bool
+ Whether to perform mass-weighted superimposition and metric
+ calculation. Default is True.
+
+ metadata : bool
+ Whether to build a metadata dataset for the calculated matrix.
+ Default is True.
+
+ ncores : int
+ Number of cores to be used for parallel calculation
+ Default is 1.
+
+ Returns
+ -------
+
+ conf_dist_matrix : encore.utils.TriangularMatrix object
+ Conformational distance matrix in triangular representation.
+
+ """
+
+ # Decide how many cores have to be used. Since the main process is
+ # stopped while the workers do their job, ncores workers will be
+ # spawned.
+
+ if ncores < 1:
+ ncores = 1
+
+ # framesn: number of frames
+ framesn = len(ensemble.trajectory.timeseries(
+ ensemble.select_atoms(selection), format='fac'))
+
+ # Prepare metadata recarray
+ if metadata:
+ metadata = np.array([(gethostname(),
+ getuser(),
+ str(datetime.now()),
+ ensemble.filename,
+ framesn,
+ pairwise_align,
+ selection,
+ mass_weighted)],
+ dtype=[('host', object),
+ ('user', object),
+ ('date', object),
+ ('topology file', object),
+ ('number of frames', int),
+ ('pairwise superimposition', bool),
+ ('superimposition subset', object),
+ ('mass-weighted', bool)])
+
+ # Prepare alignment subset coordinates as necessary
+
+ rmsd_coordinates = ensemble.trajectory.timeseries(
+ ensemble.select_atoms(selection),
+ format='fac')
+
+ if pairwise_align:
+ if superimposition_selection:
+ subset_selection = superimposition_selection
+ else:
+ subset_selection = selection
+
+ fitting_coordinates = ensemble.trajectory.timeseries(
+ ensemble.select_atoms(subset_selection),
+ format='fac')
+ else:
+ fitting_coordinates = None
+
+ # Prepare masses as necessary
+ if mass_weighted:
+ masses = ensemble.select_atoms(selection).masses
+ if pairwise_align:
+ subset_masses = ensemble.select_atoms(subset_selection).masses
+ else:
+ subset_masses = None
+ else:
+ masses = np.ones((ensemble.trajectory.timeseries(
+ ensemble.select_atoms(selection))[0].shape[0]))
+ if pairwise_align:
+ subset_masses = np.ones((fitting_coordinates[0].shape[0]))
+ else:
+ subset_masses = None
+
+ # matsize: number of elements of the triangular matrix, diagonal
+ # elements included.
+ matsize = framesn * (framesn + 1) / 2
+
+ # Calculate the number of matrix elements that each core has to
+ # calculate as equally as possible.
+ if ncores > matsize:
+ ncores = matsize
+ runs_per_worker = [matsize / int(ncores) for x in range(ncores)]
+ unfair_work = matsize % ncores
+ for i in range(unfair_work):
+ runs_per_worker[i] += 1
+
+ # Splice the matrix in ncores segments. Calculate the first and the
+ # last (i,j) matrix elements of the slices that will be assigned to
+ # each worker. Each of them will proceed in a column-then-row order
+ # (e.g. 0,0 1,0 1,1 2,0 2,1 2,2 ... )
+ i = 0
+ a = [0, 0]
+ b = [0, 0]
+ tasks_per_worker = []
+ for n,r in enumerate(runs_per_worker):
+ while i * (i - 1) / 2 < np.sum(runs_per_worker[:n + 1]):
+ i += 1
+ b = [i - 2,
+ np.sum(runs_per_worker[0:n + 1]) - (i - 2) * (i - 1) / 2 - 1]
+ tasks_per_worker.append((tuple(a), tuple(b)))
+ if b[0] == b[1]:
+ a[0] = b[0] + 1
+ a[1] = 0
+ else:
+ a[0] = b[0]
+ a[1] = b[1] + 1
+
+ # Allocate for output matrix
+ distmat = Array(c_float, matsize)
+
+ # Prepare progress bar stuff and run it
+ pbar = AnimatedProgressBar(end=matsize, width=80)
+ partial_counters = [RawValue('i', 0) for i in range(ncores)]
+
+ # Initialize workers. Simple worker doesn't perform fitting,
+ # fitter worker does.
+
+ workers = [Process(target=conf_dist_function, args=(
+ tasks_per_worker[i],
+ rmsd_coordinates,
+ distmat,
+ masses,
+ fitting_coordinates,
+ subset_masses,
+ partial_counters[i],
+ args,
+ kwargs)) for i in range(ncores)]
+
+ # Start & join the workers
+ for w in workers:
+ w.start()
+ for w in workers:
+ w.join()
+
+ # When the workers have finished, return a TriangularMatrix object
+ return TriangularMatrix(distmat, metadata=metadata)
+
+
+def set_rmsd_matrix_elements(tasks, coords, rmsdmat, masses, fit_coords=None,
+ fit_masses=None, pbar_counter=None, *args, **kwargs):
+
+ '''
+ RMSD Matrix calculator
+
+ Parameters
+ ----------
+
+ tasks : iterator of int of length 2
+ Given a triangular matrix, this function will calculate RMSD
+ values from element tasks[0] to tasks[1]. Since the matrix
+ is triangular, the trm_indeces matrix automatically
+ calculates the corresponding i,j matrix indices.
+ The matrix is written as an array in a row-major
+ order (see the TriangularMatrix class for details).
+
+ If fit_coords and fit_masses are specified, the structures
+ will be superimposed before calculating RMSD, and fit_coords and fit_masses
+ will be used to place both structures at their center of mass and
+ compute the rotation matrix. In this case, both fit_coords and fit_masses
+ must be specified.
+
+ coords : numpy.array
+ Array of the ensemble coordinates
+
+ masses : numpy.array
+ Array of atomic masses, having the same order as the
+ coordinates array
+
+ rmsdmat : encore.utils.TriangularMatrix
+ Memory-shared triangular matrix object
+
+ fit_coords : numpy.array or None
+ Array of the coordinates used for fitting
+
+ fit_masses : numpy.array
+ Array of atomic masses, having the same order as the
+ fit_coords array
+
+ pbar_counter : multiprocessing.RawValue
+ Thread-safe shared value. This counter is updated at
+ every cycle and used to evaluate the progress of
+ each worker in a parallel calculation.
+ '''
+
+
+ if fit_coords is None and fit_masses is None:
+ for i, j in trm_indeces(tasks[0], tasks[1]):
+ summasses = np.sum(masses)
+ rmsdmat[(i + 1) * i / 2 + j] = PureRMSD(coords[i].astype(np.float64),
+ coords[j].astype(np.float64),
+ coords[j].shape[0],
+ masses,
+ summasses)
+
+ elif fit_coords is not None and fit_masses is not None:
+ for i, j in trm_indeces(tasks[0], tasks[1]):
+ summasses = np.sum(masses)
+ subset_weights = np.asarray(fit_masses) / np.mean(fit_masses)
+ com_i = np.average(fit_coords[i], axis=0,
+ weights=fit_masses)
+ translated_i = coords[i] - com_i
+ subset1_coords = fit_coords[i] - com_i
+ com_j = np.average(fit_coords[j], axis=0,
+ weights=fit_masses)
+ translated_j = coords[j] - com_j
+ subset2_coords = fit_coords[j] - com_j
+ rotamat = rotation_matrix(subset1_coords, subset2_coords,
+ subset_weights)[0]
+ rotated_i = np.transpose(np.dot(rotamat, np.transpose(translated_i)))
+ rmsdmat[(i + 1) * i / 2 + j] = PureRMSD(
+ rotated_i.astype(np.float64), translated_j.astype(np.float64),
+ coords[j].shape[0], masses, summasses)
+
+ else:
+ raise TypeError("Both fit_coords and fit_masses must be specified \
+ if one of them is given")
+
+ if pbar_counter is not None:
+ pbar_counter.value += 1
+
+def pbar_updater(pbar, pbar_counters, max_val, update_interval=0.2):
+ '''Method that updates and prints the progress bar, upon polling
+ progress status from workers.
+
+ Attributes
+ -----------
+
+ pbar : encore.utils.AnimatedProgressBar object
+ Progress bar object
+
+ pbar_counters : list of multiprocessing.RawValue
+ List of counters. Each worker is given a counter, which is updated
+ at every cycle. In this way the _pbar_updater process can
+ asynchronously fetch progress reports.
+
+ max_val : int
+ Total number of matrix elements to be calculated
+
+ update_interval : float
+ Number of seconds between progress bar updates
+
+ '''
+
+ val = 0
+ while val < max_val:
+ val = 0
+ for c in pbar_counters:
+ val += c.value
+ pbar.update(val)
+ pbar.show_progress()
+ sleep(update_interval)
+
+
+
+def get_distance_matrix(ensemble,
+ selection="name CA",
+ load_matrix=None,
+ save_matrix=None,
+ superimpose=True,
+ superimposition_subset="name CA",
+ mass_weighted=True,
+ ncores=1,
+ *conf_dist_args,
+ **conf_dist_kwargs):
+ """
+ Retrieves or calculates the conformational distance (RMSD)
+ matrix. The distance matrix is calculated between all the frames of all
+ the :class:`~MDAnalysis.core.AtomGroup.Universe` objects given as input.
+ The order of the matrix elements depends on the order of the coordinates
+ of the ensembles and on the order of the input ensembles themselves,
+ therefore the order of the input list is significant.
+
+ The distance matrix can either be calculated from input ensembles or
+ loaded from an input numpy binary file.
+
+ Please notice that the .npz file does not contain a bidimensional array,
+ but a flattened representation that is meant to represent the elements of
+ an encore.utils.TriangularMatrix object.
+
+
+ Parameters
+ ----------
+
+ ensemble : Universe
+
+ selection : str
+ Atom selection string in the MDAnalysis format. Default is "name CA"
+
+ load_matrix : str, optional
+ Load similarity/dissimilarity matrix from numpy binary file instead
+ of calculating it (default is None). A filename is required.
+
+ save_matrix : bool, optional
+ Save calculated matrix as numpy binary file (default is None). A
+ filename is required.
+
+ superimpose : bool, optional
+ Whether to superimpose structures before calculating distance
+ (default is True).
+
+ superimposition_subset : str, optional
+ Group for superimposition using MDAnalysis selection syntax
+ (default is CA atoms: "name CA")
+
+ mass_weighted : bool, optional
+ calculate a mass-weighted RMSD (default is True). If set to False
+ the superimposition will also not be mass-weighted.
+
+ ncores : int, optional
+ Maximum number of cores to be used (default is 1)
+
+ Returns
+ -------
+
+ confdistmatrix : encore.utils.TriangularMatrix
+ Conformational distance matrix.
+ """
+
+ # Load the matrix if required
+ if load_matrix:
+ logging.info(
+ " Loading similarity matrix from: {0}".format(load_matrix))
+ confdistmatrix = \
+ TriangularMatrix(
+ size=ensemble.trajectory.timeseries(
+ ensemble.select_atoms(selection),
+ format='fac').shape[0],
+ loadfile=load_matrix)
+ logging.info(" Done!")
+ for key in confdistmatrix.metadata.dtype.names:
+ logging.info(" {0} : {1}".format(
+ key, str(confdistmatrix.metadata[key][0])))
+
+ # Check matrix size for consistency
+ if not confdistmatrix.size == \
+ ensemble.trajectory.timeseries(
+ ensemble.select_atoms(selection),
+ format='fac').shape[0]:
+ logging.error(
+ "ERROR: The size of the loaded matrix and of the ensemble"
+ " do not match")
+ return None
+
+
+ # Calculate the matrix
+ else:
+ logging.info(
+ " Perform pairwise alignment: {0}".format(str(superimpose)))
+ logging.info(" Mass-weighted alignment and RMSD: {0}"
+ .format(str(mass_weighted)))
+ if superimpose:
+ logging.info(
+ " Atoms subset for alignment: {0}"
+ .format(superimposition_subset))
+ logging.info(" Calculating similarity matrix . . .")
+
+ # Use superimposition subset, if necessary. If the pairwise alignment
+ # is not required, it will not be performed anyway.
+ confdistmatrix = conformational_distance_matrix(ensemble,
+ conf_dist_function=set_rmsd_matrix_elements,
+ selection=selection,
+ pairwise_align=superimpose,
+ mass_weighted=mass_weighted,
+ ncores=ncores,
+ *conf_dist_args,
+ **conf_dist_kwargs)
+
+ logging.info(" Done!")
+
+ if save_matrix:
+ confdistmatrix.savez(save_matrix)
+
+ return confdistmatrix
diff --git a/package/MDAnalysis/analysis/encore/covariance.py b/package/MDAnalysis/analysis/encore/covariance.py
new file mode 100644
index 00000000000..86f55f2f30b
--- /dev/null
+++ b/package/MDAnalysis/analysis/encore/covariance.py
@@ -0,0 +1,245 @@
+# covariance.py --- Covariance matrix calculations
+# Copyright (C) 2014 Wouter Boomsma, Matteo Tiberti
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+
+"""
+Covariance calculation --- :mod:`encore.covariance`
+=====================================================================
+
+The module contains functions to estimate the covariance matrix of
+an ensemble of structures.
+
+:Author: Matteo Tiberti, Wouter Boomsma, Tone Bengtsen
+:Year: 2015--2016
+:Copyright: GNU Public License v3
+:Maintainer: Matteo Tiberti , mtiberti on github
+
+.. versionadded:: 0.16.0
+"""
+
+import numpy as np
+
+def ml_covariance_estimator(coordinates, reference_coordinates=None):
+ """
+ Standard maximum likelihood estimator of the covariance matrix.
+
+ Parameters
+ ----------
+
+ coordinates : numpy.array
+ Flattened array of coordinates
+
+ reference_coordinates : numpy.array
+ Optional reference to use instead of mean
+
+ Returns
+ -------
+
+ cov_mat : numpy.array
+ Estimate of covariance matrix
+
+ """
+
+ if reference_coordinates is not None:
+
+ # Offset from reference
+ coordinates_offset = coordinates - reference_coordinates
+
+ else:
+ # Normal covariance calculation: distance to the average
+ coordinates_offset = coordinates - np.average(coordinates, axis=0)
+
+ # Calculate covariance manually
+ coordinates_cov = np.zeros((coordinates.shape[1],
+ coordinates.shape[1]))
+ for frame in coordinates_offset:
+ coordinates_cov += np.outer(frame, frame)
+ coordinates_cov /= coordinates.shape[0]
+
+ return coordinates_cov
+
+def shrinkage_covariance_estimator( coordinates,
+ reference_coordinates=None,
+ shrinkage_parameter=None):
+ """
+ Shrinkage estimator of the covariance matrix using the method described in
+
+ Improved Estimation of the Covariance Matrix of Stock Returns With an
+ Application to Portfolio Selection. Ledoit, O.; Wolf, M., Journal of
+ Empirical Finance, 10, 5, 2003
+
+ This implementation is based on the matlab code made available by Olivier
+ Ledoit on his website:
+ http://www.ledoit.net/ole2_abstract.htm
+
+ Parameters
+ ----------
+
+ coordinates : numpy.array
+ Flattened array of coordinates
+
+ reference_coordinates: numpy.array
+ Optional reference to use instead of mean
+
+ shrinkage_parameter: None or float
+ Optional shrinkage parameter
+
+ Returns
+ --------
+
+ cov_mat : numpy.array
+ Covariance matrix
+ """
+
+ x = coordinates
+ t = x.shape[0]
+ n = x.shape[1]
+
+ mean_x = np.average(x, axis=0)
+
+ # Use provided coordinates as "mean" if provided
+ if reference_coordinates is not None:
+ mean_x = reference_coordinates
+
+ x = x - mean_x
+ xmkt = np.average(x, axis=1)
+
+ # Call maximum likelihood estimator (note the additional column)
+ sample = ml_covariance_estimator(np.hstack([x, xmkt[:, np.newaxis]]), 0) \
+ * (t-1)/float(t)
+
+ # Split covariance matrix into components
+ covmkt = sample[0:n, n]
+ varmkt = sample[n, n]
+ sample = sample[:n, :n]
+
+ # Prior
+ prior = np.outer(covmkt, covmkt)/varmkt
+ prior[np.ma.make_mask(np.eye(n))] = np.diag(sample)
+
+ # If shrinkage parameter is not set, estimate it
+ if shrinkage_parameter is None:
+
+ # Frobenius norm
+ c = np.linalg.norm(sample - prior, ord='fro')**2
+
+ y = x**2
+ p = 1/float(t)*np.sum(np.dot(np.transpose(y), y))\
+ - np.sum(np.sum(sample**2))
+ rdiag = 1/float(t)*np.sum(np.sum(y**2))\
+ - np.sum(np.diag(sample)**2)
+ z = x * np.repeat(xmkt[:, np.newaxis], n, axis=1)
+ v1 = 1/float(t) * np.dot(np.transpose(y), z) \
+ - np.repeat(covmkt[:, np.newaxis], n, axis=1)*sample
+ roff1 = (np.sum(
+ v1*np.transpose(
+ np.repeat(
+ covmkt[:, np.newaxis], n, axis=1)
+ )
+ )/varmkt -
+ np.sum(np.diag(v1)*covmkt)/varmkt)
+ v3 = 1/float(t)*np.dot(np.transpose(z), z) - varmkt*sample
+ roff3 = (np.sum(v3*np.outer(covmkt, covmkt))/varmkt**2 -
+ np.sum(np.diag(v3)*covmkt**2)/varmkt**2)
+ roff = 2*roff1-roff3
+ r = rdiag+roff
+
+ # Shrinkage constant
+ k = (p-r)/c
+ shrinkage_parameter = max(0, min(1, k/float(t)))
+
+ # calculate covariance matrix
+ sigma = shrinkage_parameter*prior+(1-shrinkage_parameter)*sample
+
+ return sigma
+
+
+
+
+def covariance_matrix(ensemble,
+ selection="name CA",
+ estimator=shrinkage_covariance_estimator,
+ mass_weighted=True,
+ reference=None):
+ """
+ Calculates (optionally mass weighted) covariance matrix
+
+ Parameters
+ ----------
+
+ ensemble : Ensemble object
+ The structural ensemble
+
+ selection : str
+ Atom selection string in the MDAnalysis format.
+
+ estimator : function
+ Function that estimates the covariance matrix. It requires at least
+ a "coordinates" numpy array (of shape (N,M,3), where N is the number
+ of frames and M the number of atoms). See ml_covariance_estimator and
+ shrinkage_covariance_estimator for reference.
+
+
+ mass_weighted : bool
+ Whether to do a mass-weighted analysis (default is True)
+
+ reference : MDAnalysis.Universe object
+ Use the distances to a specific reference structure rather than the
+ distance to the mean.
+
+ Returns
+ -------
+
+ cov_mat : numpy.array
+ Covariance matrix
+
+ """
+
+ # Extract coordinates from ensemble
+ coordinates = ensemble.trajectory.timeseries(
+ ensemble.select_atoms(selection),
+ format='fac')
+
+ # Flatten coordinate matrix into n_frame x n_coordinates
+ coordinates = np.reshape(coordinates, (coordinates.shape[0], -1))
+
+ # Extract coordinates from reference structure, if specified
+ reference_coordinates = None
+ if reference:
+
+ # Select the same atoms in reference structure
+ reference_atom_selection = reference.select_atoms(
+ ensemble.get_atom_selection_string())
+ reference_coordinates = reference_atom_selection.atoms.coordinates()
+
+ # Flatten reference coordinates
+ reference_coordinates = reference_coordinates.flatten()
+
+ sigma = estimator(coordinates, reference_coordinates)
+
+
+ # Optionally correct with mass-weighting
+ if mass_weighted:
+ # Calculate mass-weighted covariance matrix
+
+ if selection:
+ masses = np.repeat(ensemble.select_atoms(selection).masses, 3)
+ else:
+ masses = np.repeat(ensemble.atoms.masses, 3)
+
+ mass_matrix = np.sqrt(np.identity(len(masses))*masses)
+ sigma = np.dot(mass_matrix, np.dot(sigma, mass_matrix))
+
+ return sigma
diff --git a/package/MDAnalysis/analysis/encore/cutils.pyx b/package/MDAnalysis/analysis/encore/cutils.pyx
new file mode 100644
index 00000000000..b5c813fe690
--- /dev/null
+++ b/package/MDAnalysis/analysis/encore/cutils.pyx
@@ -0,0 +1,52 @@
+# cutils.pyx --- C-compiled Utility functions for encore package
+# Copyright (C) 2014 Wouter Boomsma, Matteo Tiberti
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+
+# cython: embedsignature=True
+
+"""
+Mixed Cython utils for ENCORE
+
+:Author: Matteo Tiberti, Wouter Boomsma, Tone Bengtsen
+:Year: 2015--2016
+:Copyright: GNU Public License v3
+:Maintainer: Matteo Tiberti , mtiberti on github
+
+.. versionadded:: 0.16.0
+"""
+
+
+import numpy as np
+cimport numpy as np
+import cython
+from libc.math cimport sqrt
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def PureRMSD(np.ndarray[np.float64_t,ndim=2] coordsi,
+ np.ndarray[np.float64_t,ndim=2] coordsj,
+ int atomsn,
+ np.ndarray[np.float64_t,ndim=1] masses,
+ double summasses):
+
+ cdef int k
+ cdef double normsum, totmasses
+
+ normsum = 0.0
+
+ for k in xrange(atomsn):
+ normsum += masses[k]*((coordsi[k,0]-coordsj[k,0])**2 + (coordsi[k,1]-coordsj[k,1])**2 + (coordsi[k,2]-coordsj[k,2])**2)
+ return sqrt(normsum/summasses)
\ No newline at end of file
diff --git a/package/MDAnalysis/analysis/encore/dimensionality_reduction/DimensionalityReductionMethod.py b/package/MDAnalysis/analysis/encore/dimensionality_reduction/DimensionalityReductionMethod.py
new file mode 100644
index 00000000000..e1f77815011
--- /dev/null
+++ b/package/MDAnalysis/analysis/encore/dimensionality_reduction/DimensionalityReductionMethod.py
@@ -0,0 +1,193 @@
+# DimensionalityReductionMethod.py --- Interface classes to various
+# dimensionality reduction algorithms
+# Copyright (C) 2014 Wouter Boomsma, Matteo Tiberti
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+
+"""
+dimensionality reduction frontend --- :mod:`MDAnalysis.analysis.encore.dimensionality_reduction.DimensionalityReductionMethod`
+================================================================================================================================
+
+The module defines classes for interfacing to various dimensionality reduction
+algorithms. One has been implemented natively, and will always be available,
+while others are available only if scikit-learn is installed
+
+:Author: Matteo Tiberti, Wouter Boomsma, Tone Bengtsen
+:Year: 2015--2016
+:Copyright: GNU Public License v3
+:Maintainer: Matteo Tiberti , mtiberti on github
+
+.. versionadded:: 0.16.0
+
+"""
+
+import logging
+import warnings
+
+# Import native affinity propagation implementation
+from . import stochasticproxembed
+
+# Attempt to import scikit-learn clustering algorithms
+try:
+ import sklearn.decomposition
+except ImportError:
+ sklearn = None
+ msg = "sklearn.decomposition could not be imported: some functionality will"\
+ "not be available in encore.dimensionality_reduction()"
+ warnings.warn(msg, category=ImportWarning)
+ logging.warn(msg)
+ del msg
+
+
+class DimensionalityReductionMethod (object):
+ """
+ Base class for any Dimensionality Reduction Method
+ """
+
+ # Whether the method accepts a distance matrix
+ accepts_distance_matrix=True
+
+ def __call__(self, x):
+ """
+ Parameters
+ ----------
+
+ x
+ either trajectory coordinate data (np.array) or an
+ encore.utils.TriangularMatrix, encoding the conformational
+ distance matrix
+
+
+ Returns
+ -------
+ numpy.array
+ coordinates in reduced space
+
+ """
+ raise NotImplementedError("Class {0} doesn't implement __call__()"
+ .format(self.__class__.__name__))
+
+
+class StochasticProximityEmbeddingNative(DimensionalityReductionMethod):
+ """
+ Interface to the natively implemented stochastic proximity embedding procedure.
+ """
+ def __init__(self,
+ dimension = 2,
+ distance_cutoff = 1.5,
+ min_lam = 0.1,
+ max_lam = 2.0,
+ ncycle = 100,
+ nstep = 10000,):
+ """
+ Parameters
+ ----------
+
+ dimension : int
+ Number of dimensions to which the conformational space will be
+ reduced (default is 2).
+
+ min_lam : float, optional
+ Final lambda learning rate (default is 0.1).
+
+ max_lam : float, optional
+ Starting lambda learning rate parameter (default is 2.0).
+
+ ncycle : int, optional
+ Number of cycles per run (default is 100). At the end of every
+ cycle, lambda is updated.
+
+ nstep : int, optional
+ Number of steps per cycle (default is 10000)
+
+ """
+ self.dimension = dimension
+ self.distance_cutoff = distance_cutoff
+ self.min_lam = min_lam
+ self.max_lam = max_lam
+ self.ncycle = ncycle
+ self.nstep = nstep
+ self.stressfreq = -1
+
+ def __call__(self, distance_matrix):
+ """
+ Parameters
+ ----------
+
+ distance_matrix : encore.utils.TriangularMatrix
+ conformational distance matrix
+
+
+ Returns
+ -------
+ numpy.array
+ coordinates in reduced space
+
+ """
+ final_stress, coordinates = \
+ stochasticproxembed.StochasticProximityEmbedding().run(
+ s=distance_matrix,
+ rco=self.distance_cutoff,
+ dim=self.dimension,
+ minlam = self.min_lam,
+ maxlam = self.max_lam,
+ ncycle = self.ncycle,
+ nstep = self.nstep,
+ stressfreq = self.stressfreq
+ )
+ return coordinates, {"final_stress": final_stress}
+
+
+
+if sklearn:
+
+ class PrincipleComponentAnalysis(DimensionalityReductionMethod):
+ """
+ Interface to the PCA dimensionality reduction method implemented in
+ sklearn.
+ """
+
+ # Whether the method accepts a distance matrix
+ accepts_distance_matrix = False
+
+ def __init__(self,
+ dimension = 2,
+ **kwargs):
+ """
+ Parameters
+ ----------
+
+ dimension : int
+ Number of dimensions to which the conformational space will be
+ reduced (default is 2).
+ """
+ self.pca = sklearn.decomposition.PCA(n_components=dimension,
+ **kwargs)
+
+ def __call__(self, coordinates):
+ """
+ Parameters
+ ----------
+
+ coordinates : np.array
+ trajectory atom coordinates
+
+
+ Returns
+ -------
+ numpy.array
+ coordinates in reduced space
+ """
+ coordinates = self.pca.fit_transform(coordinates)
+ return coordinates.T, {}
diff --git a/package/MDAnalysis/analysis/encore/dimensionality_reduction/__init__.py b/package/MDAnalysis/analysis/encore/dimensionality_reduction/__init__.py
new file mode 100644
index 00000000000..88c4cdc71c7
--- /dev/null
+++ b/package/MDAnalysis/analysis/encore/dimensionality_reduction/__init__.py
@@ -0,0 +1,6 @@
+from . import DimensionalityReductionMethod
+
+__all__ = [
+ 'DimensionalityReductionMethod.StochasticProximityEmbeddingNative',
+ 'DimensionalityReductionMethod.PrincipleComponentAnalysis'
+]
diff --git a/package/MDAnalysis/analysis/encore/dimensionality_reduction/cstochasticproxembed.pxd b/package/MDAnalysis/analysis/encore/dimensionality_reduction/cstochasticproxembed.pxd
new file mode 100644
index 00000000000..0c7ade18d66
--- /dev/null
+++ b/package/MDAnalysis/analysis/encore/dimensionality_reduction/cstochasticproxembed.pxd
@@ -0,0 +1,47 @@
+# cstochasticproxembed.pxd --- pxd file for stochasticproxembed.pyx
+# Copyright (C) 2014 Wouter Boomsma, Matteo Tiberti
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+
+cdef extern from *:
+ ctypedef char const_char "const char"
+
+cdef extern from "stdio.h":
+ int printf(const_char *, ...)
+
+cdef extern from "stdlib.h":
+ ctypedef struct time_t:
+ pass
+ time_t time(time_t*)
+ void* malloc (size_t, size_t)
+ void* realloc (void*, size_t)
+ void srand(unsigned int)
+ int rand()
+
+cdef extern from "math.h":
+ double sqrt(double)
+
+cdef extern from "spe.h":
+ ctypedef struct IVWrapper:
+ pass
+ ctypedef void* empty
+
+ int trmIndex(int, int)
+ double ed(double*, int, int, int)
+ double stress(double, double, int, int)
+ double neighbours_stress(double, double, int, int, double)
+ int neighbours(double, int, double, int*, int*, int*)
+ int* nearest_neighbours(double*, int, int)
+ int cmp_ivwrapper(void*,void*)
+ double CStochasticProximityEmbedding(double*, double*, double, int, int, double, double, int, int, int)
diff --git a/package/MDAnalysis/analysis/encore/dimensionality_reduction/include/spe.h b/package/MDAnalysis/analysis/encore/dimensionality_reduction/include/spe.h
new file mode 100644
index 00000000000..56a89e9cf52
--- /dev/null
+++ b/package/MDAnalysis/analysis/encore/dimensionality_reduction/include/spe.h
@@ -0,0 +1,47 @@
+/*
+spe.h --- header file for spe.c
+Copyright (C) 2014 Wouter Boomsma, Matteo Tiberti
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see .
+*/
+
+int trmIndex(int, int);
+
+double ed(double*, int, int, int);
+
+double stress(double, double, int, int);
+
+double neighbours_stress(double, double, int, int, double);
+
+int neighbours(double, int, double, int*, int*, int*);
+
+int cmp_ivwrapper(const void*, const void*);
+
+int nearest_neighbours(double*, int, int);
+
+double CStochasticProximityEmbedding(
+ double*,
+ double*,
+ double,
+ int,
+ int,
+ double,
+ double,
+ int,
+ int,
+ int);
+
+
+
+
diff --git a/package/MDAnalysis/analysis/encore/dimensionality_reduction/reduce_dimensionality.py b/package/MDAnalysis/analysis/encore/dimensionality_reduction/reduce_dimensionality.py
new file mode 100644
index 00000000000..9fc3e33e913
--- /dev/null
+++ b/package/MDAnalysis/analysis/encore/dimensionality_reduction/reduce_dimensionality.py
@@ -0,0 +1,231 @@
+# reduce_dimensionality.py --- Common function for calling dimensionality
+# reduction algorithms
+# Copyright (C) 2014 Wouter Boomsma, Matteo Tiberti
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+
+"""
+dimensionality reduction frontend --- :mod:`MDAnalysis.analysis.encore.dimensionality_reduction.reduce_dimensionality`
+========================================================================================================================
+
+The module defines a function serving as front-end for various dimensionality
+reduction algorithms, wrapping them to allow them to be used interchangeably.
+
+:Author: Matteo Tiberti, Wouter Boomsma, Tone Bengtsen
+:Year: 2015--2016
+:Copyright: GNU Public License v3
+:Maintainer: Matteo Tiberti , mtiberti on github
+
+.. versionadded:: 0.16.0
+
+"""
+import numpy as np
+from ..confdistmatrix import get_distance_matrix
+from ..utils import ParallelCalculation, merge_universes
+from ..dimensionality_reduction.DimensionalityReductionMethod import (
+ StochasticProximityEmbeddingNative)
+
+
+def reduce_dimensionality(ensembles,
+ method=StochasticProximityEmbeddingNative(),
+ selection="name CA",
+ distance_matrix=None,
+ allow_collapsed_result=True,
+ ncores=1,
+ **kwargs):
+ """
+ Reduce dimensions in frames from one or more ensembles, using one or more
+ dimensionality reduction methods. The function optionally takes
+ pre-calculated distances matrices as an argument. Note that not all
+ dimensionality reduction procedures can work directly on distance matrices,
+ so the distance matrices might be ignored for particular choices of
+ method.
+
+
+ Parameters
+ ----------
+
+ ensembles : MDAnalysis.Universe, or list or list of list thereof
+ The function takes either a single Universe object, a list of Universe
+ objects or a list of lists of Universe objects. If given a single
+ universe, it simply works on the conformations in the trajectory. If
+ given a list of ensembles, it will merge them and analyse them together,
+ keeping track of the ensemble to which each of the conformations belong.
+ Finally, if passed a list of list of ensembles, the function will just
+ repeat the functionality just described - merging ensembles for each
+ ensemble in the outer loop.
+
+ method : MDAnalysis.analysis.encore.dimensionality_reduction.DimensionalityReductionMethod or list
+ A single or a list of instances of the DimensionalityReductionMethod
+ classes from the dimensionality_reduction module. A separate analysis
+ will be run for each method. Note that different parameters for the
+ same method can be explored by adding different instances of
+ the same dimensionality reduction class. Options are Stochastic
+ Proximity Embedding or Principle Component Analysis.
+
+ selection : str, optional
+ Atom selection string in the MDAnalysis format (default is "name CA")
+
+ distance_matrix : encore.utils.TriangularMatrix, optional
+ Distance matrix for stochastic proximity embedding. If this parameter
+ is not supplied the matrix will be calculated on the fly (default) .
+ If several distance matrices are supplied, an analysis will be done
+ for each of them. The number of provided distance matrices should
+ match the number of provided ensembles.
+
+ allow_collapsed_result: bool, optional
+ Whether a return value of a list of one value should be collapsed
+ into just the value (default = True).
+
+ ncores : int, optional
+ Maximum number of cores to be used (default is 1).
+
+
+ Returns
+ -------
+
+ list of coordinate arrays in the reduced dimensions (or potentially a single
+ coordinate array object if allow_collapsed_result is set to True)
+
+
+ Example
+ -------
+ Two ensembles are created as Universe object using a topology file and
+ two trajectories. The topology- and trajectory files used are obtained
+ from the MDAnalysis test suite for two different simulations of the protein
+ AdK. To run the examples see the module `Examples`_ for how to import the
+ files.
+ Here, we reduce two ensembles to two dimensions, and plot the result using
+ matplotlib: ::
+ >>> ens1 = Universe(PSF, DCD)
+ >>> ens2 = Universe(PSF, DCD2)
+ >>> coordinates, details = encore.reduce_dimensionality([ens1,ens2])
+ >>> plt.scatter(coordinates[0], coordinates[1], \
+ color=[["red", "blue"][m-1] for m \
+ in details["ensemble_membership"]])
+ Note how we extracted information about which conformation belonged to
+ which ensemble from the details variable.
+
+ You can change the parameters of the dimensionality reduction method
+ by explicitly specifying the method ::
+ >>> coordinates, details = \
+ encore.reduce_dimensionality([ens1,ens2], \
+ method=encore.StochasticProximityEmbeddingNative(dimension=3))
+
+ Here is an illustration using Principle Component Analysis, instead
+ of the default dimensionality reduction method ::
+
+ >>> coordinates, details = \
+ encore.reduce_dimensionality( \
+ [ens1,ens2], \
+ method=encore.PrincipleComponentAnalysis(dimension=2))
+
+ You can also combine multiple methods in one call ::
+
+ >>> coordinates, details = \
+ encore.reduce_dimensionality( \
+ [ens1,ens2], \
+ method=[encore.PrincipleComponentAnalysis(dimension=2), \
+ encore.StochasticProximityEmbeddingNative(dimension=2)])
+
+ """
+
+ if ensembles is not None:
+ if not hasattr(ensembles, '__iter__'):
+ ensembles = [ensembles]
+
+ ensembles_list = ensembles
+ if not hasattr(ensembles[0], '__iter__'):
+ ensembles_list = [ensembles]
+
+ # Calculate merged ensembles and transfer to memory
+ merged_ensembles = []
+ for ensembles in ensembles_list:
+ # Transfer ensembles to memory
+ for ensemble in ensembles:
+ ensemble.transfer_to_memory()
+ merged_ensembles.append(merge_universes(ensembles))
+
+ methods = method
+ if not hasattr(method, '__iter__'):
+ methods = [method]
+
+ # Check whether any of the methods can make use of a distance matrix
+ any_method_accept_distance_matrix = \
+ np.any([_method.accepts_distance_matrix for _method in
+ methods])
+
+
+
+ # If distance matrices are provided, check that it matches the number
+ # of ensembles
+ if distance_matrix:
+ if not hasattr(distance_matrix, '__iter__'):
+ distance_matrix = [distance_matrix]
+ if ensembles is not None and \
+ len(distance_matrix) != len(merged_ensembles):
+ raise ValueError("Dimensions of provided list of distance matrices "
+ "does not match that of provided list of "
+ "ensembles: {0} vs {1}"
+ .format(len(distance_matrix),
+ len(merged_ensembles)))
+
+ else:
+ # Calculate distance matrices for all merged ensembles - if not provided
+ if any_method_accept_distance_matrix:
+ distance_matrix = []
+ for merged_ensemble in merged_ensembles:
+ distance_matrix.append(get_distance_matrix(merged_ensemble,
+ selection=selection,
+ **kwargs))
+
+ args = []
+ for method in methods:
+ if method.accepts_distance_matrix:
+ args += [(d,) for d in distance_matrix]
+ else:
+ for merged_ensemble in merged_ensembles:
+ coordinates = merged_ensemble.trajectory.timeseries(format="fac")
+
+ # Flatten coordinate matrix into n_frame x n_coordinates
+ coordinates = np.reshape(coordinates,
+ (coordinates.shape[0], -1))
+
+ args.append((coordinates,))
+
+ # Execute dimensionality reduction procedure
+ pc = ParallelCalculation(ncores, methods, args)
+
+ # Run parallel calculation
+ results = pc.run()
+
+ # Keep track of which sample belongs to which ensembles
+ details = {}
+ if ensembles is not None:
+ ensemble_assignment = []
+ for i, ensemble in enumerate(ensembles):
+ ensemble_assignment += [i+1]*len(ensemble.trajectory)
+ ensemble_assignment = np.array(ensemble_assignment)
+ details['ensemble_membership'] = ensemble_assignment
+
+ coordinates = []
+ for result in results:
+ coordinates.append(result[1][0])
+ # details.append(result[1][1])
+
+ if allow_collapsed_result and len(coordinates)==1:
+ coordinates = coordinates[0]
+ # details = details[0]
+
+ return coordinates, details
diff --git a/package/MDAnalysis/analysis/encore/dimensionality_reduction/src/spe.c b/package/MDAnalysis/analysis/encore/dimensionality_reduction/src/spe.c
new file mode 100644
index 00000000000..ecd63b11cde
--- /dev/null
+++ b/package/MDAnalysis/analysis/encore/dimensionality_reduction/src/spe.c
@@ -0,0 +1,228 @@
+/*
+spe.c --- C implementation of the Stochastic Proximity Embedding algorithm and variations
+Copyright (C) 2014 Wouter Boomsma, Matteo Tiberti
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see .
+*/
+
+#include
+#include
+#include
+//#include
+#include
+#include
+#include
+
+#define EPSILON 1e-8
+
+typedef struct {
+ int index;
+ double value;
+} IVWrapper;
+
+inline int trmIndex(int row, int col) { // array index for triangular matrix
+ return (row) > (col) ? ((row)+1)*(row)/2+(col) : ((col)+1)*(col)/2+(row);
+}
+
+void printarray(double* arr, int len) {
+ for (int i=0; ivalue == b->value)
+ return(0);
+ else {
+ if (a->value < b->value)
+ return(-1);
+ else
+ return(1);
+ }
+}
+
+// Euclidean distance
+double ed(double* d_coords, int i, int j, int dim) {
+ double d = 0.0;
+ double delta = 0.0;
+ int idxi = i*dim;
+ int idxj = j*dim;
+ int idxik = 0;
+ int idxjk = 0;
+
+ for (int k=0; k rco && dab < rab)) {
+ idxa = a * dim;
+ idxb = b * dim;
+ t = lam * 0.5 * (rab - dab) / (dab + EPSILON);
+
+ for (int k=0; k 0)
+ printf("Cycle %d - Residual stress: %.3f, lambda %.3f\n", i, neighbours_stress(s, d_coords, dim, nelem, rco),lam);
+ }
+ finalstress = neighbours_stress(s, d_coords, dim, nelem, rco);
+ //printf("Calculation finished. - Residual stress: %.3f\n", finalstress);
+ return(finalstress);
+}
+
+
diff --git a/package/MDAnalysis/analysis/encore/dimensionality_reduction/stochasticproxembed.pyx b/package/MDAnalysis/analysis/encore/dimensionality_reduction/stochasticproxembed.pyx
new file mode 100644
index 00000000000..8f3e979e66b
--- /dev/null
+++ b/package/MDAnalysis/analysis/encore/dimensionality_reduction/stochasticproxembed.pyx
@@ -0,0 +1,110 @@
+# cython: embedsignature=True
+# stochasticproxembed.pyx --- Cython wrapper for the stochastic proximity embedding C library
+# Copyright (C) 2014 Wouter Boomsma, Matteo Tiberti
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+
+"""
+Cython wrapper for the C implementation of the Stochastic Proximity Embedding
+dimensionality reduction algorithm.
+
+:Author: Matteo Tiberti, Wouter Boomsma
+:Year: 2015--2016
+:Copyright: GNU Public License v3
+:Maintainer: Matteo Tiberti , mtiberti on github """
+
+import logging
+import numpy
+cimport numpy
+
+cimport cstochasticproxembed
+cimport cython
+
+
+@cython.embedsignature(True)
+
+cdef class StochasticProximityEmbedding:
+ """
+ Stochastic proximity embedding dimensionality reduction algorithm. The
+ algorithm implemented here is described in this paper:
+
+ Dmitrii N. Rassokhin, Dimitris K. Agrafiotis
+ A modified update rule for stochastic proximity embedding
+ Journal of Molecular Graphics and Modelling 22 (2003) 133–140
+
+ This class is a Cython wrapper for a C implementation (see spe.c)
+ """
+
+
+ def run(self, s, double rco, int dim, double maxlam, double minlam, int ncycle, int nstep, int stressfreq):
+ """
+ Run stochastic proximity embedding.
+
+ Parameters
+ ----------
+
+ s : encore.utils.TriangularMatrix object
+ Triangular matrix containing the distance values for each pair of
+ elements in the original space.
+
+ rco : float
+ neighborhood distance cut-off
+
+ dim : int
+ number of dimensions for the embedded space
+
+ minlam : float
+ final learning parameter
+
+ maxlam : float
+ starting learning parameter
+
+ ncycle : int
+ number of cycles. Each cycle is composed of nstep steps. At the end
+ of each cycle, the learning parameter lambda is updated.
+
+ nstep : int
+ number of coordinate update steps for each cycle
+
+
+ stressfreq : int
+ calculate and report stress value every stressfreq cycle
+
+
+ Returns
+ -------
+
+ space : (float, numpy.array)
+ float is the final stress obtained; the array contains the coordinates
+ of the elements in the embedded space
+
+
+ """
+
+ cdef int nelem = s.size
+ cdef double finalstress = 0.0
+
+ logging.info("Starting Stochastic Proximity Embedding")
+
+ cdef numpy.ndarray[numpy.float64_t, ndim=1] matndarray = numpy.ascontiguousarray(s._elements, dtype=numpy.float64)
+ cdef numpy.ndarray[numpy.float64_t, ndim=1] d_coords = numpy.zeros((nelem*dim),dtype=numpy.float64)
+
+ finalstress = cstochasticproxembed.CStochasticProximityEmbedding( matndarray.data, d_coords.data, rco, nelem, dim, maxlam, minlam, ncycle, nstep, stressfreq)
+
+ logging.info("Stochastic Proximity Embedding finished. Residual stress: %.3f" % finalstress)
+
+ return (finalstress, d_coords.reshape((-1,dim)).T)
+
+ def __call__(self, *args):
+ return self.run(*args)
diff --git a/package/MDAnalysis/analysis/encore/similarity.py b/package/MDAnalysis/analysis/encore/similarity.py
new file mode 100644
index 00000000000..0b7f99263bd
--- /dev/null
+++ b/package/MDAnalysis/analysis/encore/similarity.py
@@ -0,0 +1,1737 @@
+# similarity.py --- Similarity measures between protein ensembles
+# Copyright (C) 2014 Wouter Boomsma, Matteo Tiberti
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""
+Ensemble Similarity Calculations --- :mod:`MDAnalysis.analysis.encore.similarity`
+=================================================================================
+
+:Author: Matteo Tiberti, Wouter Boomsma, Tone Bengtsen
+:Year: 2015-2016
+:Copyright: GNU Public License v3
+:Maintainer: Matteo Tiberti , mtiberti on github
+
+.. versionadded:: 0.16.0
+
+The module contains implementations of similarity measures between protein
+ensembles described in [Lindorff-Larsen2009]_. The implementation and examples
+are described in [Tiberti2015]_.
+
+The module includes facilities for handling ensembles and trajectories through
+the :class:`Universe` class, performing clustering or dimensionality reduction
+of the ensemble space, estimating multivariate probability distributions from
+the input data, and more. ENCORE can be used to compare experimental and
+simulation-derived ensembles, as well as estimate the convergence of
+trajectories from time-dependent simulations.
+
+ENCORE includes three different methods for calculations of similarity measures
+between ensembles implemented in individual functions:
+
+
+ + **Harmonic Ensemble Similarity** : :func:`hes`
+ + **Clustering Ensemble Similarity** : :func:`ces`
+ + **Dimensional Reduction Ensemble Similarity** : :func:`dres`
+
+When using this module in published work please cite [Tiberti2015]_.
+
+References
+----------
+
+    .. [Lindorff-Larsen2009] Similarity Measures for Protein Ensembles. Lindorff-Larsen, K. Ferkinghoff-Borg, J. PLoS ONE 2009, 4, e4203.
+
+    .. [Tiberti2015] ENCORE: Software for Quantitative Ensemble Comparison. Matteo Tiberti, Elena Papaleo, Tone Bengtsen, Wouter Boomsma, Kresten Lindorff-Larsen. PLoS Comput Biol. 2015, 11
+
+.. _Examples:
+Examples
+--------
+
+The examples show how to use ENCORE to calculate a similarity measurement
+of two simple ensembles. The ensembles are obtained from the MDAnalysis
+test suite for two different simulations of the protein AdK. To run the
+examples first execute: ::
+
+ >>> from MDAnalysis import Universe
+ >>> import MDAnalysis.analysis.encore as encore
+ >>> from MDAnalysis.tests.datafiles import PSF, DCD, DCD2
+
+
+To calculate the Harmonic Ensemble Similarity (:func:`hes`)
+two ensemble objects are first created and then used for calculation: ::
+
+ >>> ens1 = Universe(PSF, DCD)
+ >>> ens2 = Universe(PSF, DCD2)
+ >>> print encore.hes([ens1, ens2])
+ (array([[ 0. , 38279683.95892926],
+ [ 38279683.95892926, 0. ]]), None)
+
+Here None is returned in the array as the default details parameter is False.
+HES can assume any non-negative value, i.e. no upper bound exists and the
+measurement can therefore be used as an absolute scale.
+
+The calculation of the Clustering Ensemble Similarity (:func:`ces`)
+is computationally more expensive. It is based on clustering algorithms that in
+turn require a similarity matrix between the frames the ensembles are made
+of. The similarity matrix is derived from a distance matrix (By default a RMSD
+matrix; a full RMSD matrix between each pairs of elements needs to be computed).
+The RMSD matrix is automatically calculated. ::
+
+ >>> ens1 = Universe(PSF, DCD)
+ >>> ens2 = Universe(PSF, DCD2)
+ >>> CES, details = encore.ces([ens1, ens2])
+ >>> print CES
+ [[ 0. 0.68070702]
+ [ 0.68070702 0. ]]
+
+However, we may want to reuse the RMSD matrix in other calculations e.g.
+running CES with different parameters or running DRES. In this
+case we first compute the RMSD matrix alone:
+
+ >>> rmsd_matrix = encore.get_distance_matrix(\
+ encore.utils.merge_universes([ens1, ens2]),\
+ save_matrix="rmsd.npz")
+
+In the above example the RMSD matrix was also saved in rmsd.npz on disk, and
+so can be loaded and re-used at later times, instead of being recomputed:
+
+ >>> rmsd_matrix = encore.get_distance_matrix(
+ encore.utils.merge_universes([ens1, ens2]),
+ load_matrix="rmsd.npz")
+
+
+For instance, the rmsd_matrix object can be re-used as input for the
+Dimensional Reduction Ensemble Similarity (:func:`dres`) method.
+DRES is based on the estimation of the probability density in
+a dimensionally-reduced conformational space of the ensembles, obtained from
+the original space using either the Stochastic Proximity Embedding algorithm or
+Principal Component Analysis.
+As the algorithms require the distance matrix calculated on the original space,
+we can reuse the previously-calculated RMSD matrix.
+In the following example the dimensions are reduced to 3 using the
+saved RMSD matrix and the default SPE dimensional reduction method. : ::
+
+ >>> DRES,details = encore.dres([ens1, ens2],\
+ distance_matrix = rmsd_matrix)
+ >>> print DRES
+ [[ 0. , 0.67453198]
+ [ 0.67453198, 0. ]]
+
+
+In addition to the quantitative similarity estimate, the dimensional reduction
+can easily be visualized, see the ``Example`` section in
+:mod:`MDAnalysis.analysis.encore.dimensionality_reduction.reduce_dimensionality`
+Due to the stochastic nature of SPE, two identical ensembles will not
+necessarily result in an exactly 0 estimate of the similarity, but will be very
+close. For the same reason, calculating the similarity with the :func:`dres`
+twice will not result in necessarily identical values but rather two very close
+values.
+
+It should be noted that both in :func:`ces` and :func:`dres` the similarity is
+evaluated using the Jensen-Shannon divergence resulting in an upper bound of
+ln(2), which indicates no similarity between the ensembles and a lower bound
+of 0.0 signifying two identical ensembles. Therefore using CES and DRES
+ensembles can be compared in a more relative sense than HES, i.e. they
+can be used to understand whether ensemble A is closer to ensemble B than
+ensemble C, but absolute values are less meaningful as they also depend on the
+chosen parameters.
+
+
+Functions
+---------
+
+.. autofunction:: hes
+
+.. autofunction:: ces
+
+.. autofunction:: dres
+
+
+
+"""
+from __future__ import print_function
+import MDAnalysis as mda
+import numpy as np
+import warnings
+import logging
+try:
+ from scipy.stats import gaussian_kde
+except ImportError:
+ gaussian_kde = None
+ msg = "scipy.stats.gaussian_kde could not be imported. " \
+ "Dimensionality reduction ensemble comparisons will not " \
+ "be available."
+ warnings.warn(msg,
+ category=ImportWarning)
+    logging.warning(msg)
+ del msg
+
+from ...coordinates.memory import MemoryReader
+from .confdistmatrix import get_distance_matrix
+from .bootstrap import (get_distance_matrix_bootstrap_samples,
+ get_ensemble_bootstrap_samples)
+from .clustering.cluster import cluster
+from .clustering.ClusteringMethod import AffinityPropagationNative
+from .dimensionality_reduction.DimensionalityReductionMethod import (
+ StochasticProximityEmbeddingNative)
+from .dimensionality_reduction.reduce_dimensionality import (
+ reduce_dimensionality)
+from .covariance import (
+ covariance_matrix, ml_covariance_estimator, shrinkage_covariance_estimator)
+from .utils import merge_universes
+from .utils import trm_indices_diag, trm_indices_nodiag
+
+# Low boundary value for log() argument - ensure no nans
+EPSILON = 1E-15
+
+xlogy = np.vectorize(
+ lambda x, y: 0.0 if (x <= EPSILON and y <= EPSILON) else x * np.log(y))
+
+
+def discrete_kullback_leibler_divergence(pA, pB):
+    """Kullback-Leibler divergence between discrete probability distributions.
+ Notice that since this measure is not symmetric ::
+ :math:`d_{KL}(p_A,p_B) != d_{KL}(p_B,p_A)`
+
+ Parameters
+ ----------
+
+ pA : iterable of floats
+ First discrete probability density function
+
+ pB : iterable of floats
+ Second discrete probability density function
+
+ Returns
+ -------
+
+ dkl : float
+        Discrete Kullback-Leibler divergence
+ """
+
+ return np.sum(xlogy(pA, pA / pB))
+
+
+# discrete dJS
+def discrete_jensen_shannon_divergence(pA, pB):
+ """Jensen-Shannon divergence between discrete probability distributions.
+
+ Parameters
+ ----------
+
+ pA : iterable of floats
+ First discrete probability density function
+
+ pB : iterable of floats
+ Second discrete probability density function
+
+ Returns
+ -------
+
+ djs : float
+ Discrete Jensen-Shannon divergence
+"""
+ return 0.5 * (discrete_kullback_leibler_divergence(pA, (pA + pB) * 0.5) +
+ discrete_kullback_leibler_divergence(pB, (pA + pB) * 0.5))
+
+
+# calculate harmonic similarity
+def harmonic_ensemble_similarity(sigma1,
+ sigma2,
+ x1,
+ x2):
+ """
+ Calculate the harmonic ensemble similarity measure
+ as defined in
+
+ Similarity Measures for Protein Ensembles. Lindorff-Larsen, K.;
+ Ferkinghoff-Borg, J. PLoS ONE 2009, 4, e4203.
+
+ Parameters
+ ----------
+
+ sigma1 : numpy.array
+ Covariance matrix for the first ensemble.
+
+ sigma2 : numpy.array
+ Covariance matrix for the second ensemble.
+
+ x1: numpy.array
+ Mean for the estimated normal multivariate distribution of the first
+ ensemble.
+
+ x2: numpy.array
+ Mean for the estimated normal multivariate distribution of the second
+ ensemble.
+
+ Returns
+ -------
+
+ dhes : float
+ harmonic similarity measure
+ """
+
+ # Inverse covariance matrices
+ sigma1_inv = np.linalg.pinv(sigma1)
+ sigma2_inv = np.linalg.pinv(sigma2)
+
+ # Difference between average vectors
+ d_avg = x1 - x2
+
+ # Distance measure
+ trace = np.trace(np.dot(sigma1, sigma2_inv) +
+ np.dot(sigma2, sigma1_inv)
+ - 2 * np.identity(sigma1.shape[0]))
+
+ d_hes = 0.25 * (np.dot(np.transpose(d_avg),
+ np.dot(sigma1_inv + sigma2_inv,
+ d_avg)) + trace)
+ return d_hes
+
+
+def clustering_ensemble_similarity(cc, ens1, ens1_id, ens2, ens2_id,
+ selection="name CA"):
+ """Clustering ensemble similarity: calculate the probability densities from
+ the clusters and calculate discrete Jensen-Shannon divergence.
+
+ Parameters
+ ----------
+
+ cc : encore.clustering.ClustersCollection
+ Collection from cluster calculated by a clustering algorithm
+ (e.g. Affinity propagation)
+
+ ens1 : :class:`~MDAnalysis.core.AtomGroup.Universe`
+ First ensemble to be used in comparison
+
+ ens1_id : int
+ First ensemble id as detailed in the ClustersCollection metadata
+
+ ens2 : :class:`~MDAnalysis.core.AtomGroup.Universe`
+ Second ensemble to be used in comparison
+
+ ens2_id : int
+ Second ensemble id as detailed in the ClustersCollection metadata
+
+ selection : str
+ Atom selection string in the MDAnalysis format. Default is "name CA".
+
+ Returns
+ -------
+
+ djs : float
+ Jensen-Shannon divergence between the two ensembles, as calculated by
+ the clustering ensemble similarity method
+ """
+ ens1_coordinates = ens1.trajectory.timeseries(ens1.select_atoms(selection),
+ format='fac')
+ ens2_coordinates = ens2.trajectory.timeseries(ens2.select_atoms(selection),
+ format='fac')
+ tmpA = np.array([np.where(c.metadata['ensemble_membership'] == ens1_id)[
+ 0].shape[0] / float(ens1_coordinates.shape[0]) for
+ c in cc])
+ tmpB = np.array([np.where(c.metadata['ensemble_membership'] == ens2_id)[
+ 0].shape[0] / float(ens2_coordinates.shape[0]) for
+ c in cc])
+
+ # Exclude clusters which have 0 elements in both ensembles
+ pA = tmpA[tmpA + tmpB > EPSILON]
+ pB = tmpB[tmpA + tmpB > EPSILON]
+
+ return discrete_jensen_shannon_divergence(pA, pB)
+
+
+def cumulative_clustering_ensemble_similarity(cc, ens1_id, ens2_id,
+ ens1_id_min=1, ens2_id_min=1):
+ """ Calculate clustering ensemble similarity between joined ensembles.
+ This means that, after clustering has been performed, some ensembles are
+ merged and the dJS is calculated between the probability distributions of
+ the two clusters groups. In particular, the two ensemble groups are defined
+ by their ensembles id: one of the two joined ensembles will comprise all
+ the ensembles with id [ens1_id_min, ens1_id], and the other ensembles will
+ comprise all the ensembles with id [ens2_id_min, ens2_id].
+
+ Parameters
+ ----------
+
+ cc : encore.ClustersCollection
+ Collection from cluster calculated by a clustering algorithm
+ (e.g. Affinity propagation)
+
+ ens1_id : int
+ First ensemble id as detailed in the ClustersCollection
+ metadata
+
+ ens2_id : int
+ Second ensemble id as detailed in the ClustersCollection
+ metadata
+
+ Returns
+ -------
+
+ djs : float
+ Jensen-Shannon divergence between the two ensembles, as
+ calculated by the clustering ensemble similarity method
+
+"""
+
+    ensA = [np.where(np.logical_and(
+        c.metadata['ensemble_membership'] <= ens1_id,
+        c.metadata['ensemble_membership'] >= ens1_id_min))[0].shape[0]
+        for c in cc]
+    ensB = [np.where(np.logical_and(
+        c.metadata['ensemble_membership'] <= ens2_id,
+        c.metadata['ensemble_membership'] >= ens2_id_min))[0].shape[0]
+        for c in cc]
+ sizeA = float(np.sum(ensA))
+ sizeB = float(np.sum(ensB))
+
+ tmpA = np.array(ensA) / sizeA
+ tmpB = np.array(ensB) / sizeB
+
+ # Exclude clusters which have 0 elements in both ensembles
+ pA = tmpA[tmpA + tmpB > EPSILON]
+ pB = tmpB[tmpA + tmpB > EPSILON]
+
+ return discrete_jensen_shannon_divergence(pA, pB)
+
+
+def gen_kde_pdfs(embedded_space, ensemble_assignment, nensembles,
+ nsamples):
+ """
+ Generate Kernel Density Estimates (KDE) from embedded spaces and
+ elaborate the coordinates for later use.
+
+ Parameters
+ ----------
+
+ embedded_space : numpy.array
+ Array containing the coordinates of the embedded space
+
+ ensemble_assignment : numpy.array
+ Array containing one int per ensemble conformation. These allow to
+ distinguish, in the complete embedded space, which conformations
+ belong to each ensemble. For instance if ensemble_assignment
+ is [1,1,1,1,2,2], it means that the first four conformations belong
+ to ensemble 1 and the last two to ensemble 2
+
+ nensembles : int
+ Number of ensembles
+
+ nsamples : int
+ samples to be drawn from the ensembles. Will be required in
+ a later stage in order to calculate dJS.
+
+ Returns
+ -------
+
+ kdes : scipy.stats.gaussian_kde
+ KDEs calculated from ensembles
+
+ resamples : list of numpy.array
+ For each KDE, draw samples according to the probability distribution
+ of the KDE mixture model
+
+ embedded_ensembles : list of numpy.array
+ List of numpy.array containing, each one, the elements of the
+ embedded space belonging to a certain ensemble
+ """
+ kdes = []
+ embedded_ensembles = []
+ resamples = []
+
+ if gaussian_kde is None:
+ # hack: if we are running with minimal dependencies then scipy was
+ # not imported and we have to bail here (see scipy import at top)
+ raise ImportError("For Kernel Density Estimation functionality you"
+ "need to import scipy")
+
+ for i in range(1, nensembles + 1):
+ this_embedded = embedded_space.transpose()[
+ np.where(np.array(ensemble_assignment) == i)].transpose()
+ embedded_ensembles.append(this_embedded)
+ kdes.append(gaussian_kde(
+ this_embedded))
+
+ # # Set number of samples
+ # if not nsamples:
+ # nsamples = this_embedded.shape[1] * 10
+
+ # Resample according to probability distributions
+ for this_kde in kdes:
+ resamples.append(this_kde.resample(nsamples))
+
+ return (kdes, resamples, embedded_ensembles)
+
+
+def dimred_ensemble_similarity(kde1, resamples1, kde2, resamples2,
+ ln_P1_exp_P1=None, ln_P2_exp_P2=None,
+ ln_P1P2_exp_P1=None, ln_P1P2_exp_P2=None):
+ """
+    Calculate the Jensen-Shannon divergence according to the
+    Dimensionality reduction method. In this case, we have continuous
+    probability densities, thus we need to integrate over the measurable
+    space. The aim is to first calculate the Kullback-Leibler divergence, which
+ is defined as:
+
+ .. math::
+ D_{KL}(P(x) || Q(x)) = \\int_{-\\infty}^{\\infty}P(x_i) ln(P(x_i)/Q(x_i)) = \\langle{}ln(P(x))\\rangle{}_P - \\langle{}ln(Q(x))\\rangle{}_P
+
+ where the :math:`\\langle{}.\\rangle{}_P` denotes an expectation calculated
+ under the distribution P. We can, thus, just estimate the expectation
+ values of the components to get an estimate of dKL.
+ Since the Jensen-Shannon distance is actually more complex, we need to
+ estimate four expectation values:
+
+ .. math::
+ \\langle{}log(P(x))\\rangle{}_P
+
+ \\langle{}log(Q(x))\\rangle{}_Q
+
+ \\langle{}log(0.5*(P(x)+Q(x)))\\rangle{}_P
+
+ \\langle{}log(0.5*(P(x)+Q(x)))\\rangle{}_Q
+
+ Parameters
+ ----------
+
+ kde1 : scipy.stats.gaussian_kde
+ Kernel density estimation for ensemble 1
+
+ resamples1 : numpy.array
+        Samples drawn according to kde1. Will be used as samples to
+ calculate the expected values according to 'P' as detailed before.
+
+ kde2 : scipy.stats.gaussian_kde
+ Kernel density estimation for ensemble 2
+
+ resamples2 : numpy.array
+        Samples drawn according to kde2. Will be used as sample to
+ calculate the expected values according to 'Q' as detailed before.
+
+    ln_P1_exp_P1 : float or None
+        Use this value for :math:`\\langle{}log(P(x))\\rangle{}_P`; if None,
+        calculate it instead
+
+ ln_P2_exp_P2 : float or None
+ Use this value for :math:`\\langle{}log(Q(x))\\rangle{}_Q`; if
+ None, calculate it instead
+
+ ln_P1P2_exp_P1 : float or None
+ Use this value for
+ :math:`\\langle{}log(0.5*(P(x)+Q(x)))\\rangle{}_P`;
+ if None, calculate it instead
+
+ ln_P1P2_exp_P2 : float or None
+ Use this value for
+ :math:`\\langle{}log(0.5*(P(x)+Q(x)))\\rangle{}_Q`;
+ if None, calculate it instead
+
+ Returns
+ -------
+ djs : float
+ Jensen-Shannon divergence calculated according to the dimensionality
+ reduction method
+
+
+
+
+ """
+
+ if not ln_P1_exp_P1 and not ln_P2_exp_P2 and not ln_P1P2_exp_P1 and not \
+ ln_P1P2_exp_P2:
+ ln_P1_exp_P1 = np.average(np.log(kde1.evaluate(resamples1)))
+ ln_P2_exp_P2 = np.average(np.log(kde2.evaluate(resamples2)))
+ ln_P1P2_exp_P1 = np.average(np.log(
+ 0.5 * (kde1.evaluate(resamples1) + kde2.evaluate(resamples1))))
+ ln_P1P2_exp_P2 = np.average(np.log(
+ 0.5 * (kde1.evaluate(resamples2) + kde2.evaluate(resamples2))))
+
+ return 0.5 * (
+ ln_P1_exp_P1 - ln_P1P2_exp_P1 + ln_P2_exp_P2 - ln_P1P2_exp_P2)
+
+
+def cumulative_gen_kde_pdfs(embedded_space, ensemble_assignment, nensembles,
+ nsamples, ens_id_min=1, ens_id_max=None):
+ """
+ Generate Kernel Density Estimates (KDE) from embedded spaces and
+ elaborate the coordinates for later use. However, consider more than
+ one ensemble as the space on which the KDE will be generated. In
+ particular, will use ensembles with ID [ens_id_min, ens_id_max].
+
+
+ Parameters
+ ----------
+
+ embedded_space : numpy.array
+ Array containing the coordinates of the embedded space
+
+ ensemble_assignment : numpy.array
+ array containing one int per ensemble conformation. These allow
+ to distinguish, in the complete embedded space, which
+ conformations belong to each ensemble. For instance if
+ ensemble_assignment is [1,1,1,1,2,2], it means that the first
+ four conformations belong to ensemble 1 and the last two
+ to ensemble 2
+
+ nensembles : int
+ Number of ensembles
+
+ nsamples : int
+ Samples to be drawn from the ensembles. Will be required in a later
+ stage in order to calculate dJS.
+
+ ens_id_min : int
+ Minimum ID of the ensemble to be considered; see description
+
+ ens_id_max : int
+ Maximum ID of the ensemble to be considered; see description. If None,
+ it will be set to the maximum possible value given the number of
+ ensembles.
+
+ Returns
+ -------
+
+ kdes : scipy.stats.gaussian_kde
+ KDEs calculated from ensembles
+
+ resamples : list of numpy.array
+ For each KDE, draw samples according to the probability
+ distribution of the kde mixture model
+
+ embedded_ensembles : list of numpy.array
+ List of numpy.array containing, each one, the elements of the
+ embedded space belonging to a certain ensemble
+
+
+ """
+
+ if gaussian_kde is None:
+ # hack: if we are running with minimal dependencies then scipy was
+ # not imported and we have to bail here (see scipy import at top)
+ raise ImportError("For Kernel Density Estimation functionality you"
+ "need to import scipy")
+
+ kdes = []
+ embedded_ensembles = []
+ resamples = []
+ if not ens_id_max:
+ ens_id_max = nensembles + 1
+ for i in range(ens_id_min, ens_id_max):
+ this_embedded = embedded_space.transpose()[np.where(
+ np.logical_and(ensemble_assignment >= ens_id_min,
+ ensemble_assignment <= i))].transpose()
+ embedded_ensembles.append(this_embedded)
+ kdes.append(
+ gaussian_kde(this_embedded))
+
+ # Resample according to probability distributions
+ for this_kde in kdes:
+ resamples.append(this_kde.resample(nsamples))
+
+ return (kdes, resamples, embedded_ensembles)
+
+
+def write_output(matrix, base_fname=None, header="", suffix="",
+ extension="dat"):
+ """
+ Write output matrix with a nice format, to stdout and optionally a file.
+
+ Parameters
+ ----------
+
+ matrix : encore.utils.TriangularMatrix
+ Matrix containing the values to be printed
+
+ base_fname : str
+ Basic filename for output. If None, no files will be written, and
+ the matrix will be just printed on standard output
+
+ header : str
+ Text to be written just before the matrix
+
+ suffix : str
+ String to be concatenated to basename, in order to get the final
+ file name
+
+ extension : str
+ Extension for the output file
+
+ """
+
+ if base_fname is not None:
+ fname = base_fname + "-" + suffix + "." + extension
+ else:
+ fname = None
+ matrix.square_print(header=header, fname=fname)
+
+
+def prepare_ensembles_for_convergence_increasing_window(ensemble,
+ window_size,
+ selection="name CA"):
+ """
+ Generate ensembles to be fed to ces_convergence or dres_convergence
+ from a single ensemble. Basically, the different slices the algorithm
+ needs are generated here.
+
+ Parameters
+ ----------
+
+ ensemble : :class:`~MDAnalysis.core.AtomGroup.Universe` object
+ Input ensemble
+
+ window_size : int
+ size of the window (in number of frames) to be used
+
+ selection : str
+ Atom selection string in the MDAnalysis format. Default is "name CA"
+
+ Returns
+ -------
+
+ tmp_ensembles :
+ The original ensemble is divided into different ensembles, each being
+ a window_size-long slice of the original ensemble. The last
+ ensemble will be bigger if the length of the input ensemble
+ is not exactly divisible by window_size.
+
+ """
+
+ ens_size = ensemble.trajectory.timeseries(ensemble.select_atoms(selection),
+ format='fac').shape[0]
+
+    rest_slices = ens_size // window_size
+ residuals = ens_size % window_size
+ slices_n = [0]
+
+ tmp_ensembles = []
+
+ for rs in range(rest_slices - 1):
+ slices_n.append(slices_n[-1] + window_size)
+ slices_n.append(slices_n[-1] + residuals + window_size)
+
+ for s,sl in enumerate(slices_n[:-1]):
+ tmp_ensembles.append(mda.Universe(
+ ensemble.filename,
+ ensemble.trajectory.timeseries()
+ [:, slices_n[s]:slices_n[s + 1], :],
+ format=MemoryReader))
+
+ return tmp_ensembles
+
+
+def hes(ensembles,
+ selection="name CA",
+ cov_estimator="shrinkage",
+ mass_weighted=True,
+ align=False,
+ details=False,
+ estimate_error=False,
+ bootstrapping_samples=100,
+ calc_diagonal=False):
+ """
+
+ Calculates the Harmonic Ensemble Similarity (HES) between ensembles using
+ the symmetrized version of Kullback-Leibler divergence as described
+ in [Lindorff-Larsen2009]_.
+
+ Parameters
+ ----------
+
+ ensembles : list
+ List of Universe objects for similarity measurements.
+
+ selection : str, optional
+ Atom selection string in the MDAnalysis format. Default is "name CA"
+
+ cov_estimator : str, optional
+        Covariance matrix estimator method, either shrinkage, `shrinkage`,
+        or Maximum Likelihood, `ml`. Default is shrinkage.
+
+ mass_weighted : bool, optional
+ Whether to perform mass-weighted covariance matrix estimation
+ (default is True).
+
+ align : bool, optional
+ Whether to align the ensembles before calculating their similarity.
+ Note: this changes the ensembles in-place, and will thus leave your
+ ensembles in an altered state.
+ (default is False)
+
+ details : bool, optional
+ Save the mean and covariance matrix for each
+ ensemble in a numpy array (default is False).
+
+ estimate_error : bool, optional
+ Whether to perform error estimation (default is False).
+
+ bootstrapping_samples : int, optional
+ Number of times the similarity matrix will be bootstrapped (default
+ is 100), only if estimate_error is True.
+
+ calc_diagonal : bool, optional
+ Whether to calculate the diagonal of the similarity scores
+ (i.e. the similarities of every ensemble against itself).
+ If this is False (default), 0.0 will be used instead.
+
+ Returns
+ -------
+
+ numpy.array (bidimensional)
+ Harmonic similarity measurements between each pair of ensembles.
+
+ Notes
+ -----
+
+ The method assumes that each ensemble is derived from a multivariate normal
+    distribution. The mean and covariance matrix are, thus, estimated from
+    the distribution of each ensemble and used for comparison by the
+ symmetrized version of Kullback-Leibler divergence defined as:
+
+ .. math::
+ D_{KL}(P(x) || Q(x)) = \\int_{-\\infty}^{\\infty}P(x_i)
+ ln(P(x_i)/Q(x_i)) = \\langle{}ln(P(x))\\rangle{}_P -
+ \\langle{}ln(Q(x))\\rangle{}_P
+
+
+ where the :math:`\\langle{}.\\rangle{}_P` denotes an expectation
+ calculated under the distribution P.
+
+ For each ensemble, the mean conformation is estimated as the average over
+ the ensemble, and the covariance matrix is calculated by default using a
+ shrinkage estimation method (or by a maximum-likelihood method,
+ optionally).
+
+ In the Harmonic Ensemble Similarity measurement no upper bound exists and
+ the measurement can therefore be used for absolute comparison between
+ multiple ensembles.
+
+ When using this similarity measure, consider whether you want to align
+ the ensembles first (see example below)
+
+ Example
+ -------
+
+ To calculate the Harmonic Ensemble similarity, two ensembles are created
+ as Universe object from a topology file and two trajectories. The
+ topology- and trajectory files used are obtained from the MDAnalysis
+ test suite for two different simulations of the protein AdK. To run the
+ examples see the module `Examples`_ for how to import the files: ::
+
+ >>> ens1 = Universe(PSF, DCD)
+ >>> ens2 = Universe(PSF, DCD2)
+ >>> HES, details = encore.hes([ens1, ens2])
+ >>> print HES
+ [[ 0. 38279683.95892926]
+ [ 38279683.95892926 0. ]]
+
+
+ You can use the align=True option to align the ensembles first. This will
+ align everything to the current timestep in the first ensemble. Note that
+ this changes the ens1 and ens2 objects:
+
+ >>> print encore.hes([ens1, ens2], align=True)[0]
+ [[ 0. 6880.34140106]
+ [ 6880.34140106 0. ]]
+
+ Alternatively, for greater flexibility in how the alignment should be done
+ you can call the rms_fit_trj function manually:
+
+ >>> from MDAnalysis.analysis import align
+ >>> align.rms_fit_trj(ens1, ens1, select="name CA", in_memory=True)
+ >>> align.rms_fit_trj(ens2, ens1, select="name CA", in_memory=True)
+ >>> print encore.hes([ens1, ens2])[0]
+ [[ 0. 7032.19607004]
+ [ 7032.19607004 0. ]]
+ """
+
+ # Ensure in-memory trajectories either by calling align
+ # with in_memory=True or by directly calling transfer_to_memory
+ # on the universe.
+ if align:
+ for ensemble in ensembles:
+ mda.analysis.align.rms_fit_trj(ensemble, ensembles[0],
+ select=selection,
+ mass_weighted=True,
+ in_memory=True)
+ else:
+ for ensemble in ensembles:
+ ensemble.transfer_to_memory()
+
+ if calc_diagonal:
+ pairs_indices = list(trm_indices_diag(len(ensembles)))
+ else:
+ pairs_indices = list(trm_indices_nodiag(len(ensembles)))
+
+ logging.info("Chosen metric: Harmonic similarity")
+ if cov_estimator == "shrinkage":
+ covariance_estimator = shrinkage_covariance_estimator
+ logging.info(" Covariance matrix estimator: Shrinkage")
+ elif cov_estimator == "ml":
+ covariance_estimator = ml_covariance_estimator
+ logging.info(" Covariance matrix estimator: Maximum Likelihood")
+ else:
+ logging.error(
+ "Covariance estimator {0} is not supported. "
+ "Choose between 'shrinkage' and 'ml'.".format(cov_estimator))
+ return None
+
+ out_matrix_eln = len(ensembles)
+
+ xs = []
+ sigmas = []
+
+ if estimate_error:
+ data = []
+ ensembles_list = []
+ for i, ensemble in enumerate(ensembles):
+ ensembles_list.append(
+ get_ensemble_bootstrap_samples(
+ ensemble,
+ samples=bootstrapping_samples))
+ for t in range(bootstrapping_samples):
+ logging.info("The coordinates will be bootstrapped.")
+
+ xs = []
+ sigmas = []
+ values = np.zeros((out_matrix_eln, out_matrix_eln))
+ for i, e_orig in enumerate(ensembles):
+ xs.append(np.average(
+ ensembles_list[i][t].trajectory.timeseries(
+ e_orig.select_atoms(selection),
+ format=('fac')),
+ axis=0).flatten())
+ sigmas.append(covariance_matrix(ensembles_list[i][t],
+ mass_weighted=True,
+ estimator=covariance_estimator,
+ selection=selection))
+
+ for pair in pairs_indices:
+ value = harmonic_ensemble_similarity(x1=xs[pair[0]],
+ x2=xs[pair[1]],
+ sigma1=sigmas[pair[0]],
+ sigma2=sigmas[pair[1]])
+ values[pair[0], pair[1]] = value
+ values[pair[1], pair[0]] = value
+ data.append(values)
+ avgs = np.average(data, axis=0)
+ stds = np.std(data, axis=0)
+
+ return (avgs, stds)
+
+ # Calculate the parameters for the multivariate normal distribution
+ # of each ensemble
+ values = np.zeros((out_matrix_eln, out_matrix_eln))
+
+ for e in ensembles:
+
+ # Extract coordinates from each ensemble
+ coordinates_system = e.trajectory.timeseries(e.select_atoms(selection),
+ format='fac')
+
+ # Average coordinates in each system
+ xs.append(np.average(coordinates_system, axis=0).flatten())
+
+ # Covariance matrices in each system
+ sigmas.append(covariance_matrix(e,
+ mass_weighted=mass_weighted,
+ estimator=covariance_estimator,
+ selection=selection))
+
+ for i, j in pairs_indices:
+ value = harmonic_ensemble_similarity(x1=xs[i],
+ x2=xs[j],
+ sigma1=sigmas[i],
+ sigma2=sigmas[j])
+ values[i, j] = value
+ values[j, i] = value
+
+ # Save details as required
+ if details:
+ kwds = {}
+ for i in range(out_matrix_eln):
+ kwds['ensemble{0:d}_mean'.format(i + 1)] = xs[i]
+ kwds['ensemble{0:d}_covariance_matrix'.format(i + 1)] = sigmas[i]
+ details = np.array(kwds)
+
+ else:
+ details = None
+
+ return values, details
+
+
def ces(ensembles,
        selection="name CA",
        clustering_method=AffinityPropagationNative(
            preference=-1.0,
            max_iter=500,
            convergence_iter=50,
            damping=0.9,
            add_noise=True),
        distance_matrix=None,
        estimate_error=False,
        bootstrapping_samples=10,
        ncores=1,
        calc_diagonal=False,
        allow_collapsed_result=True):
    """
    Calculates the Clustering Ensemble Similarity (CES) between ensembles
    using the Jensen-Shannon divergence as described in
    [Lindorff-Larsen2009]_.


    Parameters
    ----------

    ensembles : list
        List of ensemble objects for similarity measurements

    selection : str, optional
        Atom selection string in the MDAnalysis format. Default is "name CA"

    clustering_method :
        A single or a list of instances of the
        :class:`MDAnalysis.analysis.encore.clustering.ClusteringMethod`
        classes from the clustering module. Different parameters for the same
        clustering method can be explored by adding different instances of
        the same clustering class. Clustering methods options are the
        Affinity Propagation (default), the DBSCAN and the KMeans. The latter
        two methods need the sklearn python module installed.

    distance_matrix : encore.utils.TriangularMatrix
        Distance matrix for clustering methods. If this parameter
        is not supplied the matrix will be calculated on the fly.

    estimate_error : bool, optional
        Whether to perform error estimation (default is False).
        Only bootstrapping mode is supported.

    bootstrapping_samples : int, optional
        number of samples to be used for estimating error.

    ncores : int, optional
        Maximum number of cores to be used (default is 1).

    calc_diagonal : bool, optional
        Whether to calculate the diagonal of the similarity scores
        (i.e. the similarities of every ensemble against itself).
        If this is False (default), 0.0 will be used instead.

    allow_collapsed_result: bool, optional
        Whether a return value of a list of one value should be collapsed
        into just the value.


    Returns
    -------

    ces, details : numpy.array, numpy.array

        ces contains the similarity values, arranged in a numpy.array.
        If only one clustering_method is provided the output will be a
        2-dimensional square symmetrical numpy.array. The order of the matrix
        elements depends on the order of the input ensembles: for instance, if

            ensemble = [ens1, ens2, ens3]

        the matrix elements [0,2] and [2,0] will both contain the similarity
        value between ensembles ens1 and ens3.
        Elaborating on the previous example, if *n* ensembles are given and
        *m* clustering_methods are provided the output will be a list of *m*
        arrays ordered by the input sequence of methods, each with a *n*x*n*
        symmetrical similarity matrix.

        details contains information on the clustering: the individual size
        of each cluster, the centroids and the frames associated with each
        cluster.


    Notes
    -----

    In the Jensen-Shannon divergence the upper bound of ln(2) signifies
    no similarity between the two ensembles, the lower bound, 0.0,
    signifies identical ensembles.

    To calculate the CES, the affinity propagation method (or others, if
    specified) is used to partition the whole space of conformations. The
    population of each ensemble in each cluster is then taken as a
    probability density function. Different probability density functions
    from each ensemble are finally compared using the Jensen-Shannon
    divergence measure.

    Examples
    --------
    To calculate the Clustering Ensemble similarity, two ensembles are
    created as Universe object using a topology file and two trajectories.
    The topology- and trajectory files used are obtained from the MDAnalysis
    test suite for two different simulations of the protein AdK. To run the
    examples see the module `Examples`_ for how to import the files.
    Here the simplest case of just two :class:`Ensemble`s used for
    comparison are illustrated: ::

        >>> ens1 = Universe(PSF, DCD)
        >>> ens2 = Universe(PSF, DCD2)
        >>> CES,details = encore.ces([ens1,ens2])
        >>> print CES
        [[ 0.          0.68070702]
         [ 0.68070702  0.        ]]

    To use a different clustering method, set the parameter
    clustering_method (OBS the sklearn module must be installed). Likewise,
    different parameters for the same clustering method can be explored by
    adding different instances of the same clustering class: ::

        >>> CES, details = encore.ces([ens1,ens2],
                      clustering_method = [encore.DBSCAN(eps=0.45),
                                           encore.DBSCAN(eps=0.50)])
        >>> print "eps=0.45: ", CES[0], " eps=0.5 : ", CES[1]

    """

    # Similarity calculations revisit frames repeatedly and out of order,
    # so every trajectory must be available in memory first.
    for ensemble in ensembles:
        ensemble.transfer_to_memory()

    # (i, j) index pairs of the similarity matrix to fill; the diagonal is
    # optional because self-similarity is 0.0 by definition.
    if calc_diagonal:
        pairs_indices = list(trm_indices_diag(len(ensembles)))
    else:
        pairs_indices = list(trm_indices_nodiag(len(ensembles)))

    # Normalize the single-method case to a list so the rest of the code
    # can treat methods uniformly.
    clustering_methods = clustering_method
    if not hasattr(clustering_method, '__iter__'):
        clustering_methods = [clustering_method]

    # Some clustering methods work on a precomputed distance matrix, others
    # directly on coordinates; both kinds may be mixed in one call.
    any_method_accept_distance_matrix = \
        np.any([method.accepts_distance_matrix
                for method in clustering_methods])
    all_methods_accept_distance_matrix = \
        np.all([method.accepts_distance_matrix
                for method in clustering_methods])

    # Register which ensembles the samples belong to (1-based labels, one
    # entry per frame of the merged trajectory).
    ensemble_assignment = []
    for i, ensemble in enumerate(ensembles):
        ensemble_assignment += [i+1]*len(ensemble.trajectory)

    # Calculate distance matrix if not provided
    if any_method_accept_distance_matrix and not distance_matrix:
        distance_matrix = get_distance_matrix(merge_universes(ensembles),
                                              selection=selection,
                                              ncores=ncores)
    if estimate_error:
        # Bootstrap: resample the distance matrix and/or the ensembles so
        # that one clustering run is done per bootstrap replica.
        if any_method_accept_distance_matrix:
            distance_matrix = \
                get_distance_matrix_bootstrap_samples(
                    distance_matrix,
                    ensemble_assignment,
                    samples=bootstrapping_samples,
                    ncores=ncores)
        if not all_methods_accept_distance_matrix:
            ensembles_list = []
            for i, ensemble in enumerate(ensembles):
                ensembles_list.append(
                    get_ensemble_bootstrap_samples(
                        ensemble,
                        samples=bootstrapping_samples))
            # Transpose ensembles_list (per-ensemble samples) into one list
            # of ensembles per bootstrap replica.
            ensembles = []
            for j in range(bootstrapping_samples):
                ensembles.append([])
                for i, e in enumerate(ensembles_list):
                    ensembles[-1].append(e[j])
        else:
            # if all methods accept distances matrices, duplicate
            # ensemble so that it matches size of distance matrices
            # (no need to resample them since they will not be used)
            ensembles = [ensembles]*bootstrapping_samples

    # Call clustering procedure; ccs is flat, ordered method-major:
    # one ClusterCollection per (method, replica) pair.
    ccs = cluster(ensembles,
                  method=clustering_methods,
                  selection=selection,
                  distance_matrix=distance_matrix,
                  ncores=ncores,
                  allow_collapsed_result=False)

    # Do error analysis
    if estimate_error:
        # k walks the flat ccs list across the nested (method, sample) loop.
        k = 0
        values = {}
        avgs = []
        stds = []
        for i, p in enumerate(clustering_methods):
            failed_runs = 0  # NOTE(review): counted but never reported
            values[i] = []
            for j in range(bootstrapping_samples):
                if ccs[k].clusters is None:
                    # Clustering failed for this replica; skip it so it does
                    # not bias the average.
                    failed_runs += 1
                    k += 1
                    continue
                values[i].append(np.zeros((len(ensembles[j]),
                                           len(ensembles[j]))))

                for pair in pairs_indices:
                    # Calculate dJS
                    this_djs = \
                        clustering_ensemble_similarity(ccs[k],
                                                       ensembles[j][
                                                           pair[0]],
                                                       pair[0] + 1,
                                                       ensembles[j][
                                                           pair[1]],
                                                       pair[1] + 1,
                                                       selection=selection)
                    values[i][-1][pair[0], pair[1]] = this_djs
                    values[i][-1][pair[1], pair[0]] = this_djs
                k += 1
            # Average and spread over the bootstrap replicas of this method.
            outs = np.array(values[i])
            avgs.append(np.average(outs, axis=0))
            stds.append(np.std(outs, axis=0))

        # Collapse the single-method case back to bare arrays.
        if hasattr(clustering_method, '__iter__'):
            pass
        else:
            avgs = avgs[0]
            stds = stds[0]

        return avgs, stds

    # No error estimation: one similarity matrix per clustering method.
    values = []
    details = {}
    for i, p in enumerate(clustering_methods):
        if ccs[i].clusters is None:
            # Clustering failed for this method; no matrix is emitted.
            continue
        else:
            values.append(np.zeros((len(ensembles), len(ensembles))))

            for pair in pairs_indices:
                # Calculate dJS
                this_val = \
                    clustering_ensemble_similarity(ccs[i],
                                                   ensembles[pair[0]],
                                                   pair[0] + 1,
                                                   ensembles[pair[1]],
                                                   pair[1] + 1,
                                                   selection=selection)
                values[-1][pair[0], pair[1]] = this_val
                values[-1][pair[1], pair[0]] = this_val

    details['clustering'] = ccs

    if allow_collapsed_result and not hasattr(clustering_method, '__iter__'):
        values = values[0]

    return values, details
+
+
def dres(ensembles,
         selection="name CA",
         dimensionality_reduction_method=StochasticProximityEmbeddingNative(
             dimension=3,
             distance_cutoff=1.5,
             min_lam=0.1,
             max_lam=2.0,
             ncycle=100,
             nstep=10000),
         distance_matrix=None,
         nsamples=1000,
         estimate_error=False,
         bootstrapping_samples=100,
         ncores=1,
         calc_diagonal=False,
         allow_collapsed_result=True):
    """
    Calculates the Dimensional Reduction Ensemble Similarity (DRES) between
    ensembles using the Jensen-Shannon divergence as described in
    [Lindorff-Larsen2009]_.


    Parameters
    ----------

    ensembles : list
        List of ensemble objects for similarity measurements

    selection : str, optional
        Atom selection string in the MDAnalysis format. Default is "name CA"

    dimensionality_reduction_method :
        A single or a list of instances of the DimensionalityReductionMethod
        classes from the dimensionality_reduction module. Different
        parameters for the same method can be explored by adding different
        instances of the same dimensionality reduction class. Provided
        methods are the Stochastic Proximity Embedding (default) and the
        Principal Component Analysis.

    distance_matrix : encore.utils.TriangularMatrix
        conformational distance matrix, It will be calculated on the fly
        from the ensemble data if it is not provided.

    nsamples : int, optional
        Number of samples to be drawn from the ensembles (default is 1000).
        This is used to resample the density estimates and calculate the
        Jensen-Shannon divergence between ensembles.

    estimate_error : bool, optional
        Whether to perform error estimation (default is False)

    bootstrapping_samples : int, optional
        number of samples to be used for estimating error.

    ncores : int, optional
        Maximum number of cores to be used (default is 1).

    calc_diagonal : bool, optional
        Whether to calculate the diagonal of the similarity scores
        (i.e. the similarities of every ensemble against itself).
        If this is False (default), 0.0 will be used instead.

    allow_collapsed_result: bool, optional
        Whether a return value of a list of one value should be collapsed
        into just the value.


    Returns
    -------

    dres, details : numpy.array, numpy.array
        dres contains the similarity values, arranged in numpy.array.
        If one number of dimensions is provided as an integer,
        the output will be a 2-dimensional square symmetrical numpy.array.
        The order of the matrix elements depends on the order of the
        input ensemble: for instance, if

            ensemble = [ens1, ens2, ens3]

        then the matrix elements [0,2] and [2,0] will both contain the
        similarity value between ensembles ens1 and ens3.
        Elaborating on the previous example, if `n` ensembles are given and
        `m` methods are provided the output will be a list of `m` arrays
        ordered by the input sequence of methods, each with a `n`x`n`
        symmetrical similarity matrix.

        details provide an array of the reduced_coordinates.

    Notes
    -----

    To calculate the similarity, the method first projects the ensembles
    into lower dimensions by using the Stochastic Proximity Embedding (or
    others) algorithm. A gaussian kernel-based density estimation method is
    then used to estimate the probability density for each ensemble which is
    then used to compute the Jensen-Shannon divergence between each pair of
    ensembles.

    In the Jensen-Shannon divergence the upper bound of ln(2) signifies
    no similarity between the two ensembles, the lower bound, 0.0,
    signifies identical ensembles. However, due to the stochastic nature of
    the dimensional reduction in :func:`dres`, two identical ensembles will
    not necessarily result in an exact 0.0 estimate of the similarity but
    will be very close. For the same reason, calculating the similarity with
    the :func:`dres` twice will not result in two identical numbers; small
    differences have to be expected.

    Examples
    --------

    To calculate the Dimensional Reduction Ensemble similarity, two
    ensembles are created as Universe objects from a topology file and two
    trajectories. The topology- and trajectory files used are obtained from
    the MDAnalysis test suite for two different simulations of the protein
    AdK. To run the examples see the module `Examples`_ for how to import
    the files. Here the simplest case of comparing just two
    :class:`Ensemble`s are illustrated: ::

        >>> ens1 = Universe(PSF,DCD)
        >>> ens2 = Universe(PSF,DCD2)
        >>> DRES, details = encore.dres([ens1,ens2])
        >>> print DRES
        [[ 0.          0.67996043]
         [ 0.67996043  0.        ]]

    In addition to the quantitative similarity estimate, the dimensional
    reduction can easily be visualized, see the ``Example`` section in
    :mod:`MDAnalysis.analysis.encore.dimensionality_reduction.reduce_dimensionality`

    To use a different dimensional reduction methods, simply set the
    parameter dimensionality_reduction_method. Likewise, different
    parameters for the same clustering method can be explored by adding
    different instances of the same method class: ::

        >>> DRES, details = encore.dres([ens1,ens2],
                      dimensionality_reduction_method =
                          encore.PrincipleComponentAnalysis(dimension=2))
        >>> print DRES
        [[ 0.          0.69314718]
         [ 0.69314718  0.        ]]
    """

    # Similarity calculations revisit frames repeatedly and out of order,
    # so every trajectory must be available in memory first.
    for ensemble in ensembles:
        ensemble.transfer_to_memory()

    # (i, j) index pairs of the similarity matrix to fill; the diagonal is
    # optional because self-similarity is 0.0 by definition.
    if calc_diagonal:
        pairs_indices = list(trm_indices_diag(len(ensembles)))
    else:
        pairs_indices = list(trm_indices_nodiag(len(ensembles)))

    # Normalize the single-method case to a list so the rest of the code
    # can treat methods uniformly.
    dimensionality_reduction_methods = dimensionality_reduction_method
    if not hasattr(dimensionality_reduction_method, '__iter__'):
        dimensionality_reduction_methods = [dimensionality_reduction_method]

    # Some methods work on a precomputed distance matrix, others directly
    # on coordinates; both kinds may be mixed in one call.
    any_method_accept_distance_matrix = \
        np.any([method.accepts_distance_matrix
                for method in dimensionality_reduction_methods])
    all_methods_accept_distance_matrix = \
        np.all([method.accepts_distance_matrix
                for method in dimensionality_reduction_methods])

    # Register which ensembles the samples belong to (1-based labels, one
    # entry per frame of the merged trajectory).
    ensemble_assignment = []
    for i, ensemble in enumerate(ensembles):
        ensemble_assignment += [i+1]*len(ensemble.trajectory)

    # Calculate distance matrix if not provided
    if any_method_accept_distance_matrix and not distance_matrix:
        distance_matrix = get_distance_matrix(merge_universes(ensembles),
                                              selection=selection,
                                              ncores=ncores)
    if estimate_error:
        if any_method_accept_distance_matrix:
            distance_matrix = \
                get_distance_matrix_bootstrap_samples(
                    distance_matrix,
                    ensemble_assignment,
                    samples=bootstrapping_samples,
                    ncores=ncores)
        if not all_methods_accept_distance_matrix:
            ensembles_list = []
            for i, ensemble in enumerate(ensembles):
                ensembles_list.append(
                    get_ensemble_bootstrap_samples(
                        ensemble,
                        samples=bootstrapping_samples))
            # BUG FIX: the original indexed the plain Python list
            # ensembles_list with numpy-style syntax (ensembles_list[i, j],
            # ensembles_list.shape) and appended a generator instead of a
            # list, which raised TypeError/AttributeError as soon as this
            # branch ran. Rebuild one list of ensembles per bootstrap
            # replica exactly as ces() does.
            ensembles = []
            for j in range(bootstrapping_samples):
                ensembles.append([])
                for i, e in enumerate(ensembles_list):
                    ensembles[-1].append(e[j])
        else:
            # if all methods accept distances matrices, duplicate
            # ensemble so that it matches size of distance matrices
            # (no need to resample them since they will not be used)
            ensembles = [ensembles] * bootstrapping_samples

    # Call dimensionality reduction procedure; coordinates is flat,
    # ordered method-major: one entry per (method, replica) pair.
    coordinates, dim_red_details = reduce_dimensionality(
        ensembles,
        method=dimensionality_reduction_methods,
        selection=selection,
        distance_matrix=distance_matrix,
        ncores=ncores,
        allow_collapsed_result=False)

    details = {}
    details["reduced_coordinates"] = coordinates
    # BUG FIX: the original stored the details dict inside itself
    # (details[...] = details); store the details returned by
    # reduce_dimensionality() instead.
    details["dimensionality_reduction_details"] = dim_red_details

    if estimate_error:
        # k walks the flat coordinates list across the nested
        # (method, sample) loop.
        k = 0
        values = {}
        avgs = []
        stds = []
        for i, method in enumerate(dimensionality_reduction_methods):
            values[i] = []
            for j in range(bootstrapping_samples):

                values[i].append(np.zeros((len(ensembles[j]),
                                           len(ensembles[j]))))

                kdes, resamples, embedded_ensembles = gen_kde_pdfs(
                    coordinates[k],
                    ensemble_assignment,
                    len(ensembles[j]),
                    nsamples=nsamples)

                for pair in pairs_indices:
                    this_value = dimred_ensemble_similarity(
                        kdes[pair[0]],
                        resamples[pair[0]],
                        kdes[pair[1]],
                        resamples[pair[1]])
                    values[i][-1][pair[0], pair[1]] = this_value
                    values[i][-1][pair[1], pair[0]] = this_value

                k += 1
            # Average and spread over the bootstrap replicas of this method.
            outs = np.array(values[i])
            avgs.append(np.average(outs, axis=0))
            stds.append(np.std(outs, axis=0))

        # Collapse the single-method case back to bare arrays.
        if hasattr(dimensionality_reduction_method, '__iter__'):
            pass
        else:
            avgs = avgs[0]
            stds = stds[0]

        return avgs, stds

    # No error estimation: one similarity matrix per method.
    values = []

    for i, method in enumerate(dimensionality_reduction_methods):

        values.append(np.zeros((len(ensembles), len(ensembles))))
        kdes, resamples, embedded_ensembles = gen_kde_pdfs(
            coordinates[i],
            ensemble_assignment,
            len(ensembles),
            nsamples=nsamples)

        for pair in pairs_indices:
            this_value = dimred_ensemble_similarity(kdes[pair[0]],
                                                    resamples[pair[0]],
                                                    kdes[pair[1]],
                                                    resamples[pair[1]])
            values[-1][pair[0], pair[1]] = this_value
            values[-1][pair[1], pair[0]] = this_value

    if allow_collapsed_result and not hasattr(dimensionality_reduction_method,
                                              '__iter__'):
        values = values[0]

    return values, details
+
+
def ces_convergence(original_ensemble,
                    window_size,
                    selection="name CA",
                    clustering_method=AffinityPropagationNative(
                        preference=-1.0,
                        max_iter=500,
                        convergence_iter=50,
                        damping=0.9,
                        add_noise=True),
                    ncores=1):
    """
    Use the CES to evaluate the convergence of the ensemble/trajectory.
    CES will be calculated between the whole trajectory contained in an
    ensemble and windows of such trajectory of increasing sizes, so that
    the similarity values should gradually drop to zero. The rate at which
    the value reach zero will be indicative of how much the trajectory
    keeps on resampling the same regions of the conformational space, and
    therefore of convergence.


    Parameters
    ----------

    original_ensemble : :class:`~MDAnalysis.core.AtomGroup.Universe` object
        ensemble containing the trajectory whose convergence has to
        estimated

    window_size : int
        Size of window to be used, in number of frames

    selection : str, optional
        Atom selection string in the MDAnalysis format. Default is "name CA"

    clustering_method : MDAnalysis.analysis.encore.clustering.ClusteringMethod
        A single or a list of instances of the ClusteringMethod classes from
        the clustering module. Different parameters for the same clustering
        method can be explored by adding different instances of the same
        clustering class.

    ncores : int, optional
        Maximum number of cores to be used (default is 1).


    Returns
    -------

    out : np.array
        array of shape (number_of_frames / window_size, preference_values).


    Example
    --------
    To calculate the convergence of a trajectory using the clustering
    ensemble similarity method a Universe object is created from a topology
    file and the trajectory. The topology- and trajectory files used are
    obtained from the MDAnalysis test suite for two different simulations of
    the protein AdK. To run the examples see the module `Examples`_ for how
    to import the files. Here the simplest case of evaluating the
    convergence is illustrated by splitting the trajectory into a
    window_size of 10 frames: ::

        >>> ens1 = Universe(PSF,DCD)
        >>> ces_conv = encore.ces_convergence(ens1, 10)
        >>> print ces_conv
        [[ 0.48194205]
         [ 0.40284672]
         [ 0.31699026]
         [ 0.25220447]
         [ 0.19829817]
         [ 0.14642725]
         [ 0.09911411]
         [ 0.05667391]
         [ 0.        ]]

    """

    # Split the trajectory into cumulative windows of increasing size; the
    # last "window" is the full trajectory.
    ensembles = prepare_ensembles_for_convergence_increasing_window(
        original_ensemble, window_size, selection=selection)

    # One ClusterCollection per clustering method (results not collapsed).
    ccs = cluster(ensembles,
                  selection=selection,
                  method=clustering_method,
                  allow_collapsed_result=False,
                  ncores=ncores)

    out = []
    for cc in ccs:
        if cc.clusters is None:
            # Clustering failed for this method; skip its row.
            continue
        out.append(np.zeros(len(ensembles)))
        # Similarity of each cumulative window (1-based label j+1) against
        # the full trajectory.
        for j, ensemble in enumerate(ensembles):
            out[-1][j] = cumulative_clustering_ensemble_similarity(
                cc,
                len(ensembles),
                j + 1)

    # Transpose so rows are windows and columns are methods.
    out = np.array(out).T
    return out
+
+
def dres_convergence(original_ensemble,
                     window_size,
                     selection="name CA",
                     dimensionality_reduction_method=
                         StochasticProximityEmbeddingNative(
                             dimension=3,
                             distance_cutoff=1.5,
                             min_lam=0.1,
                             max_lam=2.0,
                             ncycle=100,
                             nstep=10000
                         ),
                     nsamples=1000,
                     ncores=1):
    """
    Use the DRES to evaluate the convergence of the ensemble/trajectory.
    DRES will be calculated between the whole trajectory contained in an
    ensemble and windows of such trajectory of increasing sizes, so that
    the similarity values should gradually drop to zero. The rate at which
    the value reach zero will be indicative of how much the trajectory
    keeps on resampling the same areas of the conformational space, and
    therefore of convergence.

    Parameters
    ----------

    original_ensemble : :class:`~MDAnalysis.core.AtomGroup.Universe` object
        ensemble containing the trajectory whose convergence has to
        estimated

    window_size : int
        Size of window to be used, in number of frames

    selection : str, optional
        Atom selection string in the MDAnalysis format. Default is "name CA"

    dimensionality_reduction_method :
        A single or a list of instances of the DimensionalityReductionMethod
        classes from the dimensionality_reduction module. Different
        parameters for the same method can be explored by adding different
        instances of the same dimensionality reduction class.

    nsamples : int, optional
        Number of samples to be drawn from the ensembles (default is 1000).
        This is akin to the nsamples parameter of dres().

    ncores : int, optional
        Maximum number of cores to be used (default is 1).


    Returns
    -------

    out : np.array
        array of shape (number_of_frames / window_size, preference_values).


    Example
    --------
    To calculate the convergence of a trajectory using the DRES
    method, a Universe object is created from a topology file and the
    trajectory. The topology- and trajectory files used are obtained from
    the MDAnalysis test suite for two different simulations of the protein
    AdK. To run the examples see the module `Examples`_ for how to import
    the files. Here the simplest case of evaluating the convergence is
    illustrated by splitting the trajectory into a window_size of 10
    frames: ::

        >>> ens1 = Universe(PSF,DCD)
        >>> dres_conv = encore.dres_convergence(ens1, 10)
        >>> print dres_conv
        [[ 0.5295528 ]
         [ 0.40716539]
         [ 0.31158669]
         [ 0.25314041]
         [ 0.20447271]
         [ 0.13212364]
         [ 0.06979114]
         [ 0.05214759]
         [ 0.        ]]

    Here, the rate at which the values reach zero will be indicative of how
    much the trajectory keeps on resampling the same areas of the
    conformational space, and therefore of convergence.
    """

    # Split the trajectory into cumulative windows of increasing size; the
    # last "window" is the full trajectory.
    ensembles = prepare_ensembles_for_convergence_increasing_window(
        original_ensemble, window_size, selection=selection)

    # One set of reduced coordinates per method (results not collapsed).
    coordinates, dimred_details = \
        reduce_dimensionality(
            ensembles,
            selection=selection,
            method=dimensionality_reduction_method,
            allow_collapsed_result=False,
            ncores=ncores)

    # 1-based window label for every frame of the concatenated windows.
    ensemble_assignment = []
    for i, ensemble in enumerate(ensembles):
        ensemble_assignment += [i+1]*len(ensemble.trajectory)
    ensemble_assignment = np.array(ensemble_assignment)

    out = []
    for i, _ in enumerate(coordinates):

        out.append(np.zeros(len(ensembles)))

        # Kernel density estimates built cumulatively over the windows;
        # kdes[-1]/resamples[-1] correspond to the full trajectory.
        kdes, resamples, embedded_ensembles = \
            cumulative_gen_kde_pdfs(
                coordinates[i],
                ensemble_assignment=ensemble_assignment,
                nensembles=len(ensembles),
                nsamples=nsamples)

        # Similarity of each cumulative window against the full trajectory.
        for j, ensemble in enumerate(ensembles):
            out[-1][j] = dimred_ensemble_similarity(kdes[-1],
                                                    resamples[-1],
                                                    kdes[j],
                                                    resamples[j])

    # Transpose so rows are windows and columns are methods.
    out = np.array(out).T
    return out
diff --git a/package/MDAnalysis/analysis/encore/utils.py b/package/MDAnalysis/analysis/encore/utils.py
new file mode 100644
index 00000000000..e4e72415279
--- /dev/null
+++ b/package/MDAnalysis/analysis/encore/utils.py
@@ -0,0 +1,485 @@
+# utils.py --- Utility functions for encore package
+# Copyright (C) 2014 Wouter Boomsma, Matteo Tiberti
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+
+
+from multiprocessing.sharedctypes import SynchronizedArray
+from multiprocessing import Process, Manager
+import numpy as np
+import sys
+import MDAnalysis as mda
+from ...coordinates.memory import MemoryReader
+
class TriangularMatrix(object):
    """Triangular matrix class. This class is designed to provide a
    memory-efficient representation of a triangular matrix that still
    behaves as a square symmetric one. The class wraps a numpy.array object,
    in which data are memorized in row-major order. It also has few
    additional facilities to conveniently load/write a matrix from/to file.
    It can be accessed using the [] and () operators, similarly to a normal
    numpy array.

    Attributes:
    -----------

    `size` : int
        Size of the matrix (number of rows or number of columns)

    `metadata` : dict
        Metadata for the matrix (date of creation, name of author ...)
    """

    def __init__(self, size, metadata=None, loadfile=None):
        """Class constructor.

        Attributes
        ----------

        `size` : int or multiprocessing.sharedctypes.SynchronizedArray
            Size of the matrix (number of rows or columns). If an array is
            provided instead, the size of the triangular matrix will be
            calculated and the array copied as the matrix elements.
            Otherwise, the matrix is just initialized to zero.

        `metadata` : dict or None
            Metadata dictionary. Used to generate the metadata attribute.

        `loadfile` : str or None
            Load the matrix from this file. All the attributes and data will
            be determined by the matrix file itself (i.e. metadata will be
            ignored); size has to be provided though.
        """
        self.metadata = metadata
        self.size = size
        if loadfile:
            self.loadz(loadfile)
            return
        if isinstance(size, int):
            self.size = size
            # A triangular matrix including the diagonal has n*(n+1)/2
            # elements; floor division keeps the length an int on Python 3.
            self._elements = np.zeros((size + 1) * size // 2,
                                      dtype=np.float64)
            return
        if isinstance(size, SynchronizedArray):
            self._elements = np.array(size.get_obj(), dtype=np.float64)
            # Invert n*(n+1)/2 == len(size) to recover the matrix size n.
            self.size = int((np.sqrt(1 + 8 * len(size)) - 1) / 2)
            return
        raise TypeError("size must be an int or a "
                        "multiprocessing SynchronizedArray")

    def __getitem__(self, args):
        # Symmetric access: (x, y) and (y, x) map to the same stored element
        # of the lower triangle. Floor division keeps the index an int.
        x, y = args
        if x < y:
            x, y = y, x
        return self._elements[x * (x + 1) // 2 + y]

    def __setitem__(self, args, val):
        # Symmetric write: see __getitem__.
        x, y = args
        if x < y:
            x, y = y, x
        self._elements[x * (x + 1) // 2 + y] = val

    def as_array(self):
        """Return standard numpy array equivalent"""
        a = np.zeros((self.size, self.size))
        a[np.tril_indices(self.size)] = self._elements
        # Mirror the lower triangle into the upper one.
        a[np.triu_indices(self.size)] = a.T[np.triu_indices(self.size)]
        return a

    def savez(self, fname):
        """Save matrix in the npz compressed numpy format. Save metadata
        and data as well.

        Parameters
        ----------

        `fname` : str
            Name of the file to be saved.
        """
        np.savez(fname, elements=self._elements, metadata=self.metadata)

    def loadz(self, fname):
        """Load matrix from the npz compressed numpy format.

        Parameters
        ----------

        `fname` : str
            Name of the file to be loaded.
        """
        # allow_pickle is required on modern numpy to read back the
        # object-typed metadata entry written by savez(). NOTE: only load
        # files produced by this class -- pickle data is not safe from
        # untrusted sources.
        loaded = np.load(fname, allow_pickle=True)

        # A 0-d metadata array means savez() was given metadata=None;
        # otherwise validate the stored frame count against our size.
        if loaded['metadata'].shape != ():
            if loaded['metadata']['number of frames'] != self.size:
                raise TypeError
            self.metadata = loaded['metadata']
        else:
            # n*(n-1)/2 + n is the expected triangular element count.
            if self.size * (self.size - 1) // 2 + self.size != \
                    len(loaded['elements']):
                raise TypeError
        self._elements = loaded['elements']

    def __add__(self, scalar):
        """Add scalar to matrix elements.

        Parameters
        ----------

        `scalar` : float
            Scalar to be added.
        """
        newMatrix = self.__class__(self.size)
        newMatrix._elements = self._elements + scalar
        return newMatrix

    def __iadd__(self, scalar):
        """Add scalar to matrix elements.

        Parameters
        ----------

        `scalar` : float
            Scalar to be added.
        """
        self._elements += scalar
        return self

    def __mul__(self, scalar):
        """Multiply with scalar.

        Parameters
        ----------

        `scalar` : float
            Scalar to multiply with.
        """
        newMatrix = self.__class__(self.size)
        newMatrix._elements = self._elements * scalar
        return newMatrix

    def __imul__(self, scalar):
        """Multiply with scalar.

        Parameters
        ----------

        `scalar` : float
            Scalar to multiply with.
        """
        self._elements *= scalar
        return self

    __rmul__ = __mul__

    def __str__(self):
        return str(self.as_array())
+
+
class ParallelCalculation(object):
    """
    Generic parallel calculation class. Can use arbitrary functions,
    arguments to functions and kwargs to functions.

    Attributes
    ----------

    `ncores` : int
        Number of cores to be used for parallel calculation

    `function` : callable object
        Function to be run in parallel.

    `args` : list of tuples
        Each tuple contains the arguments that will be passed to
        function(). This means that a call to function() is performed for
        each tuple. function is called as function(*args, **kwargs). Runs
        are distributed on the requested numbers of cores.

    `kwargs` : list of dicts
        Each tuple contains the named arguments that will be passed to
        function, similarly as described for the args attribute.

    `nruns` : int
        Number of runs to be performed. Must be equal to len(args) and
        len(kwargs).
    """

    def __init__(self, ncores, function, args=None, kwargs=None):
        """ Class constructor.

        Parameters
        ----------

        `ncores` : int
            Number of cores to be used for parallel calculation

        `function` : object that supports __call__, as functions
            function to be run in parallel.

        `args` : list of tuples
            Arguments for function; see the ParallelCalculation class
            description.

        `kwargs` : list of dicts or None
            kwargs for function; see the ParallelCalculation
            class description.
        """
        self.ncores = ncores

        # BUG FIX: the args-is-None guard must run before len(args) is
        # used below, otherwise ParallelCalculation(n, f) crashed with
        # "len() of NoneType" despite the guard.
        if args is None:
            args = []
        self.args = args

        # args[i] should be a list of args, one for each run; a single
        # callable is replicated to one entry per run.
        self.functions = function
        if not hasattr(self.functions, '__iter__'):
            self.functions = [self.functions] * len(args)
        if len(self.functions) != len(args):
            # Cycle the provided functions to cover all runs; floor
            # division keeps the repeat count an int on Python 3.
            self.functions = self.functions[:] * \
                (len(args) // len(self.functions))

        # If kwargs are not present, use empty dicts
        if kwargs:
            self.kwargs = kwargs
        else:
            self.kwargs = [{} for i in self.args]

        self.nruns = len(args)

    def worker(self, q, results):
        """
        Generic worker. Will run function with the prescribed args and
        kwargs.

        Parameters
        ----------

        `q` : multiprocessing.Manager.Queue object
            work queue, from which the worker fetches arguments and
            messages

        `results` : multiprocessing.Manager.Queue object
            results queue, where results are put after each calculation is
            finished

        """
        while True:
            i = q.get()
            if i == 'STOP':
                return

            results.put((i, self.functions[i](*self.args[i],
                                              **self.kwargs[i])))

    def run(self):
        """
        Run parallel calculation.

        Returns
        -------

        `results` : tuple of ordered tuples (int, object)
            int is the number of the calculation corresponding to a
            certain argument in the args list, and object is the result of
            corresponding calculation. For instance, in (3, output), output
            is the return of function(\\*args[3], \\*\\*kwargs[3]).
        """
        results_list = []
        if self.ncores == 1:
            # Serial fast path: no processes, no queues.
            for i in range(self.nruns):
                results_list.append((i, self.functions[i](*self.args[i],
                                                          **self.kwargs[i])))
        else:
            manager = Manager()
            q = manager.Queue()
            results = manager.Queue()

            workers = [Process(target=self.worker, args=(q, results))
                       for i in range(self.ncores)]

            # Enqueue all work items, then one STOP sentinel per worker.
            for i in range(self.nruns):
                q.put(i)
            for w in workers:
                q.put('STOP')

            for w in workers:
                w.start()

            for w in workers:
                w.join()

            # Drain the results queue up to our own sentinel.
            results.put('STOP')
            for i in iter(results.get, 'STOP'):
                results_list.append(i)

        # Return results sorted by run index regardless of completion order.
        return tuple(sorted(results_list, key=lambda x: x[0]))
+
+
class ProgressBar(object):
    """Handle and draw a text progress bar.
    From https://github.com/ikame/progressbar
    """

    def __init__(self, start=0, end=10, width=12, fill='=', blank='.',
                 format='[%(fill)s>%(blank)s] %(progress)s%%',
                 incremental=True):
        super(ProgressBar, self).__init__()

        # Bar configuration: value range, drawing width and glyphs.
        self.start = start
        self.end = end
        self.width = width
        self.fill = fill
        self.blank = blank
        self.format = format
        self.incremental = incremental
        # Percent covered by one character cell of the bar.
        self.step = 100 / float(width)  # fix
        self.reset()

    def __add__(self, increment):
        # Advance the bar by `increment` units, capping at 100%.
        delta = self._get_progress(increment)
        total = self.progress + delta
        self.progress = total if total < 100 else 100
        return self

    def __str__(self):
        # Render the bar: filled cells, the '>' head, then blank cells.
        n_filled = int(self.progress / self.step)  # fix
        parts = {
            'fill': self.fill * n_filled,
            'blank': self.blank * (self.width - n_filled),
            'progress': int(self.progress),
        }
        return self.format % parts

    __repr__ = __str__

    def _get_progress(self, increment):
        # Convert a value in [start, end] units to a percentage.
        return float(increment * 100) / self.end

    def reset(self):
        """Resets the current progress to the start point"""
        self.progress = self._get_progress(self.start)
        return self

    def update(self, progress):
        """Update the progress value instead of incrementing it"""
        pct = self._get_progress(progress)
        self.progress = pct if pct < 100 else 100
+
+
class AnimatedProgressBar(ProgressBar):
    """Extends ProgressBar to allow you to use it straightforward in a
    script. Accepts an extra keyword argument named `stdout`
    (by default use sys.stdout).
    The progress status may be send to any file-object.
    """

    def __init__(self, *args, **kwargs):
        # BUG FIX: `stdout` must be removed from kwargs before delegating,
        # otherwise ProgressBar.__init__ raises TypeError on the unexpected
        # keyword and the advertised `stdout` argument is unusable.
        self.stdout = kwargs.pop('stdout', sys.stdout)
        super(AnimatedProgressBar, self).__init__(*args, **kwargs)

    def show_progress(self):
        # On a terminal redraw in place with '\r'; otherwise (e.g. a log
        # file) emit one bar per line.
        if hasattr(self.stdout, 'isatty') and self.stdout.isatty():
            self.stdout.write('\r')
        else:
            self.stdout.write('\n')
        self.stdout.write(str(self))
        self.stdout.flush()
+
+
def trm_indeces(a, b):
    """
    Generate (i,j) indeces of a triangular matrix, between elements a and b.
    The matrix size is automatically determined from the number of elements.
    For instance: trm_indeces((0,0),(2,1)) yields (0,0) (1,0) (1,1) (2,0)
    (2,1).

    Parameters
    ----------

    `a` : (int i, int j) tuple
        starting matrix element.

    `b` : (int i, int j) tuple
        final matrix element.
    """
    row, col = a
    # Walk the lower triangle row by row until the final row is reached;
    # hitting the diagonal (col == row) ends the current row.
    while row < b[0]:
        yield (row, col)
        if col == row:
            col = 0
            row += 1
        else:
            col += 1
    # Final row: stop at column b[1] instead of the diagonal.
    for col in range(col, b[1] + 1):
        yield (row, col)
+
+
def trm_indices_nodiag(n):
    """generate (i,j) indeces of a triangular matrix of n rows (or columns),
    without diagonal (e.g. no elements (0,0),(1,1),...,(n,n))

    Parameters
    ----------

    `n` : int
        Matrix size
    """

    # range() instead of the Python-2-only xrange(): iteration semantics
    # are identical here and the generator keeps working on Python 3.
    for i in range(1, n):
        for j in range(i):
            yield (i, j)
+
+
def trm_indices_diag(n):
    """generate (i,j) indeces of a triangular matrix of n rows (or columns),
    with diagonal

    Parameters
    ----------

    `n` : int
        Matrix size
    """

    # range() instead of the Python-2-only xrange(): iteration semantics
    # are identical here and the generator keeps working on Python 3.
    for i in range(0, n):
        for j in range(i + 1):
            yield (i, j)
+
+
def merge_universes(universes):
    """
    Merge list of universes into one

    Parameters
    ----------
    `universes` : list of Universe objects


    Returns
    ----------
    Universe object
    """

    # Ensure every trajectory is held in memory so that timeseries() below
    # can deliver all frames without streaming from disk.
    for universe in universes:
        universe.transfer_to_memory()

    # Build a new in-memory Universe from the first topology and the
    # concatenation of all trajectories. NOTE(review): axis=1 is assumed to
    # be the frame axis of the timeseries layout expected by MemoryReader
    # -- confirm against the MemoryReader documentation.
    return mda.Universe(
        universes[0].filename,
        np.concatenate(tuple([e.trajectory.timeseries() for e in universes]),
                       axis=1),
        format=MemoryReader)
diff --git a/package/MDAnalysis/lib/src/dimensionality_reduction/stochasticproxembed.h b/package/MDAnalysis/lib/src/dimensionality_reduction/stochasticproxembed.h
new file mode 100755
index 00000000000..e5d58061cf6
--- /dev/null
+++ b/package/MDAnalysis/lib/src/dimensionality_reduction/stochasticproxembed.h
@@ -0,0 +1,27 @@
/* Prototypes for the stochastic proximity embedding (SPE)
 * dimensionality-reduction C implementation.
 * NOTE(review): semantics below are inferred from names only --
 * confirm against the corresponding .c implementation. */

/* Presumably maps a (row, column) pair to a flat triangular-matrix
 * index -- verify against the implementation. */
int trmIndex(int, int);

double ed(double*, int, int, int);

double stress(double, double, int, int);

int neighbours(double, int, double, int*, int*, int*);

/* qsort-compatible comparison wrapper (const void* signature). */
int cmp_ivwrapper(const void*, const void*);

int nearest_neighbours(double*, int, int);

/* Entry point of the SPE algorithm. */
double CStochasticProximityEmbedding(
        double*,
        double*,
        double,
        int,
        int,
        double,
        double,
        int,
        int,
        int);
+
+
+
+
diff --git a/package/setup.py b/package/setup.py
index 834698e1966..c483e22bda6 100755
--- a/package/setup.py
+++ b/package/setup.py
@@ -282,6 +282,7 @@ def extensions(config):
source_suffix = '.pyx' if use_cython else '.c'
# The callable is passed so that it is only evaluated at install time.
+
include_dirs = [get_numpy_include]
dcd = MDAExtension('coordinates._dcdmodule',
@@ -332,8 +333,24 @@ def extensions(config):
sources=['MDAnalysis/lib/formats/cython_util' + source_suffix],
include_dirs=include_dirs)
+ encore_utils = MDAExtension('analysis.encore.cutils',
+ sources = ['MDAnalysis/analysis/encore/cutils' + source_suffix],
+ include_dirs = include_dirs,
+ extra_compile_args = ["-O3", "-ffast-math"])
+ ap_clustering = MDAExtension('analysis.encore.clustering.affinityprop',
+ sources = ['MDAnalysis/analysis/encore/clustering/affinityprop' + source_suffix, 'MDAnalysis/analysis/encore/clustering/src/ap.c'],
+ include_dirs = include_dirs+['MDAnalysis/analysis/encore/clustering/include'],
+ libraries=["m"],
+ extra_compile_args=["-O3", "-ffast-math","-std=c99"])
+ spe_dimred = MDAExtension('analysis.encore.dimensionality_reduction.stochasticproxembed',
+ sources = ['MDAnalysis/analysis/encore/dimensionality_reduction/stochasticproxembed' + source_suffix, 'MDAnalysis/analysis/encore/dimensionality_reduction/src/spe.c'],
+ include_dirs = include_dirs+['MDAnalysis/analysis/encore/dimensionality_reduction/include'],
+ libraries=["m"],
+ extra_compile_args=["-O3", "-ffast-math","-std=c99"])
pre_exts = [dcd, dcd_time, distances, distances_omp, qcprot,
- transformation, libmdaxdr, util]
+ transformation, libmdaxdr, util, encore_utils,
+ ap_clustering, spe_dimred]
+
cython_generated = []
if use_cython:
extensions = cythonize(pre_exts)
@@ -492,6 +509,8 @@ def dynamic_author_list():
'scipy',
'seaborn', # for annotated heat map and nearest neighbor
# plotting in PSA
+ 'sklearn', # For clustering and dimensionality reduction
+ # functionality in encore
],
},
test_suite="MDAnalysisTests",
diff --git a/testsuite/MDAnalysisTests/analysis/test_encore.py b/testsuite/MDAnalysisTests/analysis/test_encore.py
new file mode 100644
index 00000000000..b8e1636d856
--- /dev/null
+++ b/testsuite/MDAnalysisTests/analysis/test_encore.py
@@ -0,0 +1,814 @@
+# -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*-
+# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 fileencoding=utf-8
+#
+# MDAnalysis --- http://www.MDAnalysis.org
+# Copyright (c) 2006-2015 Naveen Michaud-Agrawal, Elizabeth J. Denning, Oliver Beckstein
+# and contributors (see AUTHORS for the full list)
+#
+# Released under the GNU Public Licence, v2 or any higher version
+#
+# Please cite your use of MDAnalysis in published work:
+#
+# N. Michaud-Agrawal, E. J. Denning, T. B. Woolf, and O. Beckstein.
+# MDAnalysis: A Toolkit for the Analysis of Molecular Dynamics Simulations.
+# J. Comput. Chem. 32 (2011), 2319--2327, doi:10.1002/jcc.21787
+
+from __future__ import print_function
+
+import MDAnalysis as mda
+import MDAnalysis.analysis.encore as encore
+
+import tempfile
+import numpy as np
+
+from numpy.testing import (TestCase, dec, assert_equal, assert_almost_equal)
+
+from MDAnalysisTests.datafiles import DCD, DCD2, PSF
+from MDAnalysisTests import parser_not_found, module_not_found
+
+import MDAnalysis.analysis.rms as rms
+import MDAnalysis.analysis.align as align
+
+
class TestEncore(TestCase):
    """Tests for the encore ensemble-similarity measures (hes/ces/dres),
    the conformational distance matrix and the encore utilities.

    Trajectories are read from disk once in ``setUpClass`` and rebuilt
    in memory for each test in ``setUp``.
    """

    @dec.skipif(parser_not_found('DCD'),
                'DCD parser not available. Are you using python 3?')
    def setUp(self):
        # Create universe from templates defined in setUpClass
        self.ens1 = mda.Universe(
            self.ens1_template.filename,
            self.ens1_template.trajectory.timeseries(),
            format=mda.coordinates.memory.MemoryReader)

        self.ens2 = mda.Universe(
            self.ens2_template.filename,
            self.ens2_template.trajectory.timeseries(),
            format=mda.coordinates.memory.MemoryReader)

    def tearDown(self):
        del self.ens1
        del self.ens2

    @classmethod
    @dec.skipif(parser_not_found('DCD'),
                'DCD parser not available. Are you using python 3?')
    def setUpClass(cls):
        # To speed up tests, we read in trajectories from file only once,
        # and then recreate them from their coordinate array for each test
        super(TestEncore, cls).setUpClass()
        cls.ens1_template = mda.Universe(PSF, DCD)
        cls.ens2_template = mda.Universe(PSF, DCD2)

        cls.ens1_template.transfer_to_memory()
        cls.ens2_template.transfer_to_memory()

        # Filter ensembles to only include every 5th frame
        cls.ens1_template = mda.Universe(
            cls.ens1_template.filename,
            np.copy(cls.ens1_template.trajectory.timeseries()[:, ::5, :]),
            format=mda.coordinates.memory.MemoryReader)
        cls.ens2_template = mda.Universe(
            cls.ens2_template.filename,
            np.copy(cls.ens2_template.trajectory.timeseries()[:, ::5, :]),
            format=mda.coordinates.memory.MemoryReader)

    @classmethod
    def tearDownClass(cls):
        del cls.ens1_template
        del cls.ens2_template

    @staticmethod
    def test_triangular_matrix():
        # Round-trip read/write, save/load and scalar arithmetic on
        # encore's TriangularMatrix.
        scalar = 2
        size = 3
        expected_value = 1.984
        # NOTE(review): tempfile.mktemp is race-prone/deprecated; kept
        # for behaviour, consider tempfile.mkstemp instead.
        filename = tempfile.mktemp()+".npz"

        triangular_matrix = encore.utils.TriangularMatrix(size = size)

        triangular_matrix[0,1] = expected_value

        assert_equal(triangular_matrix[0,1], expected_value,
                     err_msg="Data error in TriangularMatrix: read/write are not consistent")

        assert_equal(triangular_matrix[0,1], triangular_matrix[1,0],
                     err_msg="Data error in TriangularMatrix: matrix non symmetrical")
        triangular_matrix.savez(filename)

        triangular_matrix_2 = encore.utils.TriangularMatrix(size = size, loadfile = filename)
        assert_equal(triangular_matrix_2[0,1], expected_value,
                     err_msg="Data error in TriangularMatrix: loaded matrix non symmetrical")

        triangular_matrix_3 = encore.utils.TriangularMatrix(size = size)
        triangular_matrix_3.loadz(filename)
        assert_equal(triangular_matrix_3[0,1], expected_value,
                     err_msg="Data error in TriangularMatrix: loaded matrix non symmetrical")

        incremented_triangular_matrix = triangular_matrix + scalar
        assert_equal(incremented_triangular_matrix[0,1], expected_value + scalar,
                     err_msg="Error in TriangularMatrix: addition of scalar gave\
inconsistent results")

        triangular_matrix += scalar
        assert_equal(triangular_matrix[0,1], expected_value + scalar,
                     err_msg="Error in TriangularMatrix: addition of scalar gave\
inconsistent results")

        multiplied_triangular_matrix_2 = triangular_matrix_2 * scalar
        assert_equal(multiplied_triangular_matrix_2[0,1], expected_value * scalar,
                     err_msg="Error in TriangularMatrix: multiplication by scalar gave\
inconsistent results")

        triangular_matrix_2 *= scalar
        assert_equal(triangular_matrix_2[0,1], expected_value * scalar,
                     err_msg="Error in TriangularMatrix: multiplication by scalar gave\
inconsistent results")

    @staticmethod
    def test_parallel_calculation():
        # ParallelCalculation must return (index, result) pairs matching
        # the input arguments.

        def function(x):
            return x**2

        arguments = [tuple([i]) for i in np.arange(0,100)]

        parallel_calculation = encore.utils.ParallelCalculation(function = function,
                                                                ncores = 4,
                                                                args = arguments)
        results = parallel_calculation.run()

        for i,r in enumerate(results):
            assert_equal(r[1], arguments[i][0]**2,
                         err_msg="Unexpected results from ParallelCalculation")

    def test_rmsd_matrix_with_superimposition(self):
        # First row of the pairwise RMSD matrix must match the reference
        # rms.RMSD implementation.
        conf_dist_matrix = encore.confdistmatrix.conformational_distance_matrix(self.ens1,
                                encore.confdistmatrix.set_rmsd_matrix_elements,
                                selection = "name CA",
                                pairwise_align = True,
                                mass_weighted = True,
                                ncores = 1)

        reference = rms.RMSD(self.ens1, select = "name CA")
        reference.run()

        for i,rmsd in enumerate(reference.rmsd):
            assert_almost_equal(conf_dist_matrix[0,i], rmsd[2], decimal=3,
                                err_msg = "calculated RMSD values differ from the reference implementation")

    def test_rmsd_matrix_without_superimposition(self):
        # Reference values computed without pairwise alignment.
        reference_rmsd = [ 0.        ,
                           0.91544908,
                           1.41318953,
                           1.8726356 ,
                           2.35668635,
                           2.78433418,
                           3.20966768,
                           3.59671712,
                           3.95373368,
                           4.36574793,
                           4.76120567,
                           5.16000462,
                           5.48962498,
                           5.87683058,
                           6.35305309,
                           6.49553633,
                           6.6803894 ,
                           6.7652216 ,
                           6.83341503,
                           6.8028388 ]

        selection_string = "name CA"
        confdist_matrix = encore.confdistmatrix.conformational_distance_matrix(
            self.ens1,
            encore.confdistmatrix.set_rmsd_matrix_elements,
            selection = selection_string,
            pairwise_align = False,
            mass_weighted = True,
            ncores = 1)

        assert_almost_equal(confdist_matrix.as_array()[0,:], reference_rmsd,
                            err_msg="calculated RMSD values differ from reference")

    @staticmethod
    def test_ensemble_superimposition():
        # Fitting on CA only should leave a larger all-atom RMSF than
        # fitting on all atoms.
        aligned_ensemble1 = mda.Universe(PSF, DCD)
        align.rms_fit_trj(aligned_ensemble1, aligned_ensemble1,
                          select="name CA",
                          in_memory=True)
        aligned_ensemble2 = mda.Universe(PSF, DCD)
        align.rms_fit_trj(aligned_ensemble2, aligned_ensemble2,
                          select="name *",
                          in_memory=True)

        rmsfs1 = rms.RMSF(aligned_ensemble1.select_atoms('name *'))
        rmsfs1.run()

        rmsfs2 = rms.RMSF(aligned_ensemble2.select_atoms('name *'))
        rmsfs2.run()

        assert_equal(sum(rmsfs1.rmsf)>sum(rmsfs2.rmsf), True,
                     err_msg="Ensemble aligned on all atoms should have lower full-atom RMSF "
                             "than ensemble aligned on only CAs.")

    @staticmethod
    def test_ensemble_superimposition_to_reference_non_weighted():
        # Same as above but without mass weighting; also exercises the
        # on-disk (non in_memory) code path of rms_fit_trj.
        ensemble0 = mda.Universe(PSF, DCD)
        filename = align.rms_fit_trj(ensemble0, ensemble0,
                                     select="name CA", mass_weighted=False)
        aligned_ensemble0 = mda.Universe(PSF, filename)
        aligned_ensemble1 = mda.Universe(PSF, DCD)
        align.rms_fit_trj(aligned_ensemble1, aligned_ensemble1,
                          select="name CA", mass_weighted=False,
                          in_memory=True)
        aligned_ensemble2 = mda.Universe(PSF, DCD)
        align.rms_fit_trj(aligned_ensemble2, aligned_ensemble2,
                          select="name *", mass_weighted=False,
                          in_memory=True)

        rmsfs0 = rms.RMSF(aligned_ensemble0.select_atoms('name *'))
        rmsfs0.run()

        rmsfs1 = rms.RMSF(aligned_ensemble1.select_atoms('name *'))
        rmsfs1.run()

        rmsfs2 = rms.RMSF(aligned_ensemble2.select_atoms('name *'))
        rmsfs2.run()

        import logging
        logging.info("{0} {1} {2}".format(sum(rmsfs1.rmsf), sum(rmsfs2.rmsf), sum(rmsfs0.rmsf)))

        assert_equal(sum(rmsfs1.rmsf)>sum(rmsfs2.rmsf), True,
                     err_msg="Ensemble aligned on all atoms should have lower full-atom RMSF "
                             "than ensemble aligned on only CAs.")

    def test_hes_to_self(self):
        results, details = encore.hes([self.ens1, self.ens1])
        result_value = results[0,1]
        expected_value = 0.
        assert_almost_equal(result_value, expected_value,
                            err_msg="Harmonic Ensemble Similarity to itself not zero: {0:f}".format(result_value))

    def test_hes(self):
        results, details = encore.hes([self.ens1, self.ens2], mass_weighted=True)
        result_value = results[0,1]
        min_bound = 1E5
        self.assertGreater(result_value, min_bound,
                           msg="Unexpected value for Harmonic Ensemble Similarity: {0:f}. Expected {1:f}.".format(result_value, min_bound))

    def test_hes_align(self):
        results, details = encore.hes([self.ens1, self.ens2], align=True)
        result_value = results[0,1]
        expected_value = 2047.05
        assert_almost_equal(result_value, expected_value, decimal=-3,
                            err_msg="Unexpected value for Harmonic Ensemble Similarity: {0:f}. Expected {1:f}.".format(result_value, expected_value))

    def test_ces_to_self(self):
        results, details = \
            encore.ces([self.ens1, self.ens1],
                       clustering_method=encore.AffinityPropagationNative(preference = -3.0))
        result_value = results[0,1]
        expected_value = 0.
        assert_almost_equal(result_value, expected_value,
                            err_msg="ClusteringEnsemble Similarity to itself not zero: {0:f}".format(result_value))

    def test_ces(self):
        results, details = encore.ces([self.ens1, self.ens2])
        result_value = results[0,1]
        expected_value = 0.51
        assert_almost_equal(result_value, expected_value, decimal=2,
                            err_msg="Unexpected value for Cluster Ensemble Similarity: {0:f}. Expected {1:f}.".format(result_value, expected_value))

    @dec.skipif(module_not_found('scipy'),
                "Test skipped because scipy is not available.")
    def test_dres_to_self(self):
        results, details = encore.dres([self.ens1, self.ens1])
        result_value = results[0,1]
        expected_value = 0.
        assert_almost_equal(result_value, expected_value, decimal=2,
                            err_msg="Dim. Reduction Ensemble Similarity to itself not zero: {0:f}".format(result_value))

    @dec.skipif(module_not_found('scipy'),
                "Test skipped because scipy is not available.")
    def test_dres(self):
        results, details = encore.dres([self.ens1, self.ens2], selection="name CA and resnum 1-10")
        result_value = results[0,1]
        upper_bound = 0.6
        self.assertLess(result_value, upper_bound,
                        msg="Unexpected value for Dim. reduction Ensemble Similarity: {0:f}. Expected {1:f}.".format(result_value, upper_bound))

    @dec.skipif(module_not_found('scipy'),
                "Test skipped because scipy is not available.")
    def test_dres_without_superimposition(self):
        distance_matrix = encore.get_distance_matrix(
            encore.merge_universes([self.ens1, self.ens2]),
            superimpose=False)
        results, details = encore.dres([self.ens1, self.ens2],
                                       distance_matrix = distance_matrix)
        result_value = results[0,1]
        expected_value = 0.68
        assert_almost_equal(result_value, expected_value, decimal=1,
                            err_msg="Unexpected value for Dim. reduction Ensemble Similarity: {0:f}. Expected {1:f}.".format(result_value, expected_value))

    def test_ces_convergence(self):
        expected_values = [0.3443593, 0.1941854, 0.06857104, 0.]
        results = encore.ces_convergence(self.ens1, 5)
        for i,ev in enumerate(expected_values):
            assert_almost_equal(ev, results[i], decimal=2,
                                err_msg="Unexpected value for Clustering Ensemble similarity in convergence estimation")

    @dec.skipif(module_not_found('scipy'),
                "Test skipped because scipy is not available.")
    def test_dres_convergence(self):
        expected_values = [ 0.3, 0.]
        results = encore.dres_convergence(self.ens1, 10)
        assert_almost_equal(results[:,0], expected_values, decimal=1,
                            err_msg="Unexpected value for Dim. reduction Ensemble similarity in convergence estimation")

    @dec.slow
    def test_hes_error_estimation(self):
        expected_average = 10
        expected_stdev = 12
        averages, stdevs = encore.hes([self.ens1, self.ens1], estimate_error = True, bootstrapping_samples=10, selection="name CA and resnum 1-10")
        average = averages[0,1]
        stdev = stdevs[0,1]

        assert_almost_equal(average, expected_average, decimal=-2,
                            err_msg="Unexpected average value for bootstrapped samples in Harmonic Ensemble similarity")
        assert_almost_equal(stdev, expected_stdev, decimal=-2,
                            err_msg="Unexpected standard deviation for bootstrapped samples in Harmonic Ensemble similarity")

    @dec.slow
    def test_ces_error_estimation(self):
        expected_average = 0.03
        expected_stdev = 0.31
        averages, stdevs = encore.ces([self.ens1, self.ens1],
                                      estimate_error = True,
                                      bootstrapping_samples=10,
                                      clustering_method=encore.AffinityPropagationNative(preference=-2.0),
                                      selection="name CA and resnum 1-10")
        average = averages[0,1]
        stdev = stdevs[0,1]

        assert_almost_equal(average, expected_average, decimal=1,
                            err_msg="Unexpected average value for bootstrapped samples in Clustering Ensemble similarity")
        assert_almost_equal(stdev, expected_stdev, decimal=0,
                            err_msg="Unexpected standard deviation for bootstrapped samples in Clustering Ensemble similarity")

    @dec.skipif(module_not_found('sklearn'),
                "Test skipped because sklearn is not available.")
    @dec.slow
    def test_ces_error_estimation_ensemble_bootstrap(self):
        # Error estimation using a method that does not take a distance
        # matrix as input, and therefore relies on bootstrapping the ensembles
        # instead
        expected_average = 0.03
        expected_stdev = 0.02
        averages, stdevs = encore.ces([self.ens1, self.ens1],
                                      estimate_error = True,
                                      bootstrapping_samples=10,
                                      clustering_method=encore.KMeans(n_clusters=2),
                                      selection="name CA and resnum 1-10")
        average = averages[0,1]
        stdev = stdevs[0,1]

        assert_almost_equal(average, expected_average, decimal=1,
                            err_msg="Unexpected average value for bootstrapped samples in Clustering Ensemble similarity")
        assert_almost_equal(stdev, expected_stdev, decimal=1,
                            err_msg="Unexpected standard deviation for bootstrapped samples in Clustering Ensemble similarity")

    @dec.slow
    @dec.skipif(module_not_found('scipy'),
                "Test skipped because scipy is not available.")
    def test_dres_error_estimation(self):
        average_upper_bound = 0.3
        stdev_upper_bound = 0.2
        averages, stdevs = encore.dres([self.ens1, self.ens1], estimate_error = True,
                                       bootstrapping_samples=10,
                                       selection="name CA and resnum 1-10")
        average = averages[0,1]
        stdev = stdevs[0,1]

        self.assertLess(average, average_upper_bound,
                        msg="Unexpected average value for bootstrapped samples in Dim. reduction Ensemble similarity")
        self.assertLess(stdev, stdev_upper_bound,
                        msg="Unexpected standard deviation for bootstrapped samples in Dim. reduction Ensemble similarity")
+
+
+
class TestEncoreClustering(TestCase):
    """Tests for encore.cluster and the individual clustering methods
    (native affinity propagation plus the sklearn-backed methods)."""

    @dec.skipif(parser_not_found('DCD'),
                'DCD parser not available. Are you using python 3?')
    def setUp(self):
        # Create universe from templates defined in setUpClass
        self.ens1 = mda.Universe(
            self.ens1_template.filename,
            self.ens1_template.trajectory.timeseries(),
            format=mda.coordinates.memory.MemoryReader)

        self.ens2 = mda.Universe(
            self.ens2_template.filename,
            self.ens2_template.trajectory.timeseries(),
            format=mda.coordinates.memory.MemoryReader)

    # Was mistakenly named tearDownClass (an instance method shadowed by
    # the @classmethod below, so it never ran); per-test cleanup belongs
    # in tearDown, as in TestEncore.
    def tearDown(self):
        del self.ens1
        del self.ens2

    @classmethod
    @dec.skipif(parser_not_found('DCD'),
                'DCD parser not available. Are you using python 3?')
    def setUpClass(cls):
        # To speed up tests, we read in trajectories from file only once,
        # and then recreate them from their coordinate array for each test
        super(TestEncoreClustering, cls).setUpClass()
        cls.ens1_template = mda.Universe(PSF, DCD)
        cls.ens2_template = mda.Universe(PSF, DCD2)

        cls.ens1_template.transfer_to_memory()
        cls.ens2_template.transfer_to_memory()

        # Filter ensembles to only include every 5th frame
        cls.ens1_template = mda.Universe(
            cls.ens1_template.filename,
            np.copy(cls.ens1_template.trajectory.timeseries()[:, ::5, :]),
            format=mda.coordinates.memory.MemoryReader)
        cls.ens2_template = mda.Universe(
            cls.ens2_template.filename,
            np.copy(cls.ens2_template.trajectory.timeseries()[:, ::5, :]),
            format=mda.coordinates.memory.MemoryReader)

    @classmethod
    def tearDownClass(cls):
        del cls.ens1_template
        del cls.ens2_template

    @dec.slow
    def test_clustering_one_ensemble(self):
        cluster_collection = encore.cluster(self.ens1)
        expected_value = 7
        assert_equal(len(cluster_collection), expected_value,
                     err_msg="Unexpected results: {0}".format(cluster_collection))

    @dec.slow
    def test_clustering_two_ensembles(self):
        cluster_collection = encore.cluster([self.ens1, self.ens2])
        expected_value = 14
        assert_equal(len(cluster_collection), expected_value,
                     err_msg="Unexpected results: {0}".format(cluster_collection))

    @dec.slow
    def test_clustering_three_ensembles_two_identical(self):
        cluster_collection = encore.cluster([self.ens1, self.ens2, self.ens1])
        expected_value = 40
        assert_equal(len(cluster_collection), expected_value,
                     err_msg="Unexpected result: {0}".format(cluster_collection))

    @dec.slow
    def test_clustering_two_methods(self):
        # Two identical methods must produce identically-sized results.
        cluster_collection = encore.cluster(
            [self.ens1],
            method=[encore.AffinityPropagationNative(),
                    encore.AffinityPropagationNative()])
        assert_equal(len(cluster_collection[0]), len(cluster_collection[1]),
                     err_msg="Unexpected result: {0}".format(cluster_collection))

    @dec.slow
    def test_clustering_AffinityPropagationNative_direct(self):
        method = encore.AffinityPropagationNative()
        distance_matrix = encore.get_distance_matrix(self.ens1)
        cluster_assignment, details = method(distance_matrix)
        expected_value = 7
        assert_equal(len(set(cluster_assignment)), expected_value,
                     err_msg="Unexpected result: {0}".format(
                         cluster_assignment))

    @dec.slow
    @dec.skipif(module_not_found('sklearn'),
                "Test skipped because sklearn is not available.")
    def test_clustering_AffinityPropagation_direct(self):
        method = encore.AffinityPropagation()
        distance_matrix = encore.get_distance_matrix(self.ens1)
        cluster_assignment, details = method(distance_matrix)
        expected_value = 7
        assert_equal(len(set(cluster_assignment)), expected_value,
                     err_msg="Unexpected result: {0}".format(
                         cluster_assignment))

    @dec.slow
    @dec.skipif(module_not_found('sklearn'),
                "Test skipped because sklearn is not available.")
    def test_clustering_KMeans_direct(self):
        clusters = 10
        method = encore.KMeans(clusters)
        coordinates = self.ens1.trajectory.timeseries(format='fac')
        # KMeans operates on raw coordinates: flatten to (frames, 3N).
        coordinates = np.reshape(coordinates,
                                 (coordinates.shape[0], -1))
        cluster_assignment, details = method(coordinates)
        assert_equal(len(set(cluster_assignment)), clusters,
                     err_msg="Unexpected result: {0}".format(
                         cluster_assignment))

    @dec.slow
    @dec.skipif(module_not_found('sklearn'),
                "Test skipped because sklearn is not available.")
    def test_clustering_DBSCAN_direct(self):
        method = encore.DBSCAN(eps=0.5, min_samples=2)
        distance_matrix = encore.get_distance_matrix(self.ens1)
        cluster_assignment, details = method(distance_matrix)
        expected_value = 2
        assert_equal(len(set(cluster_assignment)), expected_value,
                     err_msg="Unexpected result: {0}".format(
                         cluster_assignment))

    @dec.slow
    @dec.skipif(module_not_found('sklearn'),
                "Test skipped because sklearn is not available.")
    def test_clustering_two_different_methods(self):
        cluster_collection = encore.cluster(
            [self.ens1],
            method=[encore.AffinityPropagation(preference=-7.5),
                    encore.DBSCAN(min_samples=2)])
        assert_equal(len(cluster_collection[0]), len(cluster_collection[1]),
                     err_msg="Unexpected result: {0}".format(cluster_collection))

    @dec.slow
    @dec.skipif(module_not_found('sklearn'),
                "Test skipped because sklearn is not available.")
    def test_clustering_method_w_no_distance_matrix(self):
        cluster_collection = encore.cluster(
            [self.ens1],
            method=encore.KMeans(10))
        assert_equal(len(cluster_collection), 10,
                     err_msg="Unexpected result: {0}".format(cluster_collection))

    @dec.slow
    @dec.skipif(module_not_found('sklearn'),
                "Test skipped because sklearn is not available.")
    def test_clustering_two_methods_one_w_no_distance_matrix(self):
        cluster_collection = encore.cluster(
            [self.ens1],
            method=[encore.KMeans(17),
                    encore.AffinityPropagationNative()])
        # NOTE(review): this compares cluster_collection[0] with itself,
        # which is trivially true -- likely a typo for [0] vs [1], but
        # "fixing" it would change the test's outcome; confirm intent.
        assert_equal(len(cluster_collection[0]), len(cluster_collection[0]),
                     err_msg="Unexpected result: {0}".format(cluster_collection))

    @dec.slow
    @dec.skipif(module_not_found('sklearn'),
                "Test skipped because sklearn is not available.")
    def test_sklearn_affinity_propagation(self):
        cc1 = encore.cluster([self.ens1])
        cc2 = encore.cluster([self.ens1],
                             method=encore.AffinityPropagation())
        assert_equal(len(cc1), len(cc2),
                     err_msg="Native and sklearn implementations of affinity "
                             "propagation don't agree: mismatch in number of "
                             "clusters: {0} {1}".format(len(cc1), len(cc2)))
+
+
+
+
class TestEncoreClusteringSklearn(TestCase):
    """The tests in this class were duplicated from the affinity propagation
    tests in scikit-learn"""

    def setUp(self):
        # Number of blob centres used to generate self.X below; the
        # clustering is expected to recover exactly this many clusters.
        self.n_clusters = 3
        # self.X was generated using the following sklearn code
        # self.centers = np.array([[1, 1], [-1, -1], [1, -1]]) + 10
        # from sklearn.datasets.samples_generator import make_blobs
        # self.X, _ = make_blobs(n_samples=60, n_features=2, centers=self.centers,
        #                        cluster_std=0.4, shuffle=True, random_state=0)
        X = np.array([[ 8.73101582, 8.85617874],
                      [ 11.61311169, 11.58774351],
                      [ 10.86083514, 11.06253959],
                      [ 9.45576027, 8.50606967],
                      [ 11.30441509, 11.04867001],
                      [ 8.63708065, 9.02077816],
                      [ 8.34792066, 9.1851129 ],
                      [ 11.06197897, 11.15126501],
                      [ 11.24563175, 9.36888267],
                      [ 10.83455241, 8.70101808],
                      [ 11.49211627, 11.48095194],
                      [ 10.6448857 , 10.20768141],
                      [ 10.491806 , 9.38775868],
                      [ 11.08330999, 9.39065561],
                      [ 10.83872922, 9.48897803],
                      [ 11.37890079, 8.93799596],
                      [ 11.70562094, 11.16006288],
                      [ 10.95871246, 11.1642394 ],
                      [ 11.59763163, 10.91793669],
                      [ 11.05761743, 11.5817094 ],
                      [ 8.35444086, 8.91490389],
                      [ 8.79613913, 8.82477028],
                      [ 11.00420001, 9.7143482 ],
                      [ 11.90790185, 10.41825373],
                      [ 11.39149519, 11.89635728],
                      [ 8.31749192, 9.78031016],
                      [ 11.59530088, 9.75835567],
                      [ 11.17754529, 11.13346973],
                      [ 11.01830341, 10.92512646],
                      [ 11.75326028, 8.46089638],
                      [ 11.74702358, 9.36241786],
                      [ 10.53075064, 9.77744847],
                      [ 8.67474149, 8.30948696],
                      [ 11.05076484, 9.16079575],
                      [ 8.79567794, 8.52774713],
                      [ 11.18626498, 8.38550253],
                      [ 10.57169895, 9.42178069],
                      [ 8.65168114, 8.76846013],
                      [ 11.12522708, 10.6583617 ],
                      [ 8.87537899, 9.02246614],
                      [ 9.29163622, 9.05159316],
                      [ 11.38003537, 10.93945712],
                      [ 8.74627116, 8.85490353],
                      [ 10.65550973, 9.76402598],
                      [ 8.49888186, 9.31099614],
                      [ 8.64181338, 9.154761 ],
                      [ 10.84506927, 10.8790789 ],
                      [ 8.98872711, 9.17133275],
                      [ 11.7470232 , 10.60908885],
                      [ 10.89279865, 9.32098256],
                      [ 11.14254656, 9.28262927],
                      [ 9.02660689, 9.12098876],
                      [ 9.16093666, 8.72607596],
                      [ 11.47151183, 8.92803007],
                      [ 11.76917681, 9.59220592],
                      [ 9.97880407, 11.26144744],
                      [ 8.58057881, 8.43199283],
                      [ 10.53394006, 9.36033059],
                      [ 11.34577448, 10.70313399],
                      [ 9.07097046, 8.83928763]])

        # Squared euclidean distance matrix via the identity
        # ||x - y||^2 = ||x||^2 - 2 x.y + ||y||^2.
        XX = np.einsum('ij,ij->i', X, X)[:, np.newaxis]
        YY = XX.T
        distances = np.dot(X, X.T)
        distances *= -2
        distances += XX
        distances += YY
        # Clamp rounding noise to zero and force an exactly-zero diagonal.
        np.maximum(distances, 0, out=distances)
        distances.flat[::distances.shape[0] + 1] = 0.0
        dimension = len(distances)
        # Copy the upper triangle (including diagonal) into encore's
        # symmetric TriangularMatrix representation.
        self.distance_matrix = encore.utils.TriangularMatrix(len(distances))
        for i in range(dimension):
            for j in range(i,dimension):
                self.distance_matrix[i, j] = distances[i,j]

    def test_one(self):
        # Preference chosen well below the median distance so that the
        # three generated blobs are recovered as three clusters.
        preference = -float(np.median(self.distance_matrix.as_array()) * 10.)
        clustering_method = encore.AffinityPropagationNative(preference=preference)
        ccs = encore.cluster(None,
                             distance_matrix=self.distance_matrix,
                             method=clustering_method)
        assert_equal(self.n_clusters, len(ccs),
                     err_msg="Basic clustering test failed to give the right"
                             "number of clusters: {0} vs {1}".format(self.n_clusters, len(ccs)))
+
+
class TestEncoreDimensionalityReduction(TestCase):
    """Tests for encore.reduce_dimensionality with the native stochastic
    proximity embedding and the sklearn-backed PCA methods."""

    @dec.skipif(parser_not_found('DCD'),
                'DCD parser not available. Are you using python 3?')
    def setUp(self):
        # Create universe from templates defined in setUpClass
        self.ens1 = mda.Universe(
            self.ens1_template.filename,
            self.ens1_template.trajectory.timeseries(),
            format=mda.coordinates.memory.MemoryReader)

        self.ens2 = mda.Universe(
            self.ens2_template.filename,
            self.ens2_template.trajectory.timeseries(),
            format=mda.coordinates.memory.MemoryReader)

    # Was mistakenly named tearDownClass (an instance method shadowed by
    # the @classmethod below, so it never ran); per-test cleanup belongs
    # in tearDown, as in TestEncore.
    def tearDown(self):
        del self.ens1
        del self.ens2

    @classmethod
    @dec.skipif(parser_not_found('DCD'),
                'DCD parser not available. Are you using python 3?')
    def setUpClass(cls):
        # To speed up tests, we read in trajectories from file only once,
        # and then recreate them from their coordinate array for each test
        super(TestEncoreDimensionalityReduction, cls).setUpClass()
        cls.ens1_template = mda.Universe(PSF, DCD)
        cls.ens2_template = mda.Universe(PSF, DCD2)

        cls.ens1_template.transfer_to_memory()
        cls.ens2_template.transfer_to_memory()

        # Filter ensembles to only include every 5th frame
        cls.ens1_template = mda.Universe(
            cls.ens1_template.filename,
            np.copy(cls.ens1_template.trajectory.timeseries()[:, ::5, :]),
            format=mda.coordinates.memory.MemoryReader)
        cls.ens2_template = mda.Universe(
            cls.ens2_template.filename,
            np.copy(cls.ens2_template.trajectory.timeseries()[:, ::5, :]),
            format=mda.coordinates.memory.MemoryReader)

    @classmethod
    def tearDownClass(cls):
        del cls.ens1_template
        del cls.ens2_template

    @dec.slow
    def test_dimensionality_reduction_one_ensemble(self):
        dimension = 2
        coordinates, details = encore.reduce_dimensionality(self.ens1)
        assert_equal(coordinates.shape[0], dimension,
                     err_msg="Unexpected result in dimensionality reduction: {0}".format(coordinates))

    @dec.slow
    def test_dimensionality_reduction_two_ensembles(self):
        dimension = 2
        coordinates, details = \
            encore.reduce_dimensionality([self.ens1, self.ens2])
        assert_equal(coordinates.shape[0], dimension,
                     err_msg="Unexpected result in dimensionality reduction: {0}".format(coordinates))

    @dec.slow
    def test_dimensionality_reduction_three_ensembles_two_identical(self):
        # Identical input ensembles (1 and 3) should embed to (roughly)
        # identical coordinates.
        coordinates, details = \
            encore.reduce_dimensionality([self.ens1, self.ens2, self.ens1])
        coordinates_ens1 = coordinates[:,np.where(details["ensemble_membership"]==1)]
        coordinates_ens3 = coordinates[:,np.where(details["ensemble_membership"]==3)]
        assert_almost_equal(coordinates_ens1, coordinates_ens3, decimal=0,
                            err_msg="Unexpected result in dimensionality reduction: {0}".format(coordinates))

    @dec.slow
    def test_dimensionality_reduction_specified_dimension(self):
        dimension = 3
        coordinates, details = encore.reduce_dimensionality(
            [self.ens1, self.ens2],
            method=encore.StochasticProximityEmbeddingNative(dimension=dimension))
        assert_equal(coordinates.shape[0], dimension,
                     err_msg="Unexpected result in dimensionality reduction: {0}".format(coordinates))

    @dec.slow
    def test_dimensionality_reduction_SPENative_direct(self):
        dimension = 2
        method = encore.StochasticProximityEmbeddingNative(dimension=dimension)
        distance_matrix = encore.get_distance_matrix(self.ens1)
        coordinates, details = method(distance_matrix)
        assert_equal(coordinates.shape[0], dimension,
                     err_msg="Unexpected result in dimensionality reduction: {0}".format(
                         coordinates))

    @dec.slow
    @dec.skipif(module_not_found('sklearn'),
                "Test skipped because sklearn is not available.")
    def test_dimensionality_reduction_PCA_direct(self):
        dimension = 2
        method = encore.PrincipleComponentAnalysis(dimension=dimension)
        coordinates = self.ens1.trajectory.timeseries(format='fac')
        # PCA operates on raw coordinates: flatten to (frames, 3N).
        coordinates = np.reshape(coordinates,
                                 (coordinates.shape[0], -1))
        coordinates, details = method(coordinates)
        assert_equal(coordinates.shape[0], dimension,
                     err_msg="Unexpected result in dimensionality reduction: {0}".format(
                         coordinates))

    @dec.slow
    @dec.skipif(module_not_found('sklearn'),
                "Test skipped because sklearn is not available.")
    def test_dimensionality_reduction_different_method(self):
        dimension = 3
        coordinates, details = \
            encore.reduce_dimensionality(
                [self.ens1, self.ens2],
                method=encore.PrincipleComponentAnalysis(dimension=dimension))
        assert_equal(coordinates.shape[0], dimension,
                     err_msg="Unexpected result in dimensionality reduction: {0}".format(coordinates))

    @dec.slow
    def test_dimensionality_reduction_two_methods(self):
        dims = [2,3]
        coordinates, details = \
            encore.reduce_dimensionality(
                [self.ens1, self.ens2],
                method=[encore.StochasticProximityEmbeddingNative(dims[0]),
                        encore.StochasticProximityEmbeddingNative(dims[1])])
        # NOTE(review): only the second method's output dimension is
        # asserted; consider also checking coordinates[0].shape[0].
        assert_equal(coordinates[1].shape[0], dims[1])

    @dec.slow
    @dec.skipif(module_not_found('sklearn'),
                "Test skipped because sklearn is not available.")
    def test_dimensionality_reduction_two_different_methods(self):
        dims = [2,3]
        coordinates, details = \
            encore.reduce_dimensionality(
                [self.ens1, self.ens2],
                method=[encore.StochasticProximityEmbeddingNative(dims[0]),
                        encore.PrincipleComponentAnalysis(dims[1])])
        assert_equal(coordinates[1].shape[0], dims[1])
+
diff --git a/testsuite/MDAnalysisTests/coordinates/base.py b/testsuite/MDAnalysisTests/coordinates/base.py
index a286825525c..91ba506ae43 100644
--- a/testsuite/MDAnalysisTests/coordinates/base.py
+++ b/testsuite/MDAnalysisTests/coordinates/base.py
@@ -310,7 +310,7 @@ def test_iter_as_aux_lowf(self):
# auxiliary has a lower frequency, so iter_as_aux should iterate over
# only frames where there is a corresponding auxiliary value
for i, ts in enumerate(self.reader.iter_as_aux('lowf')):
- assert_timestep_almost_equal(ts,
+ assert_timestep_almost_equal(ts,
self.ref.iter_ts(self.ref.aux_lowf_frames_with_steps[i]),
decimal=self.ref.prec)