From 7058792d78f39078e5d5751deb102f7fd6a0caf4 Mon Sep 17 00:00:00 2001 From: Oliver Beckstein Date: Thu, 23 Mar 2017 01:21:15 -0700 Subject: [PATCH 1/2] analysis.gnm: optimizations and doc/code cleanup - optimization - eliminated costly lookup of residue sizes - reduces run time of TestGNM.test_closeContactGNMAnalysis by about a factor of 5 (!) --- should go a long way to improve #1191 - made sure that numpy arrays are used - argument changes and deprecations - new weights="size" kwargs - deprecated MassWeight for 0.17.0 (but have tests until then that check that it works and overrides weights) - in GNMAnalysis, replaced skip kwarg with start/stop/step (without going through deprecation because of limited use) - added/adapted tests for new kwargs - code cleanup - removed unused backup_file() function - made GNMAnalysis._generate_output() a private method (so that I do not have to document it...) - numpy-style docs in whole module --- package/CHANGELOG | 12 +- package/MDAnalysis/analysis/gnm.py | 258 +++++++++++------- .../MDAnalysisTests/analysis/test_gnm.py | 24 +- 3 files changed, 188 insertions(+), 106 deletions(-) diff --git a/package/CHANGELOG b/package/CHANGELOG index 85644181723..aab5164bf18 100644 --- a/package/CHANGELOG +++ b/package/CHANGELOG @@ -71,6 +71,9 @@ Enhancements * Groups (atomgroup, residuegroup, and segmentgroup) have more operators, included set operators (Issue #726) * Universes built with Merge now use the MemoryReader (Issue #1251) + * speed improvement for analysis.gnm.closeContactGNMAnalysis(..., + weights="size") by about 5x (partially Issue #1191) + Fixes * Trajectory slicing made completely Pythonic (Issue #918 PR #1195) @@ -153,13 +156,20 @@ Changes * Remove deprecated PrimitivePDB* classes * Remove deprecated `delta` keyword for ChainReader * Remove deprecated permissive_pdb_reader flag - + * Removed superfluous and confusing keywords `start` and `end` for resid + selection in analysis.helanal.helanal_main() and + analysis.helanal.helanal_trajectory() + * weights="size" parameter in analysis.gnm.closeContactGNMAnalysis to + replace MassWeight=True Deprecations (Issue #599) * Use of rms_fit_trj deprecated in favor of AlignTraj class (Issue #845) * Moved analysis.x3dna to the analysis.legacy module (Issue #906) * The keyword argument *quiet* is deprecated in favor of *verbose* throughout the library (Issue #903) + * MassWeight=True parameter in analysis.gnm.closeContactGNMAnalysis + deprecated in favor of weights="size". + 05/15/16 jandom, abhinavgupta94, orbeckst, kain88-de, hainm, jbarnoud, dotsdl, richardjgowers, BartBruininks, jdetle diff --git a/package/MDAnalysis/analysis/gnm.py b/package/MDAnalysis/analysis/gnm.py index 5557b725dc4..8bde527abe2 100644 --- a/package/MDAnalysis/analysis/gnm.py +++ b/package/MDAnalysis/analysis/gnm.py @@ -1,5 +1,5 @@ # -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*- -# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 +# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 # # MDAnalysis --- http://www.mdanalysis.org # Copyright (c) 2006-2016 The MDAnalysis Development Team and contributors @@ -43,16 +43,15 @@ The basic approach is to pass a trajectory to :class:`GNMAnalysis` and then run the analysis:: - u = MDAnalysis.Universe(PSF,DCD) - C = MDAnalysis.analysis.gnm.GNMAnalysis(u,ReportVector="output.txt") + u = MDAnalysis.Universe(PSF, DCD) + C = MDAnalysis.analysis.gnm.GNMAnalysis(u, ReportVector="output.txt") C.run() output = zip(*C.results) - outputfile = open("eigenvalues.dat","w") - for item in output[1]: - print >> outputfile, item - outputfile.close() + with open("eigenvalues.dat", "w") as outputfile: + for item in output[1]: + outputfile.write(item + "\n") The results are found in :attr:`GNMAnalysis.results`, which can be used for further processing (see [Hall2007]_). @@ -78,23 +77,20 @@ The following functions are used internally and are typically not directly needed to perform the analysis. -.. autofunction:: backup_file .. autofunction:: generate_grid .. autofunction:: order_list -""" +.. versionchanged:: 0.16.0 + removed un-unsed function :func:`backup_file` -# import copy #unused +""" from __future__ import print_function from six.moves import range import numpy as np -from numpy import linalg -import os - -#import warnings #unused +import warnings import logging logger = logging.getLogger('MDAnalysis.analysis.GNM') @@ -103,43 +99,34 @@ def _dsq(a, b): return ((a - b)**2).sum() -def backup_file(filename): - ''' - This function helps prevent overwriting default named files - ''' - if os.path.exists(filename): - target_name = "#" + filename - failure = True - if not os.path.exists(target_name): - os.rename(filename, target_name) - failure = False - else: - for i in range(20): - alt_target_name = target_name + "." + str(i) - if os.path.exists(alt_target_name): - continue - else: - os.rename(filename, alt_target_name) - failure = False - break - if failure: - print("Too many backups. Clean up and try again") - exit() +def generate_grid(positions, cutoff): + """Simple grid search. + + An alternative to searching the entire list of each atom; divide the + structure into `cutoff` sized boxes This way, for each particle you only need + to search the neighbouring boxes to find the particles within the `cutoff`. + Observed a 6x speed up for a smallish protein with ~300 residues; this + should get better with bigger systems. -def generate_grid(positions, cutoff): - ''' - An alternative to searching the entire list of each atom; divide the structure into cutoff sized boxes - This way, for each particle you only need to search the neighbouring boxes to find the particles within the cutoff - Observed a 6x speed up for a smallish protein with ~300 residues; this should get better with bigger systems. - ''' - [x, y, z] = zip(*positions) - high_x = max(x) - high_y = max(y) - high_z = max(z) - low_x = min(x) - low_y = min(y) - low_z = min(z) + Parameters + ---------- + positions : array + coordinates of the atoms + cutoff : float + find particles with distance less than `cutoff` from each other; the + grid will consist of boxes with sides of at least length `cutoff` + + """ + positions = np.asarray(positions) + + x, y, z = positions.T + high_x = x.max() + high_y = y.max() + high_z = z.max() + low_x = x.min() + low_y = y.min() + low_z = z.min() natoms = len(positions) #Ok now generate a list with 3 dimensions representing boxes in x, y and z grid = [[[ @@ -156,9 +143,7 @@ def generate_grid(positions, cutoff): def order_list(w): - ''' - Returns a dictionary showing the order of eigenvalues (which are reported scrambled normally) - ''' + """Returns a dictionary showing the order of eigenvalues (which are reported scrambled normally)""" ordered = list(w) unordered = list(w) ordered.sort() @@ -169,39 +154,61 @@ def order_list(w): class GNMAnalysis(object): - '''Basic tool for GNM analysis. + """Basic tool for GNM analysis. Each frame is treated as a novel structure and the GNM calculated. By default, this stores the dominant eigenvector and its associated eigenvalue; either can be used to monitor conformational change in a simulation. - ''' - def __init__(self, universe, selection='protein and name CA', cutoff=7.0, ReportVector=None, Bonus_groups=()): + Parameters + ---------- + universe : Universe + Analyze the full trajectory in the universe. + selection : str (optional) + MDAnalysis selection string, default "protein and name CA" + cutoff : float (optional) + Consider selected atoms within the cutoff as neighbors for the + Gaussian network model. + ReportVector : str (optional) + filename to write eigenvectors to, by default no output is written + (``None``) + Bonus_groups : tuple + This is a tuple of selection groups, the com of each which will be + added as a single point in the ENM (its a popular way of treating + small ligands such as drugs) but be careful that it doesn't have any + atoms which can be caught by the global selection as this could lead + to double counting. Default is ``None``. + + .. SeeAlso:: :class:`closeContactGNMAnalysis` + + .. versionchanged:: 0.16.0 + Made :meth:`generate_output` a private method :meth:`_generate_output`. + """ + + def __init__(self, universe, selection='protein and name CA', cutoff=7.0, + ReportVector=None, Bonus_groups=None): self.u = universe self.selection = selection self.cutoff = cutoff self.results = [] # final result self._timesteps = None # time for each frame self.ReportVector = ReportVector - # this is a tuple of selection groups, the com of each which will be added as a single point in the ENM - # (its a popular way of treating small ligands eg drugs) - # be careful that it doesn't have any atoms which can be caught by the global selection as this could lead to - # double counting - self.Bonus_groups = [self.u.select_atoms(item) for item in Bonus_groups] + self.Bonus_groups = [self.u.select_atoms(item) for item in Bonus_groups] \ + if Bonus_groups else [] self.ca = self.u.select_atoms(self.selection) - def generate_output(self, w, v, outputobject, time, matrix, nmodes=2, ReportVector=None, counter=0): - '''Appends eigenvalues and eigenvectors to results. + def _generate_output(self, w, v, outputobject, time, matrix, nmodes=2, + ReportVector=None, counter=0): + """Appends eigenvalues and eigenvectors to results. This generates the output by adding eigenvalue and eigenvector data to an appendable object and optionally printing some of the results to file. This is the function to replace if you want to generate a more complex set of outputs - ''' + """ list_map = order_list(w) - #print round(time), w[list_map[1]] if ReportVector: with open(ReportVector, "a") as oup: for item in enumerate(v[list_map[1]]): @@ -212,13 +219,17 @@ def generate_output(self, w, v, outputobject, time, matrix, nmodes=2, ReportVect # nmodes) ] )) def generate_kirchoff(self): - '''Generate the Kirchhoff matrix of contacts. + """Generate the Kirchhoff matrix of contacts. This generates the neighbour matrix by generating a grid of near-neighbours and then calculating which are are within - the cutoff. Returns the resulting matrix - ''' - #ca = self.u.select_atoms(self.selection) + the cutoff. + + Returns + ------- + array + the resulting Kirchhoff matrix + """ positions = self.ca.positions natoms = len(positions) @@ -259,38 +270,43 @@ def generate_kirchoff(self): matrix[jcounter][jcounter] = matrix[jcounter][jcounter] + 1 return matrix - def run(self, skip=1): - '''Analyze trajectory and produce timeseries. + def run(self, start=None, stop=None, step=None): + """Analyze trajectory and produce timeseries. + + Parameters + ---------- + start : int (optional) + stop : int (optional) + step : int (optional) - Returns GNM results per frame:: + Returns + ------- + results : list + GNM results per frame:: - results = [(time,eigenvalues[1],eigenvectors[1]),(time,eigenvalues[1],eigenvectors[1])... ] + results = [(time,eigenvalues[1],eigenvectors[1]),(time,eigenvalues[1],eigenvectors[1])... ] - ''' + .. versionchanged:: 0.16.0 + use start, stop, step instead of skip + """ logger.info("GNM analysis: starting") - counter = 0 self.timeseries = [] self._timesteps = [] - for ts in self.u.trajectory: - if counter % skip != 0: - counter += 1 - continue - counter += 1 - frame = ts.frame - timestep = ts.time - self._timesteps.append(timestep) + for ts in self.u.trajectory[start:stop:step]: + self._timesteps.append(ts.time) matrix = self.generate_kirchoff() try: - [u, w, v] = linalg.svd(matrix) - except linalg.LinAlgError: - print("\nFrame skip at", timestep, + u, w, v = np.linalg.svd(matrix) + except np.linalg.LinAlgError: + print("\nFrame skip at", ts.time, "(SVD failed to converge). Cutoff", self.cutoff) continue #Save the results somewhere useful in some useful format. Usefully. - self.generate_output(w, v, self.results, timestep, matrix, ReportVector=self.ReportVector, counter=counter) + self._generate_output(w, v, self.results, ts.time, matrix, ReportVector=self.ReportVector, + counter=ts.frame) class closeContactGNMAnalysis(GNMAnalysis): @@ -298,10 +314,44 @@ class closeContactGNMAnalysis(GNMAnalysis): This is a version of the GNM where the Kirchoff matrix is constructed from the close contacts between individual atoms - in different residues + in different residues. + + Parameters + ---------- + universe : Universe + Analyze the full trajectory in the universe. + selection : str (optional) + MDAnalysis selection string, default "protein" + cutoff : float (optional) + Consider selected atoms within the cutoff as neighbors for the + Gaussian network model [4.5 Å]. + ReportVector : str (optional) + filename to write eigenvectors to, by default no output is written + (``None``) + weights : {"size", None} (optional) + If set to "size" (the default) then weight the contact by + :math:`1/\sqrt{N_i N_j}` where :math:`N_i` and :math:`N_j` are the + number of atoms in the residues :math:`i` and :math:`j` that contain + the atoms that form a contact. + MassWeight : bool (deprecated, optional) + if set to ``True`` equivalent to `weights` set to "size". + .. Note:: This option does not perform a true mass weighting but + weighting by the number of atoms in each residue; the name + of the parameter exists for historical reasons and will + be removed in 0.17.0. + + .. SeeAlso:: :class:`GNMAnalysis` + + .. versionchanged:: 0.16.0 + Made :meth:`generate_output` a private method :meth:`_generate_output`. + + .. deprecated:: 0.16.0 + Instead of ``MassWeight=True`` use ``weights="size"``. """ - def __init__(self, universe, selection='protein', cutoff=4.5, ReportVector=None, MassWeight=True): + def __init__(self, universe, selection='protein', cutoff=4.5, ReportVector=None, + weights="size", + MassWeight=None): self.u = universe self.selection = selection self.cutoff = cutoff @@ -311,17 +361,28 @@ def __init__(self, universe, selection='protein', cutoff=4.5, ReportVector=None, # no bonus groups in this version of the GNM analysis tool; this is because this version doesn't use CA atoms # or centroids, and so doesn't need them self.ca = self.u.select_atoms(self.selection) - self.MassWeight = MassWeight + + self.weights = weights + # remove MassWeight in 0.17.0 + if MassWeight is not None: + warnings.warn("MassWeight=True|False is deprecated in favor of weights='size'|None " + "and will be removed in 0.17.0", + category=DeprecationWarning) + self.weights = "size" if MassWeight else None + def generate_kirchoff(self): - natoms = len(self.ca.atoms) - nresidues = len(self.ca.residues) + natoms = self.ca.n_atoms + nresidues = self.ca.n_residues positions = self.ca.positions res_positions, grid, low_x, low_y, low_z = generate_grid(positions, self.cutoff) residue_index_map = [resnum for [resnum, residue] in enumerate(self.ca.residues) for atom in residue.atoms] - matrix = np.zeros((nresidues, nresidues), "float") + matrix = np.zeros((nresidues, nresidues), dtype=np.float_) cutoffsq = self.cutoff ** 2 + # cache sqrt of residue sizes (slow) so that sr[i]*sr[j] == sqrt(r[i]*r[j]) + sqrt_res_sizes = np.sqrt([r.atoms.n_atoms for r in self.ca.residues]) if self.weights == "size" else None + for icounter in range(natoms): neighbour_atoms = [] for x in (-1, 0, 1): @@ -337,12 +398,9 @@ def generate_kirchoff(self): for jcounter in neighbour_atoms: if jcounter > icounter and _dsq(positions[icounter], positions[jcounter]) <= cutoffsq: iresidue, jresidue = residue_index_map[icounter], residue_index_map[jcounter] - if self.MassWeight: - contact = 1.0 / (len(self.ca.residues[iresidue].atoms) * len(self.ca.residues[jresidue].atoms)) ** 0.5 - else: - contact = 1.0 - matrix[iresidue][jresidue] = matrix[iresidue][jresidue] - contact - matrix[jresidue][iresidue] = matrix[jresidue][iresidue] - contact - matrix[iresidue][iresidue] = matrix[iresidue][iresidue] + contact - matrix[jresidue][jresidue] = matrix[jresidue][jresidue] + contact + contact = 1.0 / (sqrt_res_sizes[iresidue] * sqrt_res_sizes[jresidue]) if self.weights == "size" else 1.0 + matrix[iresidue][jresidue] -= contact + matrix[jresidue][iresidue] -= contact + matrix[iresidue][iresidue] += contact + matrix[jresidue][jresidue] += contact return matrix diff --git a/testsuite/MDAnalysisTests/analysis/test_gnm.py b/testsuite/MDAnalysisTests/analysis/test_gnm.py index 599f2feed35..07722052027 100644 --- a/testsuite/MDAnalysisTests/analysis/test_gnm.py +++ b/testsuite/MDAnalysisTests/analysis/test_gnm.py @@ -53,9 +53,9 @@ def test_gnm(self): 3.9607304e-15, 4.1289113e-15, 2.5501084e-15, 4.0498182e-15, 4.2058769e-15, 3.9839431e-15]) - def test_gnm_run_skip(self): + def test_gnm_run_step(self): gnm = MDAnalysis.analysis.gnm.GNMAnalysis(self.universe) - gnm.run(skip=3) + gnm.run(step=3) result = gnm.results assert_equal(len(result), 4) time, eigenvalues, eigenvectors = zip(*result) @@ -83,7 +83,7 @@ def test_generate_kirchoff(self): @attr('slow') def test_closeContactGNMAnalysis(self): - gnm = MDAnalysis.analysis.gnm.closeContactGNMAnalysis(self.universe) + gnm = MDAnalysis.analysis.gnm.closeContactGNMAnalysis(self.universe, weights="size") gnm.run() result = gnm.results @@ -113,8 +113,8 @@ def test_closeContactGNMAnalysis(self): 0.0, 0.0, -2.263157894736841, -0.24333213169614382]) @attr('slow') - def test_closeContactGNMAnalysis_noMassWeight(self): - gnm = MDAnalysis.analysis.gnm.closeContactGNMAnalysis(self.universe, MassWeight=False) + def test_closeContactGNMAnalysis_weights_None(self): + gnm = MDAnalysis.analysis.gnm.closeContactGNMAnalysis(self.universe, weights=None) gnm.run() result = gnm.results @@ -138,3 +138,17 @@ def test_closeContactGNMAnalysis_noMassWeight(self): 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -43.0, -3.0]) + + + def test_closeContactGNMAnalysis_default_weights_is_size(self): + gnm = MDAnalysis.analysis.gnm.closeContactGNMAnalysis(self.universe) + assert_equal(gnm.weights, "size") + + def test_closeContactGNMAnalysis_deprecated_MassWeight_False(self): + gnm = MDAnalysis.analysis.gnm.closeContactGNMAnalysis(self.universe, MassWeight=False) + assert_equal(gnm.weights, None) + + def test_closeContactGNMAnalysis_deprecated_MassWeight_True(self): + gnm = MDAnalysis.analysis.gnm.closeContactGNMAnalysis(self.universe, MassWeight=True) + assert_equal(gnm.weights, "size") + From 6145479e88b5f7e2a6051d78533c67d5892afaaa Mon Sep 17 00:00:00 2001 From: Oliver Beckstein Date: Fri, 31 Mar 2017 00:37:32 -0700 Subject: [PATCH 2/2] analysis.gnm: implemented review comments (#1272) --- package/MDAnalysis/analysis/gnm.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/package/MDAnalysis/analysis/gnm.py b/package/MDAnalysis/analysis/gnm.py index 8bde527abe2..98299d0f1af 100644 --- a/package/MDAnalysis/analysis/gnm.py +++ b/package/MDAnalysis/analysis/gnm.py @@ -174,16 +174,18 @@ class GNMAnalysis(object): filename to write eigenvectors to, by default no output is written (``None``) Bonus_groups : tuple - This is a tuple of selection groups, the com of each which will be - added as a single point in the ENM (its a popular way of treating - small ligands such as drugs) but be careful that it doesn't have any - atoms which can be caught by the global selection as this could lead - to double counting. Default is ``None``. + This is a tuple of selection strings that identify additional groups + (such as ligands). The center of mass of each group will be added as + a single point in the ENM (it is a popular way of treating small + ligands such as drugs). You need to ensure that none of the atoms in + `Bonus_groups` is contained in `selection` as this could lead to + double counting. No checks are applied. Default is ``None``. .. SeeAlso:: :class:`closeContactGNMAnalysis` .. versionchanged:: 0.16.0 Made :meth:`generate_output` a private method :meth:`_generate_output`. + """ def __init__(self, universe, selection='protein and name CA', cutoff=7.0, @@ -338,7 +340,8 @@ class closeContactGNMAnalysis(GNMAnalysis): .. Note:: This option does not perform a true mass weighting but weighting by the number of atoms in each residue; the name of the parameter exists for historical reasons and will - be removed in 0.17.0. + be removed in 0.17.0. Until then, setting `MassWeight` to + anything but ``None`` will override `weights`. .. SeeAlso:: :class:`GNMAnalysis` @@ -350,16 +353,13 @@ class closeContactGNMAnalysis(GNMAnalysis): """ def __init__(self, universe, selection='protein', cutoff=4.5, ReportVector=None, - weights="size", - MassWeight=None): + weights="size", MassWeight=None): self.u = universe self.selection = selection self.cutoff = cutoff self.results = [] # final result self._timesteps = None # time for each frame self.ReportVector = ReportVector - # no bonus groups in this version of the GNM analysis tool; this is because this version doesn't use CA atoms - # or centroids, and so doesn't need them self.ca = self.u.select_atoms(self.selection) self.weights = weights @@ -381,7 +381,8 @@ def generate_kirchoff(self): cutoffsq = self.cutoff ** 2 # cache sqrt of residue sizes (slow) so that sr[i]*sr[j] == sqrt(r[i]*r[j]) - sqrt_res_sizes = np.sqrt([r.atoms.n_atoms for r in self.ca.residues]) if self.weights == "size" else None + sqrt_res_sizes = np.sqrt([r.atoms.n_atoms for r in self.ca.residues]) \ + if self.weights == "size" else None for icounter in range(natoms): neighbour_atoms = [] @@ -398,7 +399,8 @@ def generate_kirchoff(self): for jcounter in neighbour_atoms: if jcounter > icounter and _dsq(positions[icounter], positions[jcounter]) <= cutoffsq: iresidue, jresidue = residue_index_map[icounter], residue_index_map[jcounter] - contact = 1.0 / (sqrt_res_sizes[iresidue] * sqrt_res_sizes[jresidue]) if self.weights == "size" else 1.0 + contact = 1.0 / (sqrt_res_sizes[iresidue] * sqrt_res_sizes[jresidue]) \ + if self.weights == "size" else 1.0 matrix[iresidue][jresidue] -= contact matrix[jresidue][iresidue] -= contact matrix[iresidue][iresidue] += contact