From 7af1162c7a9f2c83a5641accc3ccbe294e2f09e1 Mon Sep 17 00:00:00 2001 From: richard Date: Sun, 14 Jun 2020 12:49:25 +0100 Subject: [PATCH 01/14] modified AtomNames topologyattr to include lookup table index --- package/MDAnalysis/core/topologyattrs.py | 49 ++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/package/MDAnalysis/core/topologyattrs.py b/package/MDAnalysis/core/topologyattrs.py index c600ada3eb1..4520ff1fafe 100644 --- a/package/MDAnalysis/core/topologyattrs.py +++ b/package/MDAnalysis/core/topologyattrs.py @@ -483,10 +483,59 @@ class Atomnames(AtomAttr): dtype = object transplants = defaultdict(list) + def __init__(self, vals, guessed=False): + self.namedict = dict() # maps str to nmidx + name_lookup = [] # maps idx to str + # eg namedict['O'] = 5 & name_lookup[5] = 'O' + + self.nmidx = np.zeros_like(vals, dtype=int) # the lookup for each atom + # eg Atom 5 is 'C', so nmidx[5] = 7, where name_lookup[7] = 'C' + + for i, val in enumerate(vals): + try: + self.nmidx[i] = self.namedict[val] + except KeyError: + nextidx = len(self.namedict) + self.namedict[val] = nextidx + name_lookup.append(val) + + self.nmidx[i] = nextidx + + self.name_lookup = np.array(name_lookup, dtype=object) + self.values = self.name_lookup[self.nmidx] + @staticmethod def _gen_initial_values(na, nr, ns): return np.array(['' for _ in range(na)], dtype=object) + @_check_length + def set_atoms(self, ag, values): + newnames = [] + + # two possibilities, either single value given, or one per Atom + if isinstance(values, str): + try: + newidx = self.namedict[values] + except KeyError: + newidx = len(self.namedict) + self.namedict[values] = newidx + newnames.append(values) + else: + newidx = np.zeros_like(values, dtype=int) + for i, val in enumerate(values): + try: + newidx[i] = self.namedict[val] + except KeyError: + nextidx = len(self.namedict) + self.namedict[val] = nextidx + newnames.append(val) + newidx[i] = nextidx + + self.nmidx[ag.ix] = newidx # newidx either single value or same size array + if newnames: + self.name_lookup = np.concatenate([self.name_lookup, newnames]) + self.values = self.name_lookup[self.nmidx] + def phi_selection(residue, c_name='C', n_name='N', ca_name='CA'): """Select AtomGroup corresponding to the phi protein backbone dihedral C'-N-CA-C. From b45d16566575e35a25e6b912175a9840c60fd357 Mon Sep 17 00:00:00 2001 From: richard Date: Sun, 14 Jun 2020 13:18:55 +0100 Subject: [PATCH 02/14] cheeky little optimisation --- package/MDAnalysis/core/selection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/package/MDAnalysis/core/selection.py b/package/MDAnalysis/core/selection.py index bdb156ff249..03687298c89 100644 --- a/package/MDAnalysis/core/selection.py +++ b/package/MDAnalysis/core/selection.py @@ -530,9 +530,9 @@ def __init__(self, parser, tokens): @return_empty_on_apply def apply(self, group): - mask = np.zeros(len(group), dtype=bool) + mask = np.zeros(len(group), dtype=np.bool) + values = getattr(group, self.field) for val in self.values: - values = getattr(group, self.field) mask |= [fnmatch.fnmatch(x, val) for x in values] return group[mask].unique From 76135eb320d16b18b3e75a7f0a11e2d5df9e0057 Mon Sep 17 00:00:00 2001 From: richard Date: Sun, 14 Jun 2020 14:11:37 +0100 Subject: [PATCH 03/14] rework atom name selection to use lookup tables --- package/MDAnalysis/core/selection.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/package/MDAnalysis/core/selection.py b/package/MDAnalysis/core/selection.py index 03687298c89..543cab5f4d8 100644 --- a/package/MDAnalysis/core/selection.py +++ b/package/MDAnalysis/core/selection.py @@ -542,6 +542,21 @@ class AtomNameSelection(StringSelection): token = 'name' field = 'names' + def apply(self, group): + # rather than work on group.names, cheat and look at the lookup table + nmattr = group.universe._topology.names + + matches = [] # list of passing indices + # iterate through set of known atom names, check which pass + for nm, ix in nmattr.namedict.items(): + if any(fnmatch.fnmatch(nm, val) for val in self.values): + matches.append(ix) + + # atomname indices for members of this group + nmidx = nmattr.nmidx[group.ix] + + return group[np.in1d(nmidx, matches)].unique + class AtomTypeSelection(StringSelection): """Select atoms based on 'types' attribute""" From 17fe547f03987464ca9fdc7570a66726a35d2b1c Mon Sep 17 00:00:00 2001 From: Richard Gowers Date: Mon, 15 Jun 2020 00:08:08 -0700 Subject: [PATCH 04/14] Update topologyattrs.py --- package/MDAnalysis/core/topologyattrs.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/package/MDAnalysis/core/topologyattrs.py b/package/MDAnalysis/core/topologyattrs.py index 4520ff1fafe..caacf7657d9 100644 --- a/package/MDAnalysis/core/topologyattrs.py +++ b/package/MDAnalysis/core/topologyattrs.py @@ -484,6 +484,8 @@ class Atomnames(AtomAttr): transplants = defaultdict(list) def __init__(self, vals, guessed=False): + self._guessed = guessed + self.namedict = dict() # maps str to nmidx name_lookup = [] # maps idx to str # eg namedict['O'] = 5 & name_lookup[5] = 'O' From a4364e2f8cc96f449ba202396b701db151f15939 Mon Sep 17 00:00:00 2001 From: richard Date: Tue, 16 Jun 2020 11:12:22 +0100 Subject: [PATCH 05/14] fixed test supplying integer as atom name really topologyattrs need to be statically typed and protective about this --- testsuite/MDAnalysisTests/core/test_topologyattrs.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/testsuite/MDAnalysisTests/core/test_topologyattrs.py b/testsuite/MDAnalysisTests/core/test_topologyattrs.py index 6fa082b3b3f..b4c2dc14f23 100644 --- a/testsuite/MDAnalysisTests/core/test_topologyattrs.py +++ b/testsuite/MDAnalysisTests/core/test_topologyattrs.py @@ -93,6 +93,7 @@ class TestAtomAttr(TopologyAttrMixin): """ values = np.array([7, 3, 69, 9993, 84, 194, 263, 501, 109, 5873]) + single_value = 567 attrclass = tpattrs.AtomAttr def test_set_atom_VE(self): @@ -112,7 +113,7 @@ def test_get_atoms(self, attr): def test_set_atoms_singular(self, attr): # set len 2 Group to len 1 value dg = DummyGroup([3, 7]) - attr.set_atoms(dg, 567) + attr.set_atoms(dg, self.single_value) assert_equal(attr.get_atoms(dg), np.array([567, 567])) def test_set_atoms_plural(self, attr): @@ -175,6 +176,7 @@ def test_cant_set_segment_indices(self, u): class TestAtomnames(TestAtomAttr): values = np.array(['O', 'C', 'CA', 'N', 'CB', 'CG', 'CD', 'NA', 'CL', 'OW'], dtype=np.object) + single_value = 'Ca2' attrclass = tpattrs.Atomnames From b7b20b37baabdbe57fbe072eae055a6396cdee2e Mon Sep 17 00:00:00 2001 From: Richard Gowers Date: Tue, 16 Jun 2020 14:19:06 +0100 Subject: [PATCH 06/14] Update test_topologyattrs.py --- testsuite/MDAnalysisTests/core/test_topologyattrs.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/testsuite/MDAnalysisTests/core/test_topologyattrs.py b/testsuite/MDAnalysisTests/core/test_topologyattrs.py index b4c2dc14f23..e9e68d8717f 100644 --- a/testsuite/MDAnalysisTests/core/test_topologyattrs.py +++ b/testsuite/MDAnalysisTests/core/test_topologyattrs.py @@ -114,7 +114,8 @@ def test_set_atoms_singular(self, attr): # set len 2 Group to len 1 value dg = DummyGroup([3, 7]) attr.set_atoms(dg, self.single_value) - assert_equal(attr.get_atoms(dg), np.array([567, 567])) + assert_equal(attr.get_atoms(dg), + np.array([self.single_value, self.single_value])) def test_set_atoms_plural(self, attr): # set len 2 Group to len 2 values From 5611f967c7aaedacd8729439d2b1da0af98f8fd5 Mon Sep 17 00:00:00 2001 From: richard Date: Sun, 21 Jun 2020 20:59:06 +0100 Subject: [PATCH 07/14] use dict-lookup string attrs EVERYWHERERE --- package/MDAnalysis/core/selection.py | 57 +++++--- package/MDAnalysis/core/topologyattrs.py | 166 ++++++++++++++++++----- 2 files changed, 176 insertions(+), 47 deletions(-) diff --git a/package/MDAnalysis/core/selection.py b/package/MDAnalysis/core/selection.py index 543cab5f4d8..5316c75fbaf 100644 --- a/package/MDAnalysis/core/selection.py +++ b/package/MDAnalysis/core/selection.py @@ -529,22 +529,9 @@ def __init__(self, parser, tokens): self.values = vals @return_empty_on_apply - def apply(self, group): - mask = np.zeros(len(group), dtype=np.bool) - values = getattr(group, self.field) - for val in self.values: - mask |= [fnmatch.fnmatch(x, val) for x in values] - return group[mask].unique - - -class AtomNameSelection(StringSelection): - """Select atoms based on 'names' attribute""" - token = 'name' - field = 'names' - def apply(self, group): # rather than work on group.names, cheat and look at the lookup table - nmattr = group.universe._topology.names + nmattr = getattr(group.universe._topology, self.field) matches = [] # list of passing indices # iterate through set of known atom names, check which pass @@ -558,6 +545,12 @@ def apply(self, group): return group[np.in1d(nmidx, matches)].unique +class AtomNameSelection(StringSelection): + """Select atoms based on 'names' attribute""" + token = 'name' + field = 'names' + + class AtomTypeSelection(StringSelection): """Select atoms based on 'types' attribute""" token = 'type' @@ -576,13 +569,30 @@ class AtomICodeSelection(StringSelection): field = 'icodes' -class ResidueNameSelection(StringSelection): +class _ResidueStringSelection(StringSelection): + def apply(self, group): + # rather than work on group.names, cheat and look at the lookup table + nmattr = getattr(group.universe._topology, self.field) + + matches = [] # list of passing indices + # iterate through set of known atom names, check which pass + for nm, ix in nmattr.namedict.items(): + if any(fnmatch.fnmatch(nm, val) for val in self.values): + matches.append(ix) + + # atomname indices for members of this group + nmidx = nmattr.nmidx[group.resindices] + + return group[np.in1d(nmidx, matches)].unique + + +class ResidueNameSelection(_ResidueStringSelection): """Select atoms based on 'resnames' attribute""" token = 'resname' field = 'resnames' -class MoleculeTypeSelection(StringSelection): +class MoleculeTypeSelection(_ResidueStringSelection): """Select atoms based on 'moltypes' attribute""" token = 'moltype' field = 'moltypes' @@ -593,6 +603,21 @@ class SegmentNameSelection(StringSelection): token = 'segid' field = 'segids' + def apply(self, group): + # rather than work on group.names, cheat and look at the lookup table + nmattr = group.universe._topology.segids + + matches = [] # list of passing indices + # iterate through set of known atom names, check which pass + for nm, ix in nmattr.namedict.items(): + if any(fnmatch.fnmatch(nm, val) for val in self.values): + matches.append(ix) + + # atomname indices for members of this group + nmidx = nmattr.nmidx[group.segindices] + + return group[np.in1d(nmidx, matches)].unique + class AltlocSelection(StringSelection): """Select atoms based on 'altLoc' attribute""" diff --git a/package/MDAnalysis/core/topologyattrs.py b/package/MDAnalysis/core/topologyattrs.py index caacf7657d9..a591f09cb3a 100644 --- a/package/MDAnalysis/core/topologyattrs.py +++ b/package/MDAnalysis/core/topologyattrs.py @@ -473,16 +473,7 @@ def _gen_initial_values(na, nr, ns): return np.arange(1, na + 1) -# TODO: update docs to property doc -class Atomnames(AtomAttr): - """Name for each atom. - """ - attrname = 'names' - singular = 'name' - per_object = 'atom' - dtype = object - transplants = defaultdict(list) - +class _AtomStringAttr(AtomAttr): def __init__(self, vals, guessed=False): self._guessed = guessed @@ -538,6 +529,17 @@ def set_atoms(self, ag, values): self.name_lookup = np.concatenate([self.name_lookup, newnames]) self.values = self.name_lookup[self.nmidx] + +# TODO: update docs to property doc +class Atomnames(_AtomStringAttr): + """Name for each atom. + """ + attrname = 'names' + singular = 'name' + per_object = 'atom' + dtype = object + transplants = defaultdict(list) + def phi_selection(residue, c_name='C', n_name='N', ca_name='CA'): """Select AtomGroup corresponding to the phi protein backbone dihedral C'-N-CA-C. @@ -1009,20 +1011,16 @@ def chi1_selections(residues, n_name='N', ca_name='CA', cb_name='CB', # TODO: update docs to property doc -class Atomtypes(AtomAttr): +class Atomtypes(_AtomStringAttr): """Type for each atom""" attrname = 'types' singular = 'type' per_object = 'atom' dtype = object - @staticmethod - def _gen_initial_values(na, nr, ns): - return np.array(['' for _ in range(na)], dtype=object) - # TODO: update docs to property doc -class Elements(AtomAttr): +class Elements(_AtomStringAttr): """Element for each atom""" attrname = 'elements' singular = 'element' @@ -1046,7 +1044,7 @@ def _gen_initial_values(na, nr, ns): return np.zeros(na) -class RecordTypes(AtomAttr): +class RecordTypes(_AtomStringAttr): """For PDB-like formats, indicates if ATOM or HETATM Defaults to 'ATOM' @@ -1064,7 +1062,7 @@ def _gen_initial_values(na, nr, ns): return np.array(['ATOM'] * na, dtype=object) -class ChainIDs(AtomAttr): +class ChainIDs(_AtomStringAttr): """ChainID per atom Note @@ -1076,10 +1074,6 @@ class ChainIDs(AtomAttr): per_object = 'atom' dtype = object - @staticmethod - def _gen_initial_values(na, nr, ns): - return np.array(['' for _ in range(na)], dtype=object) - class Tempfactors(AtomAttr): """Tempfactor for atoms""" @@ -1625,7 +1619,7 @@ def _gen_initial_values(na, nr, ns): # TODO: update docs to property doc -class AltLocs(AtomAttr): +class AltLocs(_AtomStringAttr): """AltLocs for each atom""" attrname = 'altLocs' singular = 'altLoc' @@ -1778,8 +1772,65 @@ def _gen_initial_values(na, nr, ns): return np.arange(1, nr + 1) +class _ResidueStringAttr(ResidueAttr): + def __init__(self, vals, guessed=False): + self._guessed = guessed + + self.namedict = dict() # maps str to nmidx + name_lookup = [] # maps idx to str + # eg namedict['O'] = 5 & name_lookup[5] = 'O' + + self.nmidx = np.zeros_like(vals, dtype=int) # the lookup for each atom + # eg Atom 5 is 'C', so nmidx[5] = 7, where name_lookup[7] = 'C' + + for i, val in enumerate(vals): + try: + self.nmidx[i] = self.namedict[val] + except KeyError: + nextidx = len(self.namedict) + self.namedict[val] = nextidx + name_lookup.append(val) + + self.nmidx[i] = nextidx + + self.name_lookup = np.array(name_lookup, dtype=object) + self.values = self.name_lookup[self.nmidx] + + @staticmethod + def _gen_initial_values(na, nr, ns): + return np.array(['' for _ in range(nr)], dtype=object) + + @_check_length + def set_residues(self, rg, values): + newnames = [] + + # two possibilities, either single value given, or one per Atom + if isinstance(values, str): + try: + newidx = self.namedict[values] + except KeyError: + newidx = len(self.namedict) + self.namedict[values] = newidx + newnames.append(values) + else: + newidx = np.zeros_like(values, dtype=int) + for i, val in enumerate(values): + try: + newidx[i] = self.namedict[val] + except KeyError: + nextidx = len(self.namedict) + self.namedict[val] = nextidx + newnames.append(val) + newidx[i] = nextidx + + self.nmidx[rg.ix] = newidx # newidx either single value or same size array + if newnames: + self.name_lookup = np.concatenate([self.name_lookup, newnames]) + self.values = self.name_lookup[self.nmidx] + + # TODO: update docs to property doc -class Resnames(ResidueAttr): +class Resnames(_ResidueStringAttr): attrname = 'resnames' singular = 'resname' transplants = defaultdict(list) @@ -1898,18 +1949,14 @@ def _gen_initial_values(na, nr, ns): return np.arange(1, nr + 1) -class ICodes(ResidueAttr): +class ICodes(_ResidueStringAttr): """Insertion code for Atoms""" attrname = 'icodes' singular = 'icode' dtype = object - @staticmethod - def _gen_initial_values(na, nr, ns): - return np.array(['' for _ in range(nr)], dtype=object) - -class Moltypes(ResidueAttr): +class Moltypes(_ResidueStringAttr): """Name of the molecule type Two molecules that share a molecule type share a common template topology. @@ -1961,8 +2008,65 @@ def set_segments(self, sg, values): self.values[sg.ix] = values +class _SegmentStringAttr(SegmentAttr): + def __init__(self, vals, guessed=False): + self._guessed = guessed + + self.namedict = dict() # maps str to nmidx + name_lookup = [] # maps idx to str + # eg namedict['O'] = 5 & name_lookup[5] = 'O' + + self.nmidx = np.zeros_like(vals, dtype=int) # the lookup for each atom + # eg Atom 5 is 'C', so nmidx[5] = 7, where name_lookup[7] = 'C' + + for i, val in enumerate(vals): + try: + self.nmidx[i] = self.namedict[val] + except KeyError: + nextidx = len(self.namedict) + self.namedict[val] = nextidx + name_lookup.append(val) + + self.nmidx[i] = nextidx + + self.name_lookup = np.array(name_lookup, dtype=object) + self.values = self.name_lookup[self.nmidx] + + @staticmethod + def _gen_initial_values(na, nr, ns): + return np.array(['' for _ in range(nr)], dtype=object) + + @_check_length + def set_segments(self, sg, values): + newnames = [] + + # two possibilities, either single value given, or one per Atom + if isinstance(values, str): + try: + newidx = self.namedict[values] + except KeyError: + newidx = len(self.namedict) + self.namedict[values] = newidx + newnames.append(values) + else: + newidx = np.zeros_like(values, dtype=int) + for i, val in enumerate(values): + try: + newidx[i] = self.namedict[val] + except KeyError: + nextidx = len(self.namedict) + self.namedict[val] = nextidx + newnames.append(val) + newidx[i] = nextidx + + self.nmidx[sg.ix] = newidx # newidx either single value or same size array + if newnames: + self.name_lookup = np.concatenate([self.name_lookup, newnames]) + self.values = self.name_lookup[self.nmidx] + + # TODO: update docs to property doc -class Segids(SegmentAttr): +class Segids(_SegmentStringAttr): attrname = 'segids' singular = 'segid' transplants = defaultdict(list) From 6a6636131eddba234f749b634e065889c978c611 Mon Sep 17 00:00:00 2001 From: richard Date: Sun, 5 Jul 2020 11:06:06 +0100 Subject: [PATCH 08/14] removed some code duplication made protein selection faster, 48ms -> 0.5ms on GRO testfile --- package/CHANGELOG | 2 + package/MDAnalysis/core/selection.py | 57 ++++++++++------------------ 2 files changed, 22 insertions(+), 37 deletions(-) diff --git a/package/CHANGELOG b/package/CHANGELOG index e5e506d82a1..6325737a0b0 100644 --- a/package/CHANGELOG +++ b/package/CHANGELOG @@ -58,6 +58,8 @@ Enhancements * Added Hydrogen Bond Lifetime keyword "between" (PR #2791) * Dead code removed from the TPR parser and increased test coverage (PR #2840) * TPR parser exposes the elements topology attribute (PR #2858, see Issue #2553) + * Improved performance of select_atoms on strings (e.g. name, type, resname) and + 'protein' selection (#2751 PR #2755) Changes * deprecated NumPy type aliases have been replaced with their actual types diff --git a/package/MDAnalysis/core/selection.py b/package/MDAnalysis/core/selection.py index 5316c75fbaf..3282b4425b2 100644 --- a/package/MDAnalysis/core/selection.py +++ b/package/MDAnalysis/core/selection.py @@ -515,7 +515,7 @@ def apply(self, group): return group[mask] -class StringSelection(Selection): +class _ProtoStringSelection(Selection): """Selections based on text attributes .. versionchanged:: 1.0.0 @@ -540,11 +540,15 @@ def apply(self, group): matches.append(ix) # atomname indices for members of this group - nmidx = nmattr.nmidx[group.ix] + nmidx = nmattr.nmidx[getattr(group, self.level)] return group[np.in1d(nmidx, matches)].unique +class StringSelection(_ProtoStringSelection): + level = 'ix' # operates on atom level attribute, i.e. '.ix' + + class AtomNameSelection(StringSelection): """Select atoms based on 'names' attribute""" token = 'name' @@ -569,21 +573,8 @@ class AtomICodeSelection(StringSelection): field = 'icodes' -class _ResidueStringSelection(StringSelection): - def apply(self, group): - # rather than work on group.names, cheat and look at the lookup table - nmattr = getattr(group.universe._topology, self.field) - - matches = [] # list of passing indices - # iterate through set of known atom names, check which pass - for nm, ix in nmattr.namedict.items(): - if any(fnmatch.fnmatch(nm, val) for val in self.values): - matches.append(ix) - - # atomname indices for members of this group - nmidx = nmattr.nmidx[group.resindices] - - return group[np.in1d(nmidx, matches)].unique +class _ResidueStringSelection(_ProtoStringSelection): + level= 'resindices' class ResidueNameSelection(_ResidueStringSelection): @@ -598,25 +589,11 @@ class MoleculeTypeSelection(_ResidueStringSelection): field = 'moltypes' -class SegmentNameSelection(StringSelection): +class SegmentNameSelection(_ProtoStringSelection): """Select atoms based on 'segids' attribute""" token = 'segid' field = 'segids' - - def apply(self, group): - # rather than work on group.names, cheat and look at the lookup table - nmattr = group.universe._topology.segids - - matches = [] # list of passing indices - # iterate through set of known atom names, check which pass - for nm, ix in nmattr.namedict.items(): - if any(fnmatch.fnmatch(nm, val) for val in self.values): - matches.append(ix) - - # atomname indices for members of this group - nmidx = nmattr.nmidx[group.segindices] - - return group[np.in1d(nmidx, matches)].unique + level = 'segindices' class AltlocSelection(StringSelection): @@ -845,7 +822,7 @@ class ProteinSelection(Selection): """ token = 'protein' - prot_res = np.array([ + prot_res = { # CHARMM top_all27_prot_lipid.rtf 'ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY', 'HSD', 'HSE', 'HSP', 'ILE', 'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER', 'THR', @@ -868,14 +845,20 @@ class ProteinSelection(Selection): 'CLEU', 'CILE', 'CVAL', 'CASF', 'CASN', 'CGLN', 'CARG', 'CHID', 'CHIE', 'CHIP', 'CTRP', 'CPHE', 'CTYR', 'CGLU', 'CASP', 'CLYS', 'CPRO', 'CCYS', 'CCYX', 'CMET', 'CME', 'ASF', - ]) + } def __init__(self, parser, tokens): pass def apply(self, group): - mask = np.in1d(group.resnames, self.prot_res) - return group[mask].unique + resname_attr = group.universe._topology.resnames + # which values in resname attr are in prot_res? + matches = [ix for (nm, ix) in resname_attr.namedict.items() + if nm in self.prot_res] + # index of each atom's resname + nmidx = resname_attr.nmidx[group.resindices] + # intersect atom's resname index and matches to prot_res + return group[np.in1d(nmidx, matches)].unique class NucleicSelection(Selection): From 3c4dc3265310a312930f79d9b8025a4598cd4f61 Mon Sep 17 00:00:00 2001 From: richard Date: Sat, 11 Jul 2020 16:38:49 +0100 Subject: [PATCH 09/14] improved nucleic/backbone selections --- package/CHANGELOG | 1 + package/MDAnalysis/core/selection.py | 119 +++++++++++++++--- .../core/test_atomselections.py | 2 +- 3 files changed, 102 insertions(+), 20 deletions(-) diff --git a/package/CHANGELOG b/package/CHANGELOG index 6325737a0b0..80e8161344a 100644 --- a/package/CHANGELOG +++ b/package/CHANGELOG @@ -39,6 +39,7 @@ Fixes * In hydrogenbonds.hbond_analysis.HydrogenbondAnalysis an AttributeError was thrown when finding D-H pairs via the topology if `hydrogens` was an empty AtomGroup (Issue #2848) + * Fixed performance regression on select_atoms for string selections (#2751) Enhancements * Refactored analysis.helanal into analysis.helix_analysis diff --git a/package/MDAnalysis/core/selection.py b/package/MDAnalysis/core/selection.py index 3282b4425b2..82f28e89980 100644 --- a/package/MDAnalysis/core/selection.py +++ b/package/MDAnalysis/core/selection.py @@ -819,6 +819,10 @@ class ProteinSelection(Selection): See Also -------- :func:`MDAnalysis.lib.util.convert_aa_code` + + .. versionchanged:: 2.0.0 + prot_res changed to set (from numpy array) + performance improved by ~100x on larger systems """ token = 'protein' @@ -873,23 +877,32 @@ class NucleicSelection(Selection): .. versionchanged:: 0.8 additional Gromacs selections + .. versionchanged:: 2.0.0 + nucl_res changed to set (from numpy array) + performance improved by ~100x on larger systems """ token = 'nucleic' - nucl_res = np.array([ + nucl_res = { 'ADE', 'URA', 'CYT', 'GUA', 'THY', 'DA', 'DC', 'DG', 'DT', 'RA', 'RU', 'RG', 'RC', 'A', 'T', 'U', 'C', 'G', 'DA5', 'DC5', 'DG5', 'DT5', 'DA3', 'DC3', 'DG3', 'DT3', 'RA5', 'RU5', 'RG5', 'RC5', 'RA3', 'RU3', 'RG3', 'RC3' - ]) + } def __init__(self, parser, tokens): pass def apply(self, group): - mask = np.in1d(group.resnames, self.nucl_res) + resnames = group.universe._topology.resnames + nmidx = resnames.nmidx[group.resindices] + + matches = [ix for (nm, ix) in resnames.namedict.items() + if nm in self.nucl_res] + mask = np.in1d(nmidx, matches) + return group[mask].unique @@ -898,14 +911,31 @@ class BackboneSelection(ProteinSelection): This excludes OT* on C-termini (which are included by, eg VMD's backbone selection). + + .. versionchanged:: 2.0.0 + bb_atoms changed to set (from numpy array) + performance improved by ~100x on larger systems """ token = 'backbone' - bb_atoms = np.array(['N', 'CA', 'C', 'O']) + bb_atoms = {'N', 'CA', 'C', 'O'} def apply(self, group): - mask = np.in1d(group.names, self.bb_atoms) - mask &= np.in1d(group.resnames, self.prot_res) - return group[mask].unique + atomnames = group.universe._topology.names + resnames = group.universe._topology.resnames + + # filter by atom names + name_matches = [ix for (nm, ix) in atomnames.namedict.items() + if nm in self.bb_atoms] + nmidx = atomnames.nmidx[group.ix] + group = group[np.in1d(nmidx, name_matches)] + + # filter by resnames + resname_matches = [ix for (nm, ix) in resnames.namedict.items() + if nm in self.prot_res] + nmidx = resnames.nmidx[group.resindices] + group = group[np.in1d(nmidx, resname_matches)] + + return group.unique class NucleicBackboneSelection(NucleicSelection): @@ -913,14 +943,31 @@ class NucleicBackboneSelection(NucleicSelection): These atoms are only recognized if they are in a residue matched by the :class:`NucleicSelection`. + + .. versionchanged:: 2.0.0 + bb_atoms changed to set (from numpy array) + performance improved by ~100x on larger systems """ token = 'nucleicbackbone' - bb_atoms = np.array(["P", "C5'", "C3'", "O3'", "O5'"]) + bb_atoms = {"P", "C5'", "C3'", "O3'", "O5'"} def apply(self, group): - mask = np.in1d(group.names, self.bb_atoms) - mask &= np.in1d(group.resnames, self.nucl_res) - return group[mask].unique + atomnames = group.universe._topology.names + resnames = group.universe._topology.resnames + + # filter by atom names + name_matches = [ix for (nm, ix) in atomnames.namedict.items() + if nm in self.bb_atoms] + nmidx = atomnames.nmidx[group.ix] + group = group[np.in1d(nmidx, name_matches)] + + # filter by resnames + resname_matches = [ix for (nm, ix) in resnames.namedict.items() + if nm in self.nucl_res] + nmidx = resnames.nmidx[group.resindices] + group = group[np.in1d(nmidx, resname_matches)] + + return group.unique class BaseSelection(NucleicSelection): @@ -930,29 +977,63 @@ class BaseSelection(NucleicSelection): 'N9', 'N7', 'C8', 'C5', 'C4', 'N3', 'C2', 'N1', 'C6', 'O6','N2','N6', 'O2','N4','O4','C5M' + + .. versionchanged:: 2.0.0 + base_atoms changed to set (from numpy array) + performance improved by ~100x on larger systems """ token = 'nucleicbase' - base_atoms = np.array([ + base_atoms = { 'N9', 'N7', 'C8', 'C5', 'C4', 'N3', 'C2', 'N1', 'C6', 'O6', 'N2', 'N6', - 'O2', 'N4', 'O4', 'C5M']) + 'O2', 'N4', 'O4', 'C5M'} def apply(self, group): - mask = np.in1d(group.names, self.base_atoms) - mask &= np.in1d(group.resnames, self.nucl_res) - return group[mask].unique + atomnames = group.universe._topology.names + resnames = group.universe._topology.resnames + + # filter by atom names + name_matches = [ix for (nm, ix) in atomnames.namedict.items() + if nm in self.base_atoms] + nmidx = atomnames.nmidx[group.ix] + group = group[np.in1d(nmidx, name_matches)] + + # filter by resnames + resname_matches = [ix for (nm, ix) in resnames.namedict.items() + if nm in self.nucl_res] + nmidx = resnames.nmidx[group.resindices] + group = group[np.in1d(nmidx, resname_matches)] + + return group.unique class NucleicSugarSelection(NucleicSelection): """Contains all atoms with name C1', C2', C3', C4', O2', O4', O3'. + + .. versionchanged:: 2.0.0 + sug_atoms changed to set (from numpy array) + performance improved by ~100x on larger systems """ token = 'nucleicsugar' sug_atoms = np.array(["C1'", "C2'", "C3'", "C4'", "O4'"]) def apply(self, group): - mask = np.in1d(group.names, self.sug_atoms) - mask &= np.in1d(group.resnames, self.nucl_res) - return group[mask].unique + atomnames = group.universe._topology.names + resnames = group.universe._topology.resnames + + # filter by atom names + name_matches = [ix for (nm, ix) in atomnames.namedict.items() + if nm in self.sug_atoms] + nmidx = atomnames.nmidx[group.ix] + group = group[np.in1d(nmidx, name_matches)] + + # filter by resnames + resname_matches = [ix for (nm, ix) in resnames.namedict.items() + if nm in self.nucl_res] + nmidx = resnames.nmidx[group.resindices] + group = group[np.in1d(nmidx, resname_matches)] + + return group.unique class PropertySelection(Selection): diff --git a/testsuite/MDAnalysisTests/core/test_atomselections.py b/testsuite/MDAnalysisTests/core/test_atomselections.py index b7d0f515b7f..4042bb84820 100644 --- a/testsuite/MDAnalysisTests/core/test_atomselections.py +++ b/testsuite/MDAnalysisTests/core/test_atomselections.py @@ -74,7 +74,7 @@ def test_protein(self, universe): sorted(universe.select_atoms('segid 4AKE').indices), "selected protein is not the same as auto-generated protein segment s4AKE") - @pytest.mark.parametrize('resname', MDAnalysis.core.selection.ProteinSelection.prot_res) + @pytest.mark.parametrize('resname', sorted(MDAnalysis.core.selection.ProteinSelection.prot_res)) def test_protein_resnames(self, resname): u = make_Universe(('resnames',)) # set half the residues' names to the resname we're testing From 07757a50abee3d94ebb73932d43547c0a28de10c Mon Sep 17 00:00:00 2001 From: richard Date: Sun, 26 Jul 2020 09:07:32 +0100 Subject: [PATCH 10/14] Added explicit tests for Resnames topologyattr tests now provide str types for resnames/icodes --- .../core/test_topologyattrs.py | 30 ++++++++++++------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/testsuite/MDAnalysisTests/core/test_topologyattrs.py b/testsuite/MDAnalysisTests/core/test_topologyattrs.py index e9e68d8717f..270491514af 100644 --- a/testsuite/MDAnalysisTests/core/test_topologyattrs.py +++ b/testsuite/MDAnalysisTests/core/test_topologyattrs.py @@ -209,18 +209,19 @@ class TestResidueAttr(TopologyAttrMixin): """Test residue-level TopologyAttrs. """ + single_value = 2 values = np.array([15.2, 395.6, 0.1, 9.8]) attrclass = tpattrs.ResidueAttr - def test_set_residue_VE(self): - u = make_Universe(('resnames',)) - res = u.residues[0] + def test_set_residue_VE(self, universe): + # setting e.g. resname to 2 values should fail with VE + res = universe.residues[0] with pytest.raises(ValueError): - setattr(res, 'resname', ['wrong', 'length']) + setattr(res, self.attrclass.singular, self.values[:2]) def test_get_atoms(self, attr): assert_equal(attr.get_atoms(DummyGroup([7, 3, 9])), - self.values[[3, 2, 2]]) + self.values[[3, 2, 2]]) def test_get_atom(self, universe): attr = getattr(universe.atoms[0], self.attrclass.singular) @@ -228,14 +229,14 @@ def test_get_atom(self, universe): def test_get_residues(self, attr): assert_equal(attr.get_residues(DummyGroup([1, 2, 1, 3])), - self.values[[1, 2, 1, 3]]) + self.values[[1, 2, 1, 3]]) def test_set_residues_singular(self, attr): dg = DummyGroup([3, 0, 1]) - attr.set_residues(dg, 2) + attr.set_residues(dg, self.single_value) - assert_almost_equal(attr.get_residues(dg), - np.array([2, 2, 2])) + assert_equal(attr.get_residues(dg), + np.array([self.single_value]*3, dtype=self.values.dtype)) def test_set_residues_plural(self, attr): attr.set_residues(DummyGroup([3, 0, 1]), @@ -257,10 +258,17 @@ def test_get_segments(self, attr): assert_equal(attr.get_segments(DummyGroup([0, 1, 1])), [self.values[[0, 3]], self.values[[1, 2]], self.values[[1, 2]]]) -class TestICodes(TestResidueAttr): - values = np.array(['a', 'b', '', 'd']) + +class TestResnames(TestResidueAttr): + attrclass = tpattrs.Resnames + single_value = 'xyz' + values = np.array(['a', 'b', '', 'd'], dtype=object) + + +class TestICodes(TestResnames): attrclass = tpattrs.ICodes + class TestResids(TestResidueAttr): values = np.array([10, 11, 18, 20]) attrclass = tpattrs.Resids From 95f4d15a4a8851f48f5b794a5f5e239e183a28b0 Mon Sep 17 00:00:00 2001 From: richard Date: Sun, 26 Jul 2020 09:09:16 +0100 Subject: [PATCH 11/14] use fnmatchcase to be case sensitive --- package/MDAnalysis/core/selection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package/MDAnalysis/core/selection.py b/package/MDAnalysis/core/selection.py index 82f28e89980..fa38ff474b3 100644 --- a/package/MDAnalysis/core/selection.py +++ b/package/MDAnalysis/core/selection.py @@ -536,7 +536,7 @@ def apply(self, group): matches = [] # list of passing indices # iterate through set of known atom names, check which pass for nm, ix in nmattr.namedict.items(): - if any(fnmatch.fnmatch(nm, val) for val in self.values): + if any(fnmatch.fnmatchcase(nm, val) for val in self.values): matches.append(ix) # atomname indices for members of this group From 5ae8edd4c0b0a76bddeb7f5aa28ecaa8ad6209d8 Mon Sep 17 00:00:00 2001 From: Oliver Beckstein Date: Sat, 15 Aug 2020 23:29:53 -0700 Subject: [PATCH 12/14] Update package/MDAnalysis/core/selection.py @jbarnoud's fix --- package/MDAnalysis/core/selection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package/MDAnalysis/core/selection.py b/package/MDAnalysis/core/selection.py index fa38ff474b3..47e51d1a858 100644 --- a/package/MDAnalysis/core/selection.py +++ b/package/MDAnalysis/core/selection.py @@ -1015,7 +1015,7 @@ class NucleicSugarSelection(NucleicSelection): performance improved by ~100x on larger systems """ token = 'nucleicsugar' - sug_atoms = np.array(["C1'", "C2'", "C3'", "C4'", "O4'"]) + sug_atoms = {"C1'", "C2'", "C3'", "C4'", "O4'"} def apply(self, group): atomnames = group.universe._topology.names From 640c411285dbfdae44161739ed1ff842a8e06560 Mon Sep 17 00:00:00 2001 From: Oliver Beckstein Date: Tue, 18 Aug 2020 23:37:57 -0700 Subject: [PATCH 13/14] apply suggestions from code review Co-authored-by: Irfan Alibay --- package/MDAnalysis/core/selection.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/package/MDAnalysis/core/selection.py b/package/MDAnalysis/core/selection.py index 47e51d1a858..dc8747832ea 100644 --- a/package/MDAnalysis/core/selection.py +++ b/package/MDAnalysis/core/selection.py @@ -820,6 +820,7 @@ class ProteinSelection(Selection): -------- :func:`MDAnalysis.lib.util.convert_aa_code` + .. versionchanged:: 2.0.0 prot_res changed to set (from numpy array) performance improved by ~100x on larger systems @@ -912,6 +913,7 @@ class BackboneSelection(ProteinSelection): This excludes OT* on C-termini (which are included by, eg VMD's backbone selection). + .. versionchanged:: 2.0.0 bb_atoms changed to set (from numpy array) performance improved by ~100x on larger systems @@ -944,6 +946,7 @@ class NucleicBackboneSelection(NucleicSelection): These atoms are only recognized if they are in a residue matched by the :class:`NucleicSelection`. + .. versionchanged:: 2.0.0 bb_atoms changed to set (from numpy array) performance improved by ~100x on larger systems @@ -978,6 +981,7 @@ class BaseSelection(NucleicSelection): 'N9', 'N7', 'C8', 'C5', 'C4', 'N3', 'C2', 'N1', 'C6', 'O6','N2','N6', 'O2','N4','O4','C5M' + .. versionchanged:: 2.0.0 base_atoms changed to set (from numpy array) performance improved by ~100x on larger systems @@ -1010,6 +1014,7 @@ def apply(self, group): class NucleicSugarSelection(NucleicSelection): """Contains all atoms with name C1', C2', C3', C4', O2', O4', O3'. + .. versionchanged:: 2.0.0 sug_atoms changed to set (from numpy array) performance improved by ~100x on larger systems From ee64c32bc3db2575301166e688b9de038e0197e4 Mon Sep 17 00:00:00 2001 From: richardjgowers Date: Tue, 25 Aug 2020 11:30:09 +0100 Subject: [PATCH 14/14] added test for setting multiple segids at once --- .../MDAnalysisTests/core/test_segmentgroup.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/testsuite/MDAnalysisTests/core/test_segmentgroup.py b/testsuite/MDAnalysisTests/core/test_segmentgroup.py index 3f0c251e543..546c5ad44cf 100644 --- a/testsuite/MDAnalysisTests/core/test_segmentgroup.py +++ b/testsuite/MDAnalysisTests/core/test_segmentgroup.py @@ -88,6 +88,24 @@ def test_set_segid_updates_(universe): err_msg="old selection was not changed in place after set_segid") +def test_set_segids_many(): + u = mda.Universe.empty(n_atoms=6, n_residues=2, n_segments=2, + atom_resindex=[0, 0, 0, 1, 1, 1], residue_segindex=[0,1]) + u.add_TopologyAttr('segids', ['A', 'B']) + + # universe with 2 segments, A and B + + u.segments.segids = ['X', 'Y'] + + assert u.segments[0].segid == 'X' + assert u.segments[1].segid == 'Y' + + assert len(u.select_atoms('segid A')) == 0 + assert len(u.select_atoms('segid B')) == 0 + assert len(u.select_atoms('segid X')) == 3 + assert len(u.select_atoms('segid Y')) == 3 + + def test_atom_order(universe): assert_equal(universe.segments.atoms.indices, sorted(universe.segments.atoms.indices))