Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions package/CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ Fixes
* In hydrogenbonds.hbond_analysis.HydrogenbondAnalysis an AttributeError
was thrown when finding D-H pairs via the topology if `hydrogens` was an
empty AtomGroup (Issue #2848)
* Fixed performance regression on select_atoms for string selections (#2751)
* Fixed the DMSParser, allowing the creation of multiple segids sharing
residues with identical resids (Issue #1387, PR #2872)
* H5MD files are now picklable with H5PYPicklable (Issue #2890, PR #2894)
Expand Down Expand Up @@ -79,6 +80,8 @@ Enhancements
* Added new kwargs `select_remove` and `select_protein` to
analysis.dihedrals.Janin analysis to give user more fine grained control
over selections (PR #2899)
* Improved performance of select_atoms on strings (e.g. name, type, resname) and
'protein' selection (#2751 PR #2755)
* Added an RDKit converter that works for any input with all hydrogens
explicit in the topology (Issue #2468, PR #2775)

Expand Down
175 changes: 142 additions & 33 deletions package/MDAnalysis/core/selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -515,7 +515,7 @@ def apply(self, group):
return group[mask]


class StringSelection(Selection):
class _ProtoStringSelection(Selection):
"""Selections based on text attributes

.. versionchanged:: 1.0.0
Expand All @@ -530,11 +530,23 @@ def __init__(self, parser, tokens):

@return_empty_on_apply
def apply(self, group):
mask = np.zeros(len(group), dtype=bool)
for val in self.values:
values = getattr(group, self.field)
mask |= [fnmatch.fnmatch(x, val) for x in values]
return group[mask].unique
# rather than work on group.names, cheat and look at the lookup table
nmattr = getattr(group.universe._topology, self.field)

matches = [] # list of passing indices
# iterate through set of known atom names, check which pass
for nm, ix in nmattr.namedict.items():
if any(fnmatch.fnmatchcase(nm, val) for val in self.values):
matches.append(ix)

# atomname indices for members of this group
nmidx = nmattr.nmidx[getattr(group, self.level)]

return group[np.in1d(nmidx, matches)].unique


class StringSelection(_ProtoStringSelection):
level = 'ix' # operates on atom level attribute, i.e. '.ix'


class AtomNameSelection(StringSelection):
Expand All @@ -561,22 +573,27 @@ class AtomICodeSelection(StringSelection):
field = 'icodes'


class ResidueNameSelection(StringSelection):
class _ResidueStringSelection(_ProtoStringSelection):
level= 'resindices'


class ResidueNameSelection(_ResidueStringSelection):
"""Select atoms based on 'resnames' attribute"""
token = 'resname'
field = 'resnames'


class MoleculeTypeSelection(StringSelection):
class MoleculeTypeSelection(_ResidueStringSelection):
"""Select atoms based on 'moltypes' attribute"""
token = 'moltype'
field = 'moltypes'


class SegmentNameSelection(StringSelection):
class SegmentNameSelection(_ProtoStringSelection):
"""Select atoms based on 'segids' attribute"""
token = 'segid'
field = 'segids'
level = 'segindices'


class AltlocSelection(StringSelection):
Expand Down Expand Up @@ -802,10 +819,15 @@ class ProteinSelection(Selection):
See Also
--------
:func:`MDAnalysis.lib.util.convert_aa_code`


.. versionchanged:: 2.0.0
prot_res changed to set (from numpy array)
performance improved by ~100x on larger systems
"""
token = 'protein'

prot_res = np.array([
prot_res = {
# CHARMM top_all27_prot_lipid.rtf
'ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY', 'HSD',
'HSE', 'HSP', 'ILE', 'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER', 'THR',
Expand All @@ -828,14 +850,20 @@ class ProteinSelection(Selection):
'CLEU', 'CILE', 'CVAL', 'CASF', 'CASN', 'CGLN', 'CARG', 'CHID', 'CHIE',
'CHIP', 'CTRP', 'CPHE', 'CTYR', 'CGLU', 'CASP', 'CLYS', 'CPRO', 'CCYS',
'CCYX', 'CMET', 'CME', 'ASF',
])
}

def __init__(self, parser, tokens):
pass

def apply(self, group):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

select_atoms('protein') went from 50ms to 0.5ms on GRO

mask = np.in1d(group.resnames, self.prot_res)
return group[mask].unique
resname_attr = group.universe._topology.resnames
# which values in resname attr are in prot_res?
matches = [ix for (nm, ix) in resname_attr.namedict.items()
if nm in self.prot_res]
# index of each atom's resname
nmidx = resname_attr.nmidx[group.resindices]
# intersect atom's resname index and matches to prot_res
return group[np.in1d(nmidx, matches)].unique


class NucleicSelection(Selection):
Expand All @@ -850,23 +878,32 @@ class NucleicSelection(Selection):

.. versionchanged:: 0.8
additional Gromacs selections
.. versionchanged:: 2.0.0
nucl_res changed to set (from numpy array)
performance improved by ~100x on larger systems
"""
token = 'nucleic'

nucl_res = np.array([
nucl_res = {
'ADE', 'URA', 'CYT', 'GUA', 'THY', 'DA', 'DC', 'DG', 'DT', 'RA',
'RU', 'RG', 'RC', 'A', 'T', 'U', 'C', 'G',
'DA5', 'DC5', 'DG5', 'DT5',
'DA3', 'DC3', 'DG3', 'DT3',
'RA5', 'RU5', 'RG5', 'RC5',
'RA3', 'RU3', 'RG3', 'RC3'
])
}

def __init__(self, parser, tokens):
pass

def apply(self, group):
mask = np.in1d(group.resnames, self.nucl_res)
resnames = group.universe._topology.resnames
nmidx = resnames.nmidx[group.resindices]

matches = [ix for (nm, ix) in resnames.namedict.items()
if nm in self.nucl_res]
mask = np.in1d(nmidx, matches)

return group[mask].unique


Expand All @@ -875,29 +912,65 @@ class BackboneSelection(ProteinSelection):

This excludes OT* on C-termini
(which are included by, eg VMD's backbone selection).


.. versionchanged:: 2.0.0
bb_atoms changed to set (from numpy array)
performance improved by ~100x on larger systems
"""
token = 'backbone'
bb_atoms = np.array(['N', 'CA', 'C', 'O'])
bb_atoms = {'N', 'CA', 'C', 'O'}

def apply(self, group):
mask = np.in1d(group.names, self.bb_atoms)
mask &= np.in1d(group.resnames, self.prot_res)
return group[mask].unique
atomnames = group.universe._topology.names
resnames = group.universe._topology.resnames

# filter by atom names
name_matches = [ix for (nm, ix) in atomnames.namedict.items()
if nm in self.bb_atoms]
nmidx = atomnames.nmidx[group.ix]
group = group[np.in1d(nmidx, name_matches)]

# filter by resnames
resname_matches = [ix for (nm, ix) in resnames.namedict.items()
if nm in self.prot_res]
nmidx = resnames.nmidx[group.resindices]
group = group[np.in1d(nmidx, resname_matches)]

return group.unique


class NucleicBackboneSelection(NucleicSelection):
"""Contains all atoms with name "P", "C5'", C3'", "O3'", "O5'".

These atoms are only recognized if they are in a residue matched
by the :class:`NucleicSelection`.


.. versionchanged:: 2.0.0
bb_atoms changed to set (from numpy array)
performance improved by ~100x on larger systems
"""
token = 'nucleicbackbone'
bb_atoms = np.array(["P", "C5'", "C3'", "O3'", "O5'"])
bb_atoms = {"P", "C5'", "C3'", "O3'", "O5'"}

def apply(self, group):
mask = np.in1d(group.names, self.bb_atoms)
mask &= np.in1d(group.resnames, self.nucl_res)
return group[mask].unique
atomnames = group.universe._topology.names
resnames = group.universe._topology.resnames

# filter by atom names
name_matches = [ix for (nm, ix) in atomnames.namedict.items()
if nm in self.bb_atoms]
nmidx = atomnames.nmidx[group.ix]
group = group[np.in1d(nmidx, name_matches)]

# filter by resnames
resname_matches = [ix for (nm, ix) in resnames.namedict.items()
if nm in self.nucl_res]
nmidx = resnames.nmidx[group.resindices]
group = group[np.in1d(nmidx, resname_matches)]

return group.unique


class BaseSelection(NucleicSelection):
Expand All @@ -907,29 +980,65 @@ class BaseSelection(NucleicSelection):

'N9', 'N7', 'C8', 'C5', 'C4', 'N3', 'C2', 'N1', 'C6',
'O6','N2','N6', 'O2','N4','O4','C5M'


.. versionchanged:: 2.0.0
base_atoms changed to set (from numpy array)
performance improved by ~100x on larger systems
"""
token = 'nucleicbase'
base_atoms = np.array([
base_atoms = {
'N9', 'N7', 'C8', 'C5', 'C4', 'N3', 'C2', 'N1', 'C6',
'O6', 'N2', 'N6',
'O2', 'N4', 'O4', 'C5M'])
'O2', 'N4', 'O4', 'C5M'}

def apply(self, group):
mask = np.in1d(group.names, self.base_atoms)
mask &= np.in1d(group.resnames, self.nucl_res)
return group[mask].unique
atomnames = group.universe._topology.names
resnames = group.universe._topology.resnames

# filter by atom names
name_matches = [ix for (nm, ix) in atomnames.namedict.items()
if nm in self.base_atoms]
nmidx = atomnames.nmidx[group.ix]
group = group[np.in1d(nmidx, name_matches)]

# filter by resnames
resname_matches = [ix for (nm, ix) in resnames.namedict.items()
if nm in self.nucl_res]
nmidx = resnames.nmidx[group.resindices]
group = group[np.in1d(nmidx, resname_matches)]

return group.unique


class NucleicSugarSelection(NucleicSelection):
"""Contains all atoms with name C1', C2', C3', C4', O2', O4', O3'.


.. versionchanged:: 2.0.0
sug_atoms changed to set (from numpy array)
performance improved by ~100x on larger systems
"""
token = 'nucleicsugar'
sug_atoms = np.array(["C1'", "C2'", "C3'", "C4'", "O4'"])
sug_atoms = {"C1'", "C2'", "C3'", "C4'", "O4'"}

def apply(self, group):
mask = np.in1d(group.names, self.sug_atoms)
mask &= np.in1d(group.resnames, self.nucl_res)
return group[mask].unique
atomnames = group.universe._topology.names
resnames = group.universe._topology.resnames

# filter by atom names
name_matches = [ix for (nm, ix) in atomnames.namedict.items()
if nm in self.sug_atoms]
nmidx = atomnames.nmidx[group.ix]
group = group[np.in1d(nmidx, name_matches)]

# filter by resnames
resname_matches = [ix for (nm, ix) in resnames.namedict.items()
if nm in self.nucl_res]
nmidx = resnames.nmidx[group.resindices]
group = group[np.in1d(nmidx, resname_matches)]

return group.unique


class PropertySelection(Selection):
Expand Down
Loading