MDAnalysis · richardjgowers · Aug 25, 2020 · Jun 14, 2020 · Jun 14, 2020 · Jun 14, 2020
diff --git a/package/CHANGELOG b/package/CHANGELOG
@@ -45,6 +45,7 @@ Fixes
   * In hydrogenbonds.hbond_analysis.HydrogenbondAnalysis an AttributeError
     was thrown when finding D-H pairs via the topology if `hydrogens` was an
     empty AtomGroup (Issue #2848)
+  * Fixed performance regression in select_atoms string selections (Issue #2751)
   * Fixed the DMSParser, allowing the creation of multiple segids sharing
     residues with identical resids (Issue #1387, PR #2872)
   * H5MD files are now picklable with H5PYPicklable (Issue #2890, PR #2894)
@@ -79,6 +80,8 @@ Enhancements
   * Added new kwargs `select_remove` and `select_protein` to
     analysis.dihedrals.Janin analysis to give user more fine grained control
     over selections (PR #2899)
+  * Improved performance of select_atoms for string-based selections (e.g.
+    name, type, resname) and the 'protein' selection (Issue #2751, PR #2755)
   * Added an RDKit converter that works for any input with all hydrogens
     explicit in the topology (Issue #2468, PR #2775)
 

diff --git a/package/MDAnalysis/core/selection.py b/package/MDAnalysis/core/selection.py
@@ -515,7 +515,7 @@ def apply(self, group):
         return group[mask]
 
 
-class StringSelection(Selection):
+class _ProtoStringSelection(Selection):
     """Selections based on text attributes
 
     .. versionchanged:: 1.0.0
@@ -530,11 +530,23 @@ def __init__(self, parser, tokens):
 
     @return_empty_on_apply
     def apply(self, group):
-        mask = np.zeros(len(group), dtype=bool)
-        for val in self.values:
-            values = getattr(group, self.field)
-            mask |= [fnmatch.fnmatch(x, val) for x in values]
-        return group[mask].unique
+        # rather than work on group.names, cheat and look at the lookup table
+        nmattr = getattr(group.universe._topology, self.field)
+
+        matches = []  # list of passing indices
+        # iterate through set of known atom names, check which pass
+        for nm, ix in nmattr.namedict.items():
+            if any(fnmatch.fnmatchcase(nm, val) for val in self.values):
+                matches.append(ix)
+
+        # atomname indices for members of this group
+        nmidx = nmattr.nmidx[getattr(group, self.level)]
+
+        return group[np.in1d(nmidx, matches)].unique
+
+
+class StringSelection(_ProtoStringSelection):
+    level = 'ix'  # operates on atom level attribute, i.e. '.ix'
 
 
 class AtomNameSelection(StringSelection):
@@ -561,22 +573,27 @@ class AtomICodeSelection(StringSelection):
     field = 'icodes'
 
 
-class ResidueNameSelection(StringSelection):
+class _ResidueStringSelection(_ProtoStringSelection):
+    level= 'resindices'
+
+
+class ResidueNameSelection(_ResidueStringSelection):
     """Select atoms based on 'resnames' attribute"""
     token = 'resname'
     field = 'resnames'
 
 
-class MoleculeTypeSelection(StringSelection):
+class MoleculeTypeSelection(_ResidueStringSelection):
     """Select atoms based on 'moltypes' attribute"""
     token = 'moltype'
     field = 'moltypes'
 
 
-class SegmentNameSelection(StringSelection):
+class SegmentNameSelection(_ProtoStringSelection):
     """Select atoms based on 'segids' attribute"""
     token = 'segid'
     field = 'segids'
+    level = 'segindices'
 
 
 class AltlocSelection(StringSelection):
@@ -802,10 +819,15 @@ class ProteinSelection(Selection):
     See Also
     --------
     :func:`MDAnalysis.lib.util.convert_aa_code`
+
+
+    .. versionchanged:: 2.0.0
+       prot_res changed to set (from numpy array);
+       performance improved by ~100x on larger systems
     """
     token = 'protein'
 
-    prot_res = np.array([
+    prot_res = {
         # CHARMM top_all27_prot_lipid.rtf
         'ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY', 'HSD',
         'HSE', 'HSP', 'ILE', 'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER', 'THR',
@@ -828,14 +850,20 @@ class ProteinSelection(Selection):
         'CLEU', 'CILE', 'CVAL', 'CASF', 'CASN', 'CGLN', 'CARG', 'CHID', 'CHIE',
         'CHIP', 'CTRP', 'CPHE', 'CTYR', 'CGLU', 'CASP', 'CLYS', 'CPRO', 'CCYS',
         'CCYX', 'CMET', 'CME', 'ASF',
-    ])
+    }
 
     def __init__(self, parser, tokens):
         pass
 
     def apply(self, group):
-        mask = np.in1d(group.resnames, self.prot_res)
-        return group[mask].unique
+        resname_attr = group.universe._topology.resnames
+        # which values in resname attr are in prot_res?
+        matches = [ix for (nm, ix) in resname_attr.namedict.items()
+                   if nm in self.prot_res]
+        # index of each atom's resname
+        nmidx = resname_attr.nmidx[group.resindices]
+        # intersect atom's resname index and matches to prot_res
+        return group[np.in1d(nmidx, matches)].unique
 
 
 class NucleicSelection(Selection):
@@ -850,23 +878,32 @@ class NucleicSelection(Selection):
 
     .. versionchanged:: 0.8
        additional Gromacs selections
+    .. versionchanged:: 2.0.0
+       nucl_res changed to set (from numpy array);
+       performance improved by ~100x on larger systems
     """
     token = 'nucleic'
 
-    nucl_res = np.array([
+    nucl_res = {
         'ADE', 'URA', 'CYT', 'GUA', 'THY', 'DA', 'DC', 'DG', 'DT', 'RA',
         'RU', 'RG', 'RC', 'A', 'T', 'U', 'C', 'G',
         'DA5', 'DC5', 'DG5', 'DT5',
         'DA3', 'DC3', 'DG3', 'DT3',
         'RA5', 'RU5', 'RG5', 'RC5',
         'RA3', 'RU3', 'RG3', 'RC3'
-    ])
+    }
 
     def __init__(self, parser, tokens):
         pass
 
     def apply(self, group):
-        mask = np.in1d(group.resnames, self.nucl_res)
+        resnames = group.universe._topology.resnames
+        nmidx = resnames.nmidx[group.resindices]
+
+        matches = [ix for (nm, ix) in resnames.namedict.items()
+                   if nm in self.nucl_res]
+        mask = np.in1d(nmidx, matches)
+
         return group[mask].unique
 
 
@@ -875,29 +912,65 @@ class BackboneSelection(ProteinSelection):
 
     This excludes OT* on C-termini
     (which are included by, eg VMD's backbone selection).
+
+
+    .. versionchanged:: 2.0.0
+       bb_atoms changed to set (from numpy array);
+       performance improved by ~100x on larger systems
     """
     token = 'backbone'
-    bb_atoms = np.array(['N', 'CA', 'C', 'O'])
+    bb_atoms = {'N', 'CA', 'C', 'O'}
 
     def apply(self, group):
-        mask = np.in1d(group.names, self.bb_atoms)
-        mask &= np.in1d(group.resnames, self.prot_res)
-        return group[mask].unique
+        atomnames = group.universe._topology.names
+        resnames = group.universe._topology.resnames
+
+        # filter by atom names
+        name_matches = [ix for (nm, ix) in atomnames.namedict.items()
+                        if nm in self.bb_atoms]
+        nmidx = atomnames.nmidx[group.ix]
+        group = group[np.in1d(nmidx, name_matches)]
+
+        # filter by resnames
+        resname_matches = [ix for (nm, ix) in resnames.namedict.items()
+                           if nm in self.prot_res]
+        nmidx = resnames.nmidx[group.resindices]
+        group = group[np.in1d(nmidx, resname_matches)]
+
+        return group.unique
 
 
 class NucleicBackboneSelection(NucleicSelection):
     """Contains all atoms with name "P", "C5'", C3'", "O3'", "O5'".
 
     These atoms are only recognized if they are in a residue matched
     by the :class:`NucleicSelection`.
+
+
+    .. versionchanged:: 2.0.0
+       bb_atoms changed to set (from numpy array);
+       performance improved by ~100x on larger systems
     """
     token = 'nucleicbackbone'
-    bb_atoms = np.array(["P", "C5'", "C3'", "O3'", "O5'"])
+    bb_atoms = {"P", "C5'", "C3'", "O3'", "O5'"}
 
     def apply(self, group):
-        mask = np.in1d(group.names, self.bb_atoms)
-        mask &= np.in1d(group.resnames, self.nucl_res)
-        return group[mask].unique
+        atomnames = group.universe._topology.names
+        resnames = group.universe._topology.resnames
+
+        # filter by atom names
+        name_matches = [ix for (nm, ix) in atomnames.namedict.items()
+                        if nm in self.bb_atoms]
+        nmidx = atomnames.nmidx[group.ix]
+        group = group[np.in1d(nmidx, name_matches)]
+
+        # filter by resnames
+        resname_matches = [ix for (nm, ix) in resnames.namedict.items()
+                           if nm in self.nucl_res]
+        nmidx = resnames.nmidx[group.resindices]
+        group = group[np.in1d(nmidx, resname_matches)]
+
+        return group.unique
 
 
 class BaseSelection(NucleicSelection):
@@ -907,29 +980,65 @@ class BaseSelection(NucleicSelection):
 
      'N9', 'N7', 'C8', 'C5', 'C4', 'N3', 'C2', 'N1', 'C6',
      'O6','N2','N6', 'O2','N4','O4','C5M'
+
+
+    .. versionchanged:: 2.0.0
+       base_atoms changed to set (from numpy array);
+       performance improved by ~100x on larger systems
     """
     token = 'nucleicbase'
-    base_atoms = np.array([
+    base_atoms = {
         'N9', 'N7', 'C8', 'C5', 'C4', 'N3', 'C2', 'N1', 'C6',
         'O6', 'N2', 'N6',
-        'O2', 'N4', 'O4', 'C5M'])
+        'O2', 'N4', 'O4', 'C5M'}
 
     def apply(self, group):
-        mask = np.in1d(group.names, self.base_atoms)
-        mask &= np.in1d(group.resnames, self.nucl_res)
-        return group[mask].unique
+        atomnames = group.universe._topology.names
+        resnames = group.universe._topology.resnames
+
+        # filter by atom names
+        name_matches = [ix for (nm, ix) in atomnames.namedict.items()
+                        if nm in self.base_atoms]
+        nmidx = atomnames.nmidx[group.ix]
+        group = group[np.in1d(nmidx, name_matches)]
+
+        # filter by resnames
+        resname_matches = [ix for (nm, ix) in resnames.namedict.items()
+                           if nm in self.nucl_res]
+        nmidx = resnames.nmidx[group.resindices]
+        group = group[np.in1d(nmidx, resname_matches)]
+
+        return group.unique
 
 
 class NucleicSugarSelection(NucleicSelection):
     """Contains all atoms with name C1', C2', C3', C4', O2', O4', O3'.
+
+
+    .. versionchanged:: 2.0.0
+       sug_atoms changed to set (from numpy array);
+       performance improved by ~100x on larger systems
     """
     token = 'nucleicsugar'
-    sug_atoms = np.array(["C1'", "C2'", "C3'", "C4'", "O4'"])
+    sug_atoms = {"C1'", "C2'", "C3'", "C4'", "O4'"}
 
     def apply(self, group):
-        mask = np.in1d(group.names, self.sug_atoms)
-        mask &= np.in1d(group.resnames, self.nucl_res)
-        return group[mask].unique
+        atomnames = group.universe._topology.names
+        resnames = group.universe._topology.resnames
+
+        # filter by atom names
+        name_matches = [ix for (nm, ix) in atomnames.namedict.items()
+                        if nm in self.sug_atoms]
+        nmidx = atomnames.nmidx[group.ix]
+        group = group[np.in1d(nmidx, name_matches)]
+
+        # filter by resnames
+        resname_matches = [ix for (nm, ix) in resnames.namedict.items()
+                           if nm in self.nucl_res]
+        nmidx = resnames.nmidx[group.resindices]
+        group = group[np.in1d(nmidx, resname_matches)]
+
+        return group.unique
 
 
 class PropertySelection(Selection):