From 7af1162c7a9f2c83a5641accc3ccbe294e2f09e1 Mon Sep 17 00:00:00 2001
From: richard <richard@nextmovesoftware.com>
Date: Sun, 14 Jun 2020 12:49:25 +0100
Subject: [PATCH 01/14] modified AtomNames topologyattr to include lookup table
 index

---
 package/MDAnalysis/core/topologyattrs.py | 49 ++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/package/MDAnalysis/core/topologyattrs.py b/package/MDAnalysis/core/topologyattrs.py
index c600ada3eb1..4520ff1fafe 100644
--- a/package/MDAnalysis/core/topologyattrs.py
+++ b/package/MDAnalysis/core/topologyattrs.py
@@ -483,10 +483,59 @@ class Atomnames(AtomAttr):
     dtype = object
     transplants = defaultdict(list)
 
+    def __init__(self, vals, guessed=False):
+        self.namedict = dict()  # maps str to nmidx
+        name_lookup = []  # maps idx to str
+        # eg namedict['O'] = 5 & name_lookup[5] = 'O'
+
+        self.nmidx = np.zeros_like(vals, dtype=int)  # the lookup for each atom
+        # eg Atom 5 is 'C', so nmidx[5] = 7, where name_lookup[7] = 'C'
+
+        for i, val in enumerate(vals):
+            try:
+                self.nmidx[i] = self.namedict[val]
+            except KeyError:
+                nextidx = len(self.namedict)
+                self.namedict[val] = nextidx
+                name_lookup.append(val)
+
+                self.nmidx[i] = nextidx
+
+        self.name_lookup = np.array(name_lookup, dtype=object)
+        self.values = self.name_lookup[self.nmidx]
+
     @staticmethod
     def _gen_initial_values(na, nr, ns):
         return np.array(['' for _ in range(na)], dtype=object)
 
+    @_check_length
+    def set_atoms(self, ag, values):
+        newnames = []
+
+        # two possibilities, either single value given, or one per Atom
+        if isinstance(values, str):
+            try:
+                newidx = self.namedict[values]
+            except KeyError:
+                newidx = len(self.namedict)
+                self.namedict[values] = newidx
+                newnames.append(values)
+        else:
+            newidx = np.zeros_like(values, dtype=int)
+            for i, val in enumerate(values):
+                try:
+                    newidx[i] = self.namedict[val]
+                except KeyError:
+                    nextidx = len(self.namedict)
+                    self.namedict[val] = nextidx
+                    newnames.append(val)
+                    newidx[i] = nextidx
+
+        self.nmidx[ag.ix] = newidx  # newidx either single value or same size array
+        if newnames:
+            self.name_lookup = np.concatenate([self.name_lookup, newnames])
+        self.values = self.name_lookup[self.nmidx]
+
     def phi_selection(residue, c_name='C', n_name='N', ca_name='CA'):
         """Select AtomGroup corresponding to the phi protein backbone dihedral
         C'-N-CA-C.

From b45d16566575e35a25e6b912175a9840c60fd357 Mon Sep 17 00:00:00 2001
From: richard <richard@nextmovesoftware.com>
Date: Sun, 14 Jun 2020 13:18:55 +0100
Subject: [PATCH 02/14] cheeky little optimisation

---
 package/MDAnalysis/core/selection.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/package/MDAnalysis/core/selection.py b/package/MDAnalysis/core/selection.py
index bdb156ff249..03687298c89 100644
--- a/package/MDAnalysis/core/selection.py
+++ b/package/MDAnalysis/core/selection.py
@@ -530,9 +530,9 @@ def __init__(self, parser, tokens):
 
     @return_empty_on_apply
     def apply(self, group):
-        mask = np.zeros(len(group), dtype=bool)
+        mask = np.zeros(len(group), dtype=np.bool)
+        values = getattr(group, self.field)
         for val in self.values:
-            values = getattr(group, self.field)
             mask |= [fnmatch.fnmatch(x, val) for x in values]
         return group[mask].unique
 

From 76135eb320d16b18b3e75a7f0a11e2d5df9e0057 Mon Sep 17 00:00:00 2001
From: richard <richard@nextmovesoftware.com>
Date: Sun, 14 Jun 2020 14:11:37 +0100
Subject: [PATCH 03/14] rework atom name selection to use lookup tables

---
 package/MDAnalysis/core/selection.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/package/MDAnalysis/core/selection.py b/package/MDAnalysis/core/selection.py
index 03687298c89..543cab5f4d8 100644
--- a/package/MDAnalysis/core/selection.py
+++ b/package/MDAnalysis/core/selection.py
@@ -542,6 +542,21 @@ class AtomNameSelection(StringSelection):
     token = 'name'
     field = 'names'
 
+    def apply(self, group):
+        # rather than work on group.names, cheat and look at the lookup table
+        nmattr = group.universe._topology.names
+
+        matches = []  # list of passing indices
+        # iterate through set of known atom names, check which pass
+        for nm, ix in nmattr.namedict.items():
+            if any(fnmatch.fnmatch(nm, val) for val in self.values):
+                matches.append(ix)
+
+        # atomname indices for members of this group
+        nmidx = nmattr.nmidx[group.ix]
+
+        return group[np.in1d(nmidx, matches)].unique
+
 
 class AtomTypeSelection(StringSelection):
     """Select atoms based on 'types' attribute"""

From 17fe547f03987464ca9fdc7570a66726a35d2b1c Mon Sep 17 00:00:00 2001
From: Richard Gowers <richardjgowers@gmail.com>
Date: Mon, 15 Jun 2020 00:08:08 -0700
Subject: [PATCH 04/14] Update topologyattrs.py

---
 package/MDAnalysis/core/topologyattrs.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/package/MDAnalysis/core/topologyattrs.py b/package/MDAnalysis/core/topologyattrs.py
index 4520ff1fafe..caacf7657d9 100644
--- a/package/MDAnalysis/core/topologyattrs.py
+++ b/package/MDAnalysis/core/topologyattrs.py
@@ -484,6 +484,8 @@ class Atomnames(AtomAttr):
     transplants = defaultdict(list)
 
     def __init__(self, vals, guessed=False):
+        self._guessed = guessed
+      
         self.namedict = dict()  # maps str to nmidx
         name_lookup = []  # maps idx to str
         # eg namedict['O'] = 5 & name_lookup[5] = 'O'

From a4364e2f8cc96f449ba202396b701db151f15939 Mon Sep 17 00:00:00 2001
From: richard <richard@nextmovesoftware.com>
Date: Tue, 16 Jun 2020 11:12:22 +0100
Subject: [PATCH 05/14] fixed test supplying integer as atom name

really topologyattrs need to be statically typed and protective about this
---
 testsuite/MDAnalysisTests/core/test_topologyattrs.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/testsuite/MDAnalysisTests/core/test_topologyattrs.py b/testsuite/MDAnalysisTests/core/test_topologyattrs.py
index 6fa082b3b3f..b4c2dc14f23 100644
--- a/testsuite/MDAnalysisTests/core/test_topologyattrs.py
+++ b/testsuite/MDAnalysisTests/core/test_topologyattrs.py
@@ -93,6 +93,7 @@ class TestAtomAttr(TopologyAttrMixin):
 
     """
     values = np.array([7, 3, 69, 9993, 84, 194, 263, 501, 109, 5873])
+    single_value = 567
     attrclass = tpattrs.AtomAttr
 
     def test_set_atom_VE(self):
@@ -112,7 +113,7 @@ def test_get_atoms(self, attr):
     def test_set_atoms_singular(self, attr):
         # set len 2 Group to len 1 value
         dg = DummyGroup([3, 7])
-        attr.set_atoms(dg, 567)
+        attr.set_atoms(dg, self.single_value)
         assert_equal(attr.get_atoms(dg), np.array([567, 567]))
 
     def test_set_atoms_plural(self, attr):
@@ -175,6 +176,7 @@ def test_cant_set_segment_indices(self, u):
 class TestAtomnames(TestAtomAttr):
     values = np.array(['O', 'C', 'CA', 'N', 'CB', 'CG', 'CD', 'NA', 'CL', 'OW'],
                       dtype=np.object)
+    single_value = 'Ca2'
     attrclass = tpattrs.Atomnames
 
 

From b7b20b37baabdbe57fbe072eae055a6396cdee2e Mon Sep 17 00:00:00 2001
From: Richard Gowers <richardjgowers@gmail.com>
Date: Tue, 16 Jun 2020 14:19:06 +0100
Subject: [PATCH 06/14] Update test_topologyattrs.py

---
 testsuite/MDAnalysisTests/core/test_topologyattrs.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/testsuite/MDAnalysisTests/core/test_topologyattrs.py b/testsuite/MDAnalysisTests/core/test_topologyattrs.py
index b4c2dc14f23..e9e68d8717f 100644
--- a/testsuite/MDAnalysisTests/core/test_topologyattrs.py
+++ b/testsuite/MDAnalysisTests/core/test_topologyattrs.py
@@ -114,7 +114,8 @@ def test_set_atoms_singular(self, attr):
         # set len 2 Group to len 1 value
         dg = DummyGroup([3, 7])
         attr.set_atoms(dg, self.single_value)
-        assert_equal(attr.get_atoms(dg), np.array([567, 567]))
+        assert_equal(attr.get_atoms(dg),
+                     np.array([self.single_value, self.single_value]))
 
     def test_set_atoms_plural(self, attr):
         # set len 2 Group to len 2 values

From 5611f967c7aaedacd8729439d2b1da0af98f8fd5 Mon Sep 17 00:00:00 2001
From: richard <richard@nextmovesoftware.com>
Date: Sun, 21 Jun 2020 20:59:06 +0100
Subject: [PATCH 07/14] use dict-lookup string attrs EVERYWHERERE

---
 package/MDAnalysis/core/selection.py     |  57 +++++---
 package/MDAnalysis/core/topologyattrs.py | 166 ++++++++++++++++++-----
 2 files changed, 176 insertions(+), 47 deletions(-)

diff --git a/package/MDAnalysis/core/selection.py b/package/MDAnalysis/core/selection.py
index 543cab5f4d8..5316c75fbaf 100644
--- a/package/MDAnalysis/core/selection.py
+++ b/package/MDAnalysis/core/selection.py
@@ -529,22 +529,9 @@ def __init__(self, parser, tokens):
         self.values = vals
 
     @return_empty_on_apply
-    def apply(self, group):
-        mask = np.zeros(len(group), dtype=np.bool)
-        values = getattr(group, self.field)
-        for val in self.values:
-            mask |= [fnmatch.fnmatch(x, val) for x in values]
-        return group[mask].unique
-
-
-class AtomNameSelection(StringSelection):
-    """Select atoms based on 'names' attribute"""
-    token = 'name'
-    field = 'names'
-
     def apply(self, group):
         # rather than work on group.names, cheat and look at the lookup table
-        nmattr = group.universe._topology.names
+        nmattr = getattr(group.universe._topology, self.field)
 
         matches = []  # list of passing indices
         # iterate through set of known atom names, check which pass
@@ -558,6 +545,12 @@ def apply(self, group):
         return group[np.in1d(nmidx, matches)].unique
 
 
+class AtomNameSelection(StringSelection):
+    """Select atoms based on 'names' attribute"""
+    token = 'name'
+    field = 'names'
+
+
 class AtomTypeSelection(StringSelection):
     """Select atoms based on 'types' attribute"""
     token = 'type'
@@ -576,13 +569,30 @@ class AtomICodeSelection(StringSelection):
     field = 'icodes'
 
 
-class ResidueNameSelection(StringSelection):
+class _ResidueStringSelection(StringSelection):
+    def apply(self, group):
+        # rather than work on group.names, cheat and look at the lookup table
+        nmattr = getattr(group.universe._topology, self.field)
+
+        matches = []  # list of passing indices
+        # iterate through set of known atom names, check which pass
+        for nm, ix in nmattr.namedict.items():
+            if any(fnmatch.fnmatch(nm, val) for val in self.values):
+                matches.append(ix)
+
+        # atomname indices for members of this group
+        nmidx = nmattr.nmidx[group.resindices]
+
+        return group[np.in1d(nmidx, matches)].unique    
+
+
+class ResidueNameSelection(_ResidueStringSelection):
     """Select atoms based on 'resnames' attribute"""
     token = 'resname'
     field = 'resnames'
 
 
-class MoleculeTypeSelection(StringSelection):
+class MoleculeTypeSelection(_ResidueStringSelection):
     """Select atoms based on 'moltypes' attribute"""
     token = 'moltype'
     field = 'moltypes'
@@ -593,6 +603,21 @@ class SegmentNameSelection(StringSelection):
     token = 'segid'
     field = 'segids'
 
+    def apply(self, group):
+        # rather than work on group.names, cheat and look at the lookup table
+        nmattr = group.universe._topology.segids
+
+        matches = []  # list of passing indices
+        # iterate through set of known atom names, check which pass
+        for nm, ix in nmattr.namedict.items():
+            if any(fnmatch.fnmatch(nm, val) for val in self.values):
+                matches.append(ix)
+
+        # atomname indices for members of this group
+        nmidx = nmattr.nmidx[group.segindices]
+
+        return group[np.in1d(nmidx, matches)].unique    
+
 
 class AltlocSelection(StringSelection):
     """Select atoms based on 'altLoc' attribute"""
diff --git a/package/MDAnalysis/core/topologyattrs.py b/package/MDAnalysis/core/topologyattrs.py
index caacf7657d9..a591f09cb3a 100644
--- a/package/MDAnalysis/core/topologyattrs.py
+++ b/package/MDAnalysis/core/topologyattrs.py
@@ -473,16 +473,7 @@ def _gen_initial_values(na, nr, ns):
         return np.arange(1, na + 1)
 
 
-# TODO: update docs to property doc
-class Atomnames(AtomAttr):
-    """Name for each atom.
-    """
-    attrname = 'names'
-    singular = 'name'
-    per_object = 'atom'
-    dtype = object
-    transplants = defaultdict(list)
-
+class _AtomStringAttr(AtomAttr):
     def __init__(self, vals, guessed=False):
         self._guessed = guessed
       
@@ -538,6 +529,17 @@ def set_atoms(self, ag, values):
             self.name_lookup = np.concatenate([self.name_lookup, newnames])
         self.values = self.name_lookup[self.nmidx]
 
+
+# TODO: update docs to property doc
+class Atomnames(_AtomStringAttr):
+    """Name for each atom.
+    """
+    attrname = 'names'
+    singular = 'name'
+    per_object = 'atom'
+    dtype = object
+    transplants = defaultdict(list)
+
     def phi_selection(residue, c_name='C', n_name='N', ca_name='CA'):
         """Select AtomGroup corresponding to the phi protein backbone dihedral
         C'-N-CA-C.
@@ -1009,20 +1011,16 @@ def chi1_selections(residues, n_name='N', ca_name='CA', cb_name='CB',
 
 
 # TODO: update docs to property doc
-class Atomtypes(AtomAttr):
+class Atomtypes(_AtomStringAttr):
     """Type for each atom"""
     attrname = 'types'
     singular = 'type'
     per_object = 'atom'
     dtype = object
 
-    @staticmethod
-    def _gen_initial_values(na, nr, ns):
-        return np.array(['' for _ in range(na)], dtype=object)
-
 
 # TODO: update docs to property doc
-class Elements(AtomAttr):
+class Elements(_AtomStringAttr):
     """Element for each atom"""
     attrname = 'elements'
     singular = 'element'
@@ -1046,7 +1044,7 @@ def _gen_initial_values(na, nr, ns):
         return np.zeros(na)
 
 
-class RecordTypes(AtomAttr):
+class RecordTypes(_AtomStringAttr):
     """For PDB-like formats, indicates if ATOM or HETATM
 
     Defaults to 'ATOM'
@@ -1064,7 +1062,7 @@ def _gen_initial_values(na, nr, ns):
         return np.array(['ATOM'] * na, dtype=object)
 
 
-class ChainIDs(AtomAttr):
+class ChainIDs(_AtomStringAttr):
     """ChainID per atom
 
     Note
@@ -1076,10 +1074,6 @@ class ChainIDs(AtomAttr):
     per_object = 'atom'
     dtype = object
 
-    @staticmethod
-    def _gen_initial_values(na, nr, ns):
-        return np.array(['' for _ in range(na)], dtype=object)
-
 
 class Tempfactors(AtomAttr):
     """Tempfactor for atoms"""
@@ -1625,7 +1619,7 @@ def _gen_initial_values(na, nr, ns):
 
 
 # TODO: update docs to property doc
-class AltLocs(AtomAttr):
+class AltLocs(_AtomStringAttr):
     """AltLocs for each atom"""
     attrname = 'altLocs'
     singular = 'altLoc'
@@ -1778,8 +1772,65 @@ def _gen_initial_values(na, nr, ns):
         return np.arange(1, nr + 1)
 
 
+class _ResidueStringAttr(ResidueAttr):
+    def __init__(self, vals, guessed=False):
+        self._guessed = guessed
+      
+        self.namedict = dict()  # maps str to nmidx
+        name_lookup = []  # maps idx to str
+        # eg namedict['O'] = 5 & name_lookup[5] = 'O'
+
+        self.nmidx = np.zeros_like(vals, dtype=int)  # the lookup for each atom
+        # eg Atom 5 is 'C', so nmidx[5] = 7, where name_lookup[7] = 'C'
+
+        for i, val in enumerate(vals):
+            try:
+                self.nmidx[i] = self.namedict[val]
+            except KeyError:
+                nextidx = len(self.namedict)
+                self.namedict[val] = nextidx
+                name_lookup.append(val)
+
+                self.nmidx[i] = nextidx
+
+        self.name_lookup = np.array(name_lookup, dtype=object)
+        self.values = self.name_lookup[self.nmidx]    
+
+    @staticmethod
+    def _gen_initial_values(na, nr, ns):
+        return np.array(['' for _ in range(nr)], dtype=object)
+
+    @_check_length
+    def set_residues(self, rg, values):
+        newnames = []
+
+        # two possibilities, either single value given, or one per Atom
+        if isinstance(values, str):
+            try:
+                newidx = self.namedict[values]
+            except KeyError:
+                newidx = len(self.namedict)
+                self.namedict[values] = newidx
+                newnames.append(values)
+        else:
+            newidx = np.zeros_like(values, dtype=int)
+            for i, val in enumerate(values):
+                try:
+                    newidx[i] = self.namedict[val]
+                except KeyError:
+                    nextidx = len(self.namedict)
+                    self.namedict[val] = nextidx
+                    newnames.append(val)
+                    newidx[i] = nextidx
+
+        self.nmidx[rg.ix] = newidx  # newidx either single value or same size array
+        if newnames:
+            self.name_lookup = np.concatenate([self.name_lookup, newnames])
+        self.values = self.name_lookup[self.nmidx]    
+
+
 # TODO: update docs to property doc
-class Resnames(ResidueAttr):
+class Resnames(_ResidueStringAttr):
     attrname = 'resnames'
     singular = 'resname'
     transplants = defaultdict(list)
@@ -1898,18 +1949,14 @@ def _gen_initial_values(na, nr, ns):
         return np.arange(1, nr + 1)
 
 
-class ICodes(ResidueAttr):
+class ICodes(_ResidueStringAttr):
     """Insertion code for Atoms"""
     attrname = 'icodes'
     singular = 'icode'
     dtype = object
 
-    @staticmethod
-    def _gen_initial_values(na, nr, ns):
-        return np.array(['' for _ in range(nr)], dtype=object)
-
 
-class Moltypes(ResidueAttr):
+class Moltypes(_ResidueStringAttr):
     """Name of the molecule type
 
     Two molecules that share a molecule type share a common template topology.
@@ -1961,8 +2008,65 @@ def set_segments(self, sg, values):
         self.values[sg.ix] = values
 
 
+class _SegmentStringAttr(SegmentAttr):
+    def __init__(self, vals, guessed=False):
+        self._guessed = guessed
+      
+        self.namedict = dict()  # maps str to nmidx
+        name_lookup = []  # maps idx to str
+        # eg namedict['O'] = 5 & name_lookup[5] = 'O'
+
+        self.nmidx = np.zeros_like(vals, dtype=int)  # the lookup for each atom
+        # eg Atom 5 is 'C', so nmidx[5] = 7, where name_lookup[7] = 'C'
+
+        for i, val in enumerate(vals):
+            try:
+                self.nmidx[i] = self.namedict[val]
+            except KeyError:
+                nextidx = len(self.namedict)
+                self.namedict[val] = nextidx
+                name_lookup.append(val)
+
+                self.nmidx[i] = nextidx
+
+        self.name_lookup = np.array(name_lookup, dtype=object)
+        self.values = self.name_lookup[self.nmidx]    
+
+    @staticmethod
+    def _gen_initial_values(na, nr, ns):
+        return np.array(['' for _ in range(nr)], dtype=object)
+
+    @_check_length
+    def set_segments(self, sg, values):
+        newnames = []
+
+        # two possibilities, either single value given, or one per Atom
+        if isinstance(values, str):
+            try:
+                newidx = self.namedict[values]
+            except KeyError:
+                newidx = len(self.namedict)
+                self.namedict[values] = newidx
+                newnames.append(values)
+        else:
+            newidx = np.zeros_like(values, dtype=int)
+            for i, val in enumerate(values):
+                try:
+                    newidx[i] = self.namedict[val]
+                except KeyError:
+                    nextidx = len(self.namedict)
+                    self.namedict[val] = nextidx
+                    newnames.append(val)
+                    newidx[i] = nextidx
+
+        self.nmidx[sg.ix] = newidx  # newidx either single value or same size array
+        if newnames:
+            self.name_lookup = np.concatenate([self.name_lookup, newnames])
+        self.values = self.name_lookup[self.nmidx]    
+
+        
 # TODO: update docs to property doc
-class Segids(SegmentAttr):
+class Segids(_SegmentStringAttr):
     attrname = 'segids'
     singular = 'segid'
     transplants = defaultdict(list)

From 6a6636131eddba234f749b634e065889c978c611 Mon Sep 17 00:00:00 2001
From: richard <richard@nextmovesoftware.com>
Date: Sun, 5 Jul 2020 11:06:06 +0100
Subject: [PATCH 08/14] removed some code duplication

made protein selection faster, 48ms -> 0.5ms on GRO testfile
---
 package/CHANGELOG                    |  2 +
 package/MDAnalysis/core/selection.py | 57 ++++++++++------------------
 2 files changed, 22 insertions(+), 37 deletions(-)

diff --git a/package/CHANGELOG b/package/CHANGELOG
index e5e506d82a1..6325737a0b0 100644
--- a/package/CHANGELOG
+++ b/package/CHANGELOG
@@ -58,6 +58,8 @@ Enhancements
   * Added Hydrogen Bond Lifetime keyword "between" (PR #2791)
   * Dead code removed from the TPR parser and increased test coverage (PR #2840)
   * TPR parser exposes the elements topology attribute (PR #2858, see Issue #2553)
+  * Improved performance of select_atoms on strings (e.g. name, type, resname) and
+    'protein' selection (#2751 PR #2755)
 
 Changes
   * deprecated NumPy type aliases have been replaced with their actual types
diff --git a/package/MDAnalysis/core/selection.py b/package/MDAnalysis/core/selection.py
index 5316c75fbaf..3282b4425b2 100644
--- a/package/MDAnalysis/core/selection.py
+++ b/package/MDAnalysis/core/selection.py
@@ -515,7 +515,7 @@ def apply(self, group):
         return group[mask]
 
 
-class StringSelection(Selection):
+class _ProtoStringSelection(Selection):
     """Selections based on text attributes
 
     .. versionchanged:: 1.0.0
@@ -540,11 +540,15 @@ def apply(self, group):
                 matches.append(ix)
 
         # atomname indices for members of this group
-        nmidx = nmattr.nmidx[group.ix]
+        nmidx = nmattr.nmidx[getattr(group, self.level)]
 
         return group[np.in1d(nmidx, matches)].unique
 
 
+class StringSelection(_ProtoStringSelection):
+    level = 'ix'  # operates on atom level attribute, i.e. '.ix'
+
+
 class AtomNameSelection(StringSelection):
     """Select atoms based on 'names' attribute"""
     token = 'name'
@@ -569,21 +573,8 @@ class AtomICodeSelection(StringSelection):
     field = 'icodes'
 
 
-class _ResidueStringSelection(StringSelection):
-    def apply(self, group):
-        # rather than work on group.names, cheat and look at the lookup table
-        nmattr = getattr(group.universe._topology, self.field)
-
-        matches = []  # list of passing indices
-        # iterate through set of known atom names, check which pass
-        for nm, ix in nmattr.namedict.items():
-            if any(fnmatch.fnmatch(nm, val) for val in self.values):
-                matches.append(ix)
-
-        # atomname indices for members of this group
-        nmidx = nmattr.nmidx[group.resindices]
-
-        return group[np.in1d(nmidx, matches)].unique    
+class _ResidueStringSelection(_ProtoStringSelection):
+    level= 'resindices'
 
 
 class ResidueNameSelection(_ResidueStringSelection):
@@ -598,25 +589,11 @@ class MoleculeTypeSelection(_ResidueStringSelection):
     field = 'moltypes'
 
 
-class SegmentNameSelection(StringSelection):
+class SegmentNameSelection(_ProtoStringSelection):
     """Select atoms based on 'segids' attribute"""
     token = 'segid'
     field = 'segids'
-
-    def apply(self, group):
-        # rather than work on group.names, cheat and look at the lookup table
-        nmattr = group.universe._topology.segids
-
-        matches = []  # list of passing indices
-        # iterate through set of known atom names, check which pass
-        for nm, ix in nmattr.namedict.items():
-            if any(fnmatch.fnmatch(nm, val) for val in self.values):
-                matches.append(ix)
-
-        # atomname indices for members of this group
-        nmidx = nmattr.nmidx[group.segindices]
-
-        return group[np.in1d(nmidx, matches)].unique    
+    level = 'segindices'
 
 
 class AltlocSelection(StringSelection):
@@ -845,7 +822,7 @@ class ProteinSelection(Selection):
     """
     token = 'protein'
 
-    prot_res = np.array([
+    prot_res = {
         # CHARMM top_all27_prot_lipid.rtf
         'ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY', 'HSD',
         'HSE', 'HSP', 'ILE', 'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER', 'THR',
@@ -868,14 +845,20 @@ class ProteinSelection(Selection):
         'CLEU', 'CILE', 'CVAL', 'CASF', 'CASN', 'CGLN', 'CARG', 'CHID', 'CHIE',
         'CHIP', 'CTRP', 'CPHE', 'CTYR', 'CGLU', 'CASP', 'CLYS', 'CPRO', 'CCYS',
         'CCYX', 'CMET', 'CME', 'ASF',
-    ])
+    }
 
     def __init__(self, parser, tokens):
         pass
 
     def apply(self, group):
-        mask = np.in1d(group.resnames, self.prot_res)
-        return group[mask].unique
+        resname_attr = group.universe._topology.resnames
+        # which values in resname attr are in prot_res?
+        matches = [ix for (nm, ix) in resname_attr.namedict.items()
+                   if nm in self.prot_res]
+        # index of each atom's resname
+        nmidx = resname_attr.nmidx[group.resindices]
+        # intersect atom's resname index and matches to prot_res
+        return group[np.in1d(nmidx, matches)].unique
 
 
 class NucleicSelection(Selection):

From 3c4dc3265310a312930f79d9b8025a4598cd4f61 Mon Sep 17 00:00:00 2001
From: richard <richard@nextmovesoftware.com>
Date: Sat, 11 Jul 2020 16:38:49 +0100
Subject: [PATCH 09/14] improved nucleic/backbone selections

---
 package/CHANGELOG                             |   1 +
 package/MDAnalysis/core/selection.py          | 119 +++++++++++++++---
 .../core/test_atomselections.py               |   2 +-
 3 files changed, 102 insertions(+), 20 deletions(-)

diff --git a/package/CHANGELOG b/package/CHANGELOG
index 6325737a0b0..80e8161344a 100644
--- a/package/CHANGELOG
+++ b/package/CHANGELOG
@@ -39,6 +39,7 @@ Fixes
   * In hydrogenbonds.hbond_analysis.HydrogenbondAnalysis an AttributeError 
     was thrown when finding D-H pairs via the topology if `hydrogens` was an 
     empty AtomGroup (Issue #2848)
+  * Fixed performance regression on select_atoms for string selections (#2751)
 
 Enhancements
   * Refactored analysis.helanal into analysis.helix_analysis
diff --git a/package/MDAnalysis/core/selection.py b/package/MDAnalysis/core/selection.py
index 3282b4425b2..82f28e89980 100644
--- a/package/MDAnalysis/core/selection.py
+++ b/package/MDAnalysis/core/selection.py
@@ -819,6 +819,10 @@ class ProteinSelection(Selection):
     See Also
     --------
     :func:`MDAnalysis.lib.util.convert_aa_code`
+
+    .. versionchanged:: 2.0.0
+       prot_res changed to set (from numpy array)
+       performance improved by ~100x on larger systems
     """
     token = 'protein'
 
@@ -873,23 +877,32 @@ class NucleicSelection(Selection):
 
     .. versionchanged:: 0.8
        additional Gromacs selections
+    .. versionchanged:: 2.0.0
+       nucl_res changed to set (from numpy array)
+       performance improved by ~100x on larger systems
     """
     token = 'nucleic'
 
-    nucl_res = np.array([
+    nucl_res = {
         'ADE', 'URA', 'CYT', 'GUA', 'THY', 'DA', 'DC', 'DG', 'DT', 'RA',
         'RU', 'RG', 'RC', 'A', 'T', 'U', 'C', 'G',
         'DA5', 'DC5', 'DG5', 'DT5',
         'DA3', 'DC3', 'DG3', 'DT3',
         'RA5', 'RU5', 'RG5', 'RC5',
         'RA3', 'RU3', 'RG3', 'RC3'
-    ])
+    }
 
     def __init__(self, parser, tokens):
         pass
 
     def apply(self, group):
-        mask = np.in1d(group.resnames, self.nucl_res)
+        resnames = group.universe._topology.resnames
+        nmidx = resnames.nmidx[group.resindices]
+
+        matches = [ix for (nm, ix) in resnames.namedict.items()
+                   if nm in self.nucl_res]
+        mask = np.in1d(nmidx, matches)
+
         return group[mask].unique
 
 
@@ -898,14 +911,31 @@ class BackboneSelection(ProteinSelection):
 
     This excludes OT* on C-termini
     (which are included by, eg VMD's backbone selection).
+
+    .. versionchanged:: 2.0.0
+       bb_atoms changed to set (from numpy array)
+       performance improved by ~100x on larger systems
     """
     token = 'backbone'
-    bb_atoms = np.array(['N', 'CA', 'C', 'O'])
+    bb_atoms = {'N', 'CA', 'C', 'O'}
 
     def apply(self, group):
-        mask = np.in1d(group.names, self.bb_atoms)
-        mask &= np.in1d(group.resnames, self.prot_res)
-        return group[mask].unique
+        atomnames = group.universe._topology.names
+        resnames = group.universe._topology.resnames
+
+        # filter by atom names
+        name_matches = [ix for (nm, ix) in atomnames.namedict.items()
+                        if nm in self.bb_atoms]
+        nmidx = atomnames.nmidx[group.ix]
+        group = group[np.in1d(nmidx, name_matches)]
+
+        # filter by resnames
+        resname_matches = [ix for (nm, ix) in resnames.namedict.items()
+                           if nm in self.prot_res]
+        nmidx = resnames.nmidx[group.resindices]
+        group = group[np.in1d(nmidx, resname_matches)]
+
+        return group.unique
 
 
 class NucleicBackboneSelection(NucleicSelection):
@@ -913,14 +943,31 @@ class NucleicBackboneSelection(NucleicSelection):
 
     These atoms are only recognized if they are in a residue matched
     by the :class:`NucleicSelection`.
+
+    .. versionchanged:: 2.0.0
+       bb_atoms changed to set (from numpy array)
+       performance improved by ~100x on larger systems
     """
     token = 'nucleicbackbone'
-    bb_atoms = np.array(["P", "C5'", "C3'", "O3'", "O5'"])
+    bb_atoms = {"P", "C5'", "C3'", "O3'", "O5'"}
 
     def apply(self, group):
-        mask = np.in1d(group.names, self.bb_atoms)
-        mask &= np.in1d(group.resnames, self.nucl_res)
-        return group[mask].unique
+        atomnames = group.universe._topology.names
+        resnames = group.universe._topology.resnames
+
+        # filter by atom names
+        name_matches = [ix for (nm, ix) in atomnames.namedict.items()
+                        if nm in self.bb_atoms]
+        nmidx = atomnames.nmidx[group.ix]
+        group = group[np.in1d(nmidx, name_matches)]
+
+        # filter by resnames
+        resname_matches = [ix for (nm, ix) in resnames.namedict.items()
+                           if nm in self.nucl_res]
+        nmidx = resnames.nmidx[group.resindices]
+        group = group[np.in1d(nmidx, resname_matches)]
+
+        return group.unique
 
 
 class BaseSelection(NucleicSelection):
@@ -930,29 +977,63 @@ class BaseSelection(NucleicSelection):
 
      'N9', 'N7', 'C8', 'C5', 'C4', 'N3', 'C2', 'N1', 'C6',
      'O6','N2','N6', 'O2','N4','O4','C5M'
+
+    .. versionchanged:: 2.0.0
+       base_atoms changed to set (from numpy array)
+       performance improved by ~100x on larger systems
     """
     token = 'nucleicbase'
-    base_atoms = np.array([
+    base_atoms = {
         'N9', 'N7', 'C8', 'C5', 'C4', 'N3', 'C2', 'N1', 'C6',
         'O6', 'N2', 'N6',
-        'O2', 'N4', 'O4', 'C5M'])
+        'O2', 'N4', 'O4', 'C5M'}
 
     def apply(self, group):
-        mask = np.in1d(group.names, self.base_atoms)
-        mask &= np.in1d(group.resnames, self.nucl_res)
-        return group[mask].unique
+        atomnames = group.universe._topology.names
+        resnames = group.universe._topology.resnames
+
+        # filter by atom names
+        name_matches = [ix for (nm, ix) in atomnames.namedict.items()
+                        if nm in self.base_atoms]
+        nmidx = atomnames.nmidx[group.ix]
+        group = group[np.in1d(nmidx, name_matches)]
+
+        # filter by resnames
+        resname_matches = [ix for (nm, ix) in resnames.namedict.items()
+                           if nm in self.nucl_res]
+        nmidx = resnames.nmidx[group.resindices]
+        group = group[np.in1d(nmidx, resname_matches)]
+
+        return group.unique
 
 
 class NucleicSugarSelection(NucleicSelection):
     """Contains all atoms with name C1', C2', C3', C4', O2', O4', O3'.
+
+    .. versionchanged:: 2.0.0
+       sug_atoms changed to set (from numpy array)
+       performance improved by ~100x on larger systems
     """
     token = 'nucleicsugar'
     sug_atoms = np.array(["C1'", "C2'", "C3'", "C4'", "O4'"])
 
     def apply(self, group):
-        mask = np.in1d(group.names, self.sug_atoms)
-        mask &= np.in1d(group.resnames, self.nucl_res)
-        return group[mask].unique
+        atomnames = group.universe._topology.names
+        resnames = group.universe._topology.resnames
+
+        # filter by atom names
+        name_matches = [ix for (nm, ix) in atomnames.namedict.items()
+                        if nm in self.sug_atoms]
+        nmidx = atomnames.nmidx[group.ix]
+        group = group[np.in1d(nmidx, name_matches)]
+
+        # filter by resnames
+        resname_matches = [ix for (nm, ix) in resnames.namedict.items()
+                           if nm in self.nucl_res]
+        nmidx = resnames.nmidx[group.resindices]
+        group = group[np.in1d(nmidx, resname_matches)]
+
+        return group.unique
 
 
 class PropertySelection(Selection):
diff --git a/testsuite/MDAnalysisTests/core/test_atomselections.py b/testsuite/MDAnalysisTests/core/test_atomselections.py
index b7d0f515b7f..4042bb84820 100644
--- a/testsuite/MDAnalysisTests/core/test_atomselections.py
+++ b/testsuite/MDAnalysisTests/core/test_atomselections.py
@@ -74,7 +74,7 @@ def test_protein(self, universe):
                      sorted(universe.select_atoms('segid 4AKE').indices),
                      "selected protein is not the same as auto-generated protein segment s4AKE")
 
-    @pytest.mark.parametrize('resname', MDAnalysis.core.selection.ProteinSelection.prot_res)
+    @pytest.mark.parametrize('resname', sorted(MDAnalysis.core.selection.ProteinSelection.prot_res))
     def test_protein_resnames(self, resname):
         u = make_Universe(('resnames',))
         # set half the residues' names to the resname we're testing

From 07757a50abee3d94ebb73932d43547c0a28de10c Mon Sep 17 00:00:00 2001
From: richard <richard@nextmovesoftware.com>
Date: Sun, 26 Jul 2020 09:07:32 +0100
Subject: [PATCH 10/14] Added explicit tests for Resnames topologyattr

tests now provide str types for resnames/icodes
---
 .../core/test_topologyattrs.py                | 30 ++++++++++++-------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/testsuite/MDAnalysisTests/core/test_topologyattrs.py b/testsuite/MDAnalysisTests/core/test_topologyattrs.py
index e9e68d8717f..270491514af 100644
--- a/testsuite/MDAnalysisTests/core/test_topologyattrs.py
+++ b/testsuite/MDAnalysisTests/core/test_topologyattrs.py
@@ -209,18 +209,19 @@ class TestResidueAttr(TopologyAttrMixin):
     """Test residue-level TopologyAttrs.
 
     """
+    single_value = 2
     values = np.array([15.2, 395.6, 0.1, 9.8])
     attrclass = tpattrs.ResidueAttr
 
-    def test_set_residue_VE(self):
-        u = make_Universe(('resnames',))
-        res = u.residues[0]
+    def test_set_residue_VE(self, universe):
+        # setting e.g. resname to 2 values should fail with VE
+        res = universe.residues[0]
         with pytest.raises(ValueError):
-            setattr(res, 'resname', ['wrong', 'length'])
+            setattr(res, self.attrclass.singular, self.values[:2])
 
     def test_get_atoms(self, attr):
         assert_equal(attr.get_atoms(DummyGroup([7, 3, 9])),
-                           self.values[[3, 2, 2]])
+                     self.values[[3, 2, 2]])
 
     def test_get_atom(self, universe):
         attr = getattr(universe.atoms[0], self.attrclass.singular)
@@ -228,14 +229,14 @@ def test_get_atom(self, universe):
 
     def test_get_residues(self, attr):
         assert_equal(attr.get_residues(DummyGroup([1, 2, 1, 3])),
-                           self.values[[1, 2, 1, 3]])
+                     self.values[[1, 2, 1, 3]])
 
     def test_set_residues_singular(self, attr):
         dg = DummyGroup([3, 0, 1])
-        attr.set_residues(dg, 2)
+        attr.set_residues(dg, self.single_value)
 
-        assert_almost_equal(attr.get_residues(dg),
-                                  np.array([2, 2, 2]))
+        assert_equal(attr.get_residues(dg),
+                     np.array([self.single_value]*3, dtype=self.values.dtype))
 
     def test_set_residues_plural(self, attr):
         attr.set_residues(DummyGroup([3, 0, 1]),
@@ -257,10 +258,17 @@ def test_get_segments(self, attr):
         assert_equal(attr.get_segments(DummyGroup([0, 1, 1])),
                            [self.values[[0, 3]], self.values[[1, 2]], self.values[[1, 2]]])
 
-class TestICodes(TestResidueAttr):
-    values = np.array(['a', 'b', '', 'd'])
+
+class TestResnames(TestResidueAttr):
+    attrclass = tpattrs.Resnames
+    single_value = 'xyz'
+    values = np.array(['a', 'b', '', 'd'], dtype=object)
+
+
+class TestICodes(TestResnames):
     attrclass = tpattrs.ICodes
 
+
 class TestResids(TestResidueAttr):
     values = np.array([10, 11, 18, 20])
     attrclass = tpattrs.Resids

From 95f4d15a4a8851f48f5b794a5f5e239e183a28b0 Mon Sep 17 00:00:00 2001
From: richard <richard@nextmovesoftware.com>
Date: Sun, 26 Jul 2020 09:09:16 +0100
Subject: [PATCH 11/14] use fnmatchcase to be case sensitive

---
 package/MDAnalysis/core/selection.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/package/MDAnalysis/core/selection.py b/package/MDAnalysis/core/selection.py
index 82f28e89980..fa38ff474b3 100644
--- a/package/MDAnalysis/core/selection.py
+++ b/package/MDAnalysis/core/selection.py
@@ -536,7 +536,7 @@ def apply(self, group):
         matches = []  # list of passing indices
         # iterate through set of known atom names, check which pass
         for nm, ix in nmattr.namedict.items():
-            if any(fnmatch.fnmatch(nm, val) for val in self.values):
+            if any(fnmatch.fnmatchcase(nm, val) for val in self.values):
                 matches.append(ix)
 
         # atomname indices for members of this group

From 5ae8edd4c0b0a76bddeb7f5aa28ecaa8ad6209d8 Mon Sep 17 00:00:00 2001
From: Oliver Beckstein <orbeckst@gmail.com>
Date: Sat, 15 Aug 2020 23:29:53 -0700
Subject: [PATCH 12/14] Update package/MDAnalysis/core/selection.py

@jbarnoud's fix
---
 package/MDAnalysis/core/selection.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/package/MDAnalysis/core/selection.py b/package/MDAnalysis/core/selection.py
index fa38ff474b3..47e51d1a858 100644
--- a/package/MDAnalysis/core/selection.py
+++ b/package/MDAnalysis/core/selection.py
@@ -1015,7 +1015,7 @@ class NucleicSugarSelection(NucleicSelection):
        performance improved by ~100x on larger systems
     """
     token = 'nucleicsugar'
-    sug_atoms = np.array(["C1'", "C2'", "C3'", "C4'", "O4'"])
+    sug_atoms = {"C1'", "C2'", "C3'", "C4'", "O4'"}
 
     def apply(self, group):
         atomnames = group.universe._topology.names

From 640c411285dbfdae44161739ed1ff842a8e06560 Mon Sep 17 00:00:00 2001
From: Oliver Beckstein <orbeckst@gmail.com>
Date: Tue, 18 Aug 2020 23:37:57 -0700
Subject: [PATCH 13/14] apply suggestions from code review

Co-authored-by: Irfan Alibay <IAlibay@users.noreply.github.com>
---
 package/MDAnalysis/core/selection.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/package/MDAnalysis/core/selection.py b/package/MDAnalysis/core/selection.py
index 47e51d1a858..dc8747832ea 100644
--- a/package/MDAnalysis/core/selection.py
+++ b/package/MDAnalysis/core/selection.py
@@ -820,6 +820,7 @@ class ProteinSelection(Selection):
     --------
     :func:`MDAnalysis.lib.util.convert_aa_code`
 
+
     .. versionchanged:: 2.0.0
        prot_res changed to set (from numpy array)
        performance improved by ~100x on larger systems
@@ -912,6 +913,7 @@ class BackboneSelection(ProteinSelection):
     This excludes OT* on C-termini
     (which are included by, eg VMD's backbone selection).
 
+
     .. versionchanged:: 2.0.0
        bb_atoms changed to set (from numpy array)
        performance improved by ~100x on larger systems
@@ -944,6 +946,7 @@ class NucleicBackboneSelection(NucleicSelection):
     These atoms are only recognized if they are in a residue matched
     by the :class:`NucleicSelection`.
 
+
     .. versionchanged:: 2.0.0
        bb_atoms changed to set (from numpy array)
        performance improved by ~100x on larger systems
@@ -978,6 +981,7 @@ class BaseSelection(NucleicSelection):
      'N9', 'N7', 'C8', 'C5', 'C4', 'N3', 'C2', 'N1', 'C6',
      'O6','N2','N6', 'O2','N4','O4','C5M'
 
+
     .. versionchanged:: 2.0.0
        base_atoms changed to set (from numpy array)
        performance improved by ~100x on larger systems
@@ -1010,6 +1014,7 @@ def apply(self, group):
 class NucleicSugarSelection(NucleicSelection):
     """Contains all atoms with name C1', C2', C3', C4', O2', O4', O3'.
 
+
     .. versionchanged:: 2.0.0
        sug_atoms changed to set (from numpy array)
        performance improved by ~100x on larger systems

From ee64c32bc3db2575301166e688b9de038e0197e4 Mon Sep 17 00:00:00 2001
From: richardjgowers <richardjgowers@gmail.com>
Date: Tue, 25 Aug 2020 11:30:09 +0100
Subject: [PATCH 14/14] added test for setting multiple segids at once

---
 .../MDAnalysisTests/core/test_segmentgroup.py  | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/testsuite/MDAnalysisTests/core/test_segmentgroup.py b/testsuite/MDAnalysisTests/core/test_segmentgroup.py
index 3f0c251e543..546c5ad44cf 100644
--- a/testsuite/MDAnalysisTests/core/test_segmentgroup.py
+++ b/testsuite/MDAnalysisTests/core/test_segmentgroup.py
@@ -88,6 +88,24 @@ def test_set_segid_updates_(universe):
                  err_msg="old selection was not changed in place after set_segid")
 
 
+def test_set_segids_many():
+    u = mda.Universe.empty(n_atoms=6, n_residues=2, n_segments=2,
+                           atom_resindex=[0, 0, 0, 1, 1, 1], residue_segindex=[0,1])
+    u.add_TopologyAttr('segids', ['A', 'B'])
+
+    # universe with 2 segments, A and B
+
+    u.segments.segids = ['X', 'Y']
+
+    assert u.segments[0].segid == 'X'
+    assert u.segments[1].segid == 'Y'
+
+    assert len(u.select_atoms('segid A')) == 0
+    assert len(u.select_atoms('segid B')) == 0
+    assert len(u.select_atoms('segid X')) == 3
+    assert len(u.select_atoms('segid Y')) == 3
+
+
 def test_atom_order(universe):
     assert_equal(universe.segments.atoms.indices,
                  sorted(universe.segments.atoms.indices))