MDAnalysis · lilyminium · Apr 8, 2021 · Mar 14, 2021 · Mar 14, 2021 · Mar 14, 2021
diff --git a/package/AUTHORS b/package/AUTHORS
@@ -162,7 +162,8 @@ Chronological list of authors
   - Hannah Pollak
   - Estefania Barreto-Ojeda
   - Paarth Thadani
-
+  - Henry Kobin
+
 External code
 -------------
 

diff --git a/package/CHANGELOG b/package/CHANGELOG
@@ -17,7 +17,7 @@ The rules for this file:
          lilyminium, daveminh, jbarnoud, yuxuanzhuang, VOD555, ianmkenney,
          calcraven，xiki-tempula, mieczyslaw, manuel.nuno.melo, PicoCentauri,
          hanatok, rmeli, aditya-kamath, tirkarthi, LeonardoBarneschi, hejamu,
-         biogen98, orioncohen, z3y50n, hp115, ojeda-e, thadanipaarth
+         biogen98, orioncohen, z3y50n, hp115, ojeda-e, thadanipaarth, HenryKobin
 
   * 2.0.0
 
@@ -30,6 +30,11 @@ Fixes
    zone/layer is empty, consistent with 'around' (Issue #2915)
   * A Universe created from an ROMol with no atoms returns now a Universe 
     with 0 atoms (Issue #3142)
+  * PDBParser will check for the presence of the chainID attribute of an
+    atom group and use these values instead of just using the end of segid.
+    If no chainID attribute is present, then a default value will be
+    provided. If the chainID for an atom is invalid (longer than one character,
+    not alpha-numeric, blank) it will be replaced with a default. (Issue #3144)
   * ValueError raised when empty atomgroup is given to DensityAnalysis
     without a user defined grid. UserWarning displayed when user defined
     grid is provided. (Issue #3055)
@@ -149,7 +154,6 @@ Enhancements
   * Added an RDKit converter that works for any input with all hydrogens
     explicit in the topology (Issue #2468, PR #2775)
 
-
 Changes
   * Fixed inaccurate docstring inside the RMSD class (Issue #2796, PR #3134) 
   * TPRParser now loads TPR files with `tpr_resid_from_one=True` by default,

diff --git a/package/MDAnalysis/coordinates/PDB.py b/package/MDAnalysis/coordinates/PDB.py
@@ -1051,6 +1051,8 @@ def _write_timestep(self, ts, multiframe=False):
            Writing now only uses the contents of the elements attribute
            instead of guessing by default. If the elements are missing,
            empty records are written out (Issue #2423).
+           Atoms are now checked for a valid chainID instead of being
+           overwritten by the last letter of the `segid` (Issue #3144).
 
         """
         atoms = self.obj.atoms
@@ -1081,13 +1083,51 @@ def get_attr(attrname, default):
         resnames = get_attr('resnames', 'UNK')
         icodes = get_attr('icodes', ' ')
         segids = get_attr('segids', ' ')
+        chainids = get_attr('chainIDs', '')
         resids = get_attr('resids', 1)
         occupancies = get_attr('occupancies', 1.0)
         tempfactors = get_attr('tempfactors', 0.0)
         atomnames = get_attr('names', 'X')
         elements = get_attr('elements', ' ')
         record_types = get_attr('record_types', 'ATOM')
 
+        def validate_chainids(chainids, default):
+            """Validate each atom's chainID
+
+            chainids - np array of chainIDs
+            default - default value in case chainID is considered invalid
+            """
+            invalid_length_ids = False
+            invalid_char_ids = False
+            missing_ids = False
+
+            for (i, chainid) in enumerate(chainids):
+                if chainid == "":
+                    missing_ids = True
+                    chainids[i] = default
+                elif len(chainid) > 1:
+                    invalid_length_ids = True
+                    chainids[i] = default
+                elif not chainid.isalnum():
+                    invalid_char_ids = True
+                    chainids[i] = default
+
+            if invalid_length_ids:
+                warnings.warn("Found chainIDs with invalid length."
+                              " Corresponding atoms will use value of '{}'"
+                              "".format(default))
+            if invalid_char_ids:
+                warnings.warn("Found chainIDs using unnaccepted character."
+                              " Corresponding atoms will use value of '{}'"
+                              "".format(default))
+            if missing_ids:
+                warnings.warn("Found missing chainIDs."
+                              " Corresponding atoms will use value of '{}'"
+                              "".format(default))
+            return chainids
+
+        chainids = validate_chainids(chainids, "X")
+
         # If reindex == False, we use the atom ids for the serial. We do not
         # want to use a fallback here.
         if not self._reindex:
@@ -1107,13 +1147,13 @@ def get_attr(attrname, default):
             vals['name'] = self._deduce_PDB_atom_name(atomnames[i], resnames[i])
             vals['altLoc'] = altlocs[i][:1]
             vals['resName'] = resnames[i][:4]
-            vals['chainID'] = segids[i][-1:]
             vals['resSeq'] = util.ltruncate_int(resids[i], 4)
             vals['iCode'] = icodes[i][:1]
             vals['pos'] = pos[i]  # don't take off atom so conversion works
             vals['occupancy'] = occupancies[i]
             vals['tempFactor'] = tempfactors[i]
             vals['segID'] = segids[i][:4]
+            vals['chainID'] = chainids[i]
             vals['element'] = elements[i][:2].upper()
 
             # record_type attribute, if exists, can be ATOM or HETATM

diff --git a/testsuite/MDAnalysisTests/coordinates/test_pdb.py b/testsuite/MDAnalysisTests/coordinates/test_pdb.py
@@ -300,7 +300,7 @@ def test_writer_no_icodes(self, u_no_names, outfile):
     def test_writer_no_segids(self, u_no_names, outfile):
         u_no_names.atoms.write(outfile)
         u = mda.Universe(outfile)
-        expected = np.array(['SYSTEM'] * u_no_names.atoms.n_atoms)
+        expected = np.array(['X'] * u_no_names.atoms.n_atoms)
         assert_equal([atom.segid for atom in u.atoms], expected)
 
     def test_writer_no_occupancies(self, u_no_names, outfile):
@@ -455,13 +455,19 @@ def get_MODEL_lines(filename):
             # test number (only last 4 digits)
             assert int(line[10:14]) == model % 10000
 
-    def test_segid_chainid(self, universe2, outfile):
-        """check whether chainID comes from last character of segid (issue #2224)"""
-        ref_id = 'E'
-        u = universe2
+    @pytest.mark.parametrize("bad_chainid",
+                             ['@', '', 'AA'])
+    def test_chainid_validated(self, universe3, outfile, bad_chainid):
+        """
+        Check that an atom's chainID is set to 'X' if the chainID
+        does not confirm to standards (issue #2224)
+        """
+        default_id = 'X'
+        u = universe3
+        u.atoms.chainIDs = bad_chainid
         u.atoms.write(outfile)
         u_pdb = mda.Universe(outfile)
-        assert u_pdb.segments.chainIDs[0][0] == ref_id
+        assert_equal(u_pdb.segments.chainIDs[0][0], default_id)
 
     def test_stringio_outofrange(self, universe3):
         """
@@ -1072,7 +1078,7 @@ def test_atomname_alignment(self, writtenstuff):
             assert_equal(written[:16], reference)
 
     def test_atomtype_alignment(self, writtenstuff):
-        result_line = ("ATOM      1  H5T GUA A   1       7.974   6.430   9.561"
+        result_line = ("ATOM      1  H5T GUA X   1       7.974   6.430   9.561"
                        "  1.00  0.00      RNAA  \n")
         assert_equal(writtenstuff[9], result_line)