diff --git a/package/AUTHORS b/package/AUTHORS index 12318895610..ad93e39ca9c 100644 --- a/package/AUTHORS +++ b/package/AUTHORS @@ -162,7 +162,8 @@ Chronological list of authors - Hannah Pollak - Estefania Barreto-Ojeda - Paarth Thadani - + - Henry Kobin + External code ------------- diff --git a/package/CHANGELOG b/package/CHANGELOG index f2fd09f700a..a8948666c37 100644 --- a/package/CHANGELOG +++ b/package/CHANGELOG @@ -17,7 +17,7 @@ The rules for this file: lilyminium, daveminh, jbarnoud, yuxuanzhuang, VOD555, ianmkenney, calcraven,xiki-tempula, mieczyslaw, manuel.nuno.melo, PicoCentauri, hanatok, rmeli, aditya-kamath, tirkarthi, LeonardoBarneschi, hejamu, - biogen98, orioncohen, z3y50n, hp115, ojeda-e, thadanipaarth + biogen98, orioncohen, z3y50n, hp115, ojeda-e, thadanipaarth, HenryKobin * 2.0.0 @@ -30,6 +30,11 @@ Fixes zone/layer is empty, consistent with 'around' (Issue #2915) * A Universe created from an ROMol with no atoms returns now a Universe with 0 atoms (Issue #3142) + * PDBParser will check for the presence of the chainID attribute of an + atom group and use these values instead of just using the end of segid. + If no chainID attribute is present, then a default value will be + provided. If the chainID for an atom is invalid (longer than one character, + not alpha-numeric, blank) it will be replaced with a default. (Issue #3144) * ValueError raised when empty atomgroup is given to DensityAnalysis without a user defined grid. UserWarning displayed when user defined grid is provided. (Issue #3055) @@ -149,7 +154,6 @@ Enhancements * Added an RDKit converter that works for any input with all hydrogens explicit in the topology (Issue #2468, PR #2775) - Changes * Fixed inaccurate docstring inside the RMSD class (Issue #2796, PR #3134) * TPRParser now loads TPR files with `tpr_resid_from_one=True` by default, diff --git a/package/MDAnalysis/coordinates/PDB.py b/package/MDAnalysis/coordinates/PDB.py index 8a6a944ca53..badf47e8e9d 100644 --- a/package/MDAnalysis/coordinates/PDB.py +++ b/package/MDAnalysis/coordinates/PDB.py @@ -1051,6 +1051,8 @@ def _write_timestep(self, ts, multiframe=False): Writing now only uses the contents of the elements attribute instead of guessing by default. If the elements are missing, empty records are written out (Issue #2423). + Atoms are now checked for a valid chainID instead of being + overwritten by the last letter of the `segid` (Issue #3144). """ atoms = self.obj.atoms @@ -1081,6 +1083,7 @@ def get_attr(attrname, default): resnames = get_attr('resnames', 'UNK') icodes = get_attr('icodes', ' ') segids = get_attr('segids', ' ') + chainids = get_attr('chainIDs', '') resids = get_attr('resids', 1) occupancies = get_attr('occupancies', 1.0) tempfactors = get_attr('tempfactors', 0.0) @@ -1088,6 +1091,43 @@ def get_attr(attrname, default): elements = get_attr('elements', ' ') record_types = get_attr('record_types', 'ATOM') + def validate_chainids(chainids, default): + """Validate each atom's chainID + + chainids - np array of chainIDs + default - default value in case chainID is considered invalid + """ + invalid_length_ids = False + invalid_char_ids = False + missing_ids = False + + for (i, chainid) in enumerate(chainids): + if chainid == "": + missing_ids = True + chainids[i] = default + elif len(chainid) > 1: + invalid_length_ids = True + chainids[i] = default + elif not chainid.isalnum(): + invalid_char_ids = True + chainids[i] = default + + if invalid_length_ids: + warnings.warn("Found chainIDs with invalid length." + " Corresponding atoms will use value of '{}'" + "".format(default)) + if invalid_char_ids: + warnings.warn("Found chainIDs using unnaccepted character." + " Corresponding atoms will use value of '{}'" + "".format(default)) + if missing_ids: + warnings.warn("Found missing chainIDs." + " Corresponding atoms will use value of '{}'" + "".format(default)) + return chainids + + chainids = validate_chainids(chainids, "X") + # If reindex == False, we use the atom ids for the serial. We do not # want to use a fallback here. if not self._reindex: @@ -1107,13 +1147,13 @@ def get_attr(attrname, default): vals['name'] = self._deduce_PDB_atom_name(atomnames[i], resnames[i]) vals['altLoc'] = altlocs[i][:1] vals['resName'] = resnames[i][:4] - vals['chainID'] = segids[i][-1:] vals['resSeq'] = util.ltruncate_int(resids[i], 4) vals['iCode'] = icodes[i][:1] vals['pos'] = pos[i] # don't take off atom so conversion works vals['occupancy'] = occupancies[i] vals['tempFactor'] = tempfactors[i] vals['segID'] = segids[i][:4] + vals['chainID'] = chainids[i] vals['element'] = elements[i][:2].upper() # record_type attribute, if exists, can be ATOM or HETATM diff --git a/testsuite/MDAnalysisTests/coordinates/test_pdb.py b/testsuite/MDAnalysisTests/coordinates/test_pdb.py index 5f2b18c439f..92b970094d4 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_pdb.py +++ b/testsuite/MDAnalysisTests/coordinates/test_pdb.py @@ -300,7 +300,7 @@ def test_writer_no_icodes(self, u_no_names, outfile): def test_writer_no_segids(self, u_no_names, outfile): u_no_names.atoms.write(outfile) u = mda.Universe(outfile) - expected = np.array(['SYSTEM'] * u_no_names.atoms.n_atoms) + expected = np.array(['X'] * u_no_names.atoms.n_atoms) assert_equal([atom.segid for atom in u.atoms], expected) def test_writer_no_occupancies(self, u_no_names, outfile): @@ -455,13 +455,19 @@ def get_MODEL_lines(filename): # test number (only last 4 digits) assert int(line[10:14]) == model % 10000 - def test_segid_chainid(self, universe2, outfile): - """check whether chainID comes from last character of segid (issue #2224)""" - ref_id = 'E' - u = universe2 + @pytest.mark.parametrize("bad_chainid", + ['@', '', 'AA']) + def test_chainid_validated(self, universe3, outfile, bad_chainid): + """ + Check that an atom's chainID is set to 'X' if the chainID + does not confirm to standards (issue #2224) + """ + default_id = 'X' + u = universe3 + u.atoms.chainIDs = bad_chainid u.atoms.write(outfile) u_pdb = mda.Universe(outfile) - assert u_pdb.segments.chainIDs[0][0] == ref_id + assert_equal(u_pdb.segments.chainIDs[0][0], default_id) def test_stringio_outofrange(self, universe3): """ @@ -1072,7 +1078,7 @@ def test_atomname_alignment(self, writtenstuff): assert_equal(written[:16], reference) def test_atomtype_alignment(self, writtenstuff): - result_line = ("ATOM 1 H5T GUA A 1 7.974 6.430 9.561" + result_line = ("ATOM 1 H5T GUA X 1 7.974 6.430 9.561" " 1.00 0.00 RNAA \n") assert_equal(writtenstuff[9], result_line)