From 4046c55327a8e8bbcdadfaf523d61208940ec0d3 Mon Sep 17 00:00:00 2001 From: Henry Kobin Date: Sat, 13 Mar 2021 17:03:48 -0800 Subject: [PATCH 01/21] stopped overwrite of chainIDs by segids but allow segids to be chainIDs if chainID is blank --- package/MDAnalysis/coordinates/PDB.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/package/MDAnalysis/coordinates/PDB.py b/package/MDAnalysis/coordinates/PDB.py index 8a6a944ca53..e516b5c1cf2 100644 --- a/package/MDAnalysis/coordinates/PDB.py +++ b/package/MDAnalysis/coordinates/PDB.py @@ -1051,6 +1051,8 @@ def _write_timestep(self, ts, multiframe=False): Writing now only uses the contents of the elements attribute instead of guessing by default. If the elements are missing, empty records are written out (Issue #2423). + Atoms are now checked for a chainID instead of being overwritten + by the last letter of the SegmentID (Issue #3144). """ atoms = self.obj.atoms @@ -1080,6 +1082,7 @@ def get_attr(attrname, default): altlocs = get_attr('altLocs', ' ') resnames = get_attr('resnames', 'UNK') icodes = get_attr('icodes', ' ') + chainids = get_attr('chainIDs', ' ') segids = get_attr('segids', ' ') resids = get_attr('resids', 1) occupancies = get_attr('occupancies', 1.0) @@ -1087,6 +1090,10 @@ def get_attr(attrname, default): atomnames = get_attr('names', 'X') elements = get_attr('elements', ' ') record_types = get_attr('record_types', 'ATOM') + chainids = ( + [id[:4] if (chainids[index] != '') else segids[index][-1:] + for index, id in enumerate(chainids)] + ) # check for a chainID, if no chainID default to segid # If reindex == False, we use the atom ids for the serial. We do not # want to use a fallback here. @@ -1107,13 +1114,15 @@ def get_attr(attrname, default): vals['name'] = self._deduce_PDB_atom_name(atomnames[i], resnames[i]) vals['altLoc'] = altlocs[i][:1] vals['resName'] = resnames[i][:4] - vals['chainID'] = segids[i][-1:] vals['resSeq'] = util.ltruncate_int(resids[i], 4) vals['iCode'] = icodes[i][:1] vals['pos'] = pos[i] # don't take off atom so conversion works vals['occupancy'] = occupancies[i] vals['tempFactor'] = tempfactors[i] vals['segID'] = segids[i][:4] + vals['chainID'] = ( + chainids[i][:4] if (chainids[i][:4] != ' ') else segids[i][-1:] + ) # check to see if there is a valid chainID else segID vals['element'] = elements[i][:2].upper() # record_type attribute, if exists, can be ATOM or HETATM From cbc5b6e0ea7ec76a330d3452d579466a830a48f3 Mon Sep 17 00:00:00 2001 From: Henry Kobin Date: Sat, 13 Mar 2021 17:04:38 -0800 Subject: [PATCH 02/21] added test for chainID overwrite by segids --- testsuite/MDAnalysisTests/coordinates/test_pdb.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/testsuite/MDAnalysisTests/coordinates/test_pdb.py b/testsuite/MDAnalysisTests/coordinates/test_pdb.py index 5f2b18c439f..8813114805c 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_pdb.py +++ b/testsuite/MDAnalysisTests/coordinates/test_pdb.py @@ -463,6 +463,15 @@ def test_segid_chainid(self, universe2, outfile): u_pdb = mda.Universe(outfile) assert u_pdb.segments.chainIDs[0][0] == ref_id + def test_segid_chainid_overwrite(self, universe3, outfile): + """check if chainID gets round-tripped after file write (issue #3144)""" + ref_id = 'x' + u = universe3 + u.atoms.chainIDs = ref_id + u.atoms.write(outfile) + u_pdb = mda.Universe(outfile) + assert u_pdb.atoms.chainIDs[0] == ref_id + def test_stringio_outofrange(self, universe3): """ Check that when StringIO is used, the correct out-of-range error for From a1447baac40b9e74e1e66b9f46ae6bdc40ccfb4e Mon Sep 17 00:00:00 2001 From: Henry Kobin Date: Sat, 13 Mar 2021 17:05:05 -0800 Subject: [PATCH 03/21] updated changelog --- package/CHANGELOG | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/package/CHANGELOG b/package/CHANGELOG index 7fa27bb5178..4609b31347d 100644 --- a/package/CHANGELOG +++ b/package/CHANGELOG @@ -16,16 +16,17 @@ The rules for this file: ??/??/?? tylerjereddy, richardjgowers, IAlibay, hmacdope, orbeckst, cbouy, lilyminium, daveminh, jbarnoud, yuxuanzhuang, VOD555, ianmkenney, calcraven,xiki-tempula, mieczyslaw, manuel.nuno.melo, PicoCentauri, - hanatok, rmeli, aditya-kamath, tirkarthi + hanatok, rmeli, aditya-kamath, tirkarthi, Henry Kobin * 2.0.0 Fixes + * Fixed chainID being overwritten by segid during file write. (Issue #3144) * ValueError raised when empty atomgroup is given to DensityAnalysis - without a user defined grid. UserWarning displayed when user defined + without a user defined grid. UserWarning displayed when user defined grid is provided. (Issue #3055) * Fixed import of several `__all__` lists (PR #3013) - * atommethods (_get_prev/next_residues_by_resid) returns empty residue group + * atommethods (_get_prev/next_residues_by_resid) returns empty residue group when given empty residue group (Issue #3089) * MOL2 files without bonds can now be read and written (Issue #3057) * Write `CONECT` record only once in multi-model PDB files (Issue #3018) @@ -97,7 +98,7 @@ Enhancements whitespace for ranges (Issue #3054, PR #2927) * Added `atol` and `rtol` keywords for float comparison (PR #2927) * Improved performance of the ParmEd converter (Issue #3028, PR #3029) - * Improved analysis class docstrings, and added missing classes to the + * Improved analysis class docstrings, and added missing classes to the `__all__` list (PR #2998) * The PDB writer gives more control over how to write the atom ids (Issue #1072, PR #2886) @@ -131,7 +132,7 @@ Enhancements 'protein' selection (#2751 PR #2755) * Added an RDKit converter that works for any input with all hydrogens explicit in the topology (Issue #2468, PR #2775) - + Changes * Introduces encore specific C compiler arguments to allow for lowering of optimisations on non-x86 platforms (Issue #1389, PR #3149) @@ -164,7 +165,7 @@ Changes * Sets the minimal RDKit version for CI to 2020.03.1 (Issue #2827, PR #2831) * Removes deprecated waterdynamics.HydrogenBondLifetimes (PR #2842) * Make NeighborSearch return empty atomgroup, residue, segments instead of list (Issue #2892, PR #2907) - * Updated Universe creation function signatures to named arguments (Issue #2921) + * Updated Universe creation function signatures to named arguments (Issue #2921) * The transformation was changed from a function/closure to a class with `__call__` (Issue #2860, PR #2859) * deprecated ``analysis.helanal`` module has been removed in favour of From 41d14aedb51d228d5864c345e8569c1b93854a61 Mon Sep 17 00:00:00 2001 From: Henry Kobin Date: Sat, 13 Mar 2021 17:14:01 -0800 Subject: [PATCH 04/21] pep8 corrections --- package/MDAnalysis/coordinates/PDB.py | 4 ++-- testsuite/MDAnalysisTests/coordinates/test_pdb.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/package/MDAnalysis/coordinates/PDB.py b/package/MDAnalysis/coordinates/PDB.py index e516b5c1cf2..8b37dc9ec26 100644 --- a/package/MDAnalysis/coordinates/PDB.py +++ b/package/MDAnalysis/coordinates/PDB.py @@ -1092,8 +1092,8 @@ def get_attr(attrname, default): record_types = get_attr('record_types', 'ATOM') chainids = ( [id[:4] if (chainids[index] != '') else segids[index][-1:] - for index, id in enumerate(chainids)] - ) # check for a chainID, if no chainID default to segid + for index, id in enumerate(chainids)] + ) # check for a chainID, if no chainID default to segid # If reindex == False, we use the atom ids for the serial. We do not # want to use a fallback here. diff --git a/testsuite/MDAnalysisTests/coordinates/test_pdb.py b/testsuite/MDAnalysisTests/coordinates/test_pdb.py index 8813114805c..54a46ccee9a 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_pdb.py +++ b/testsuite/MDAnalysisTests/coordinates/test_pdb.py @@ -456,7 +456,7 @@ def get_MODEL_lines(filename): assert int(line[10:14]) == model % 10000 def test_segid_chainid(self, universe2, outfile): - """check whether chainID comes from last character of segid (issue #2224)""" + """check if chainID comes from last character of segid (issue #2224)""" ref_id = 'E' u = universe2 u.atoms.write(outfile) From d1c94fd39e2382555e355578a603b026119ba609 Mon Sep 17 00:00:00 2001 From: Henry Kobin Date: Sat, 13 Mar 2021 17:21:07 -0800 Subject: [PATCH 05/21] another pep8 fix --- testsuite/MDAnalysisTests/coordinates/test_pdb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testsuite/MDAnalysisTests/coordinates/test_pdb.py b/testsuite/MDAnalysisTests/coordinates/test_pdb.py index 54a46ccee9a..4b3bfa504b2 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_pdb.py +++ b/testsuite/MDAnalysisTests/coordinates/test_pdb.py @@ -464,7 +464,7 @@ def test_segid_chainid(self, universe2, outfile): assert u_pdb.segments.chainIDs[0][0] == ref_id def test_segid_chainid_overwrite(self, universe3, outfile): - """check if chainID gets round-tripped after file write (issue #3144)""" + """check if chainID gets round-tripped after write (issue #3144)""" ref_id = 'x' u = universe3 u.atoms.chainIDs = ref_id From 48bf64d777405dc4bcc486727ba63b9b303c0fa5 Mon Sep 17 00:00:00 2001 From: Henry Kobin Date: Sat, 13 Mar 2021 17:24:25 -0800 Subject: [PATCH 06/21] Update package/CHANGELOG Co-authored-by: Lily Wang <31115101+lilyminium@users.noreply.github.com> --- package/CHANGELOG | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package/CHANGELOG b/package/CHANGELOG index 4609b31347d..0b0184cb64c 100644 --- a/package/CHANGELOG +++ b/package/CHANGELOG @@ -16,7 +16,7 @@ The rules for this file: ??/??/?? tylerjereddy, richardjgowers, IAlibay, hmacdope, orbeckst, cbouy, lilyminium, daveminh, jbarnoud, yuxuanzhuang, VOD555, ianmkenney, calcraven,xiki-tempula, mieczyslaw, manuel.nuno.melo, PicoCentauri, - hanatok, rmeli, aditya-kamath, tirkarthi, Henry Kobin + hanatok, rmeli, aditya-kamath, tirkarthi, HenryKobin * 2.0.0 From 50de0b6a6ca9b60249fcbdeb2b20ac65c418c18b Mon Sep 17 00:00:00 2001 From: Henry Kobin Date: Sat, 13 Mar 2021 17:25:26 -0800 Subject: [PATCH 07/21] Update package/MDAnalysis/coordinates/PDB.py Co-authored-by: Lily Wang <31115101+lilyminium@users.noreply.github.com> --- package/MDAnalysis/coordinates/PDB.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package/MDAnalysis/coordinates/PDB.py b/package/MDAnalysis/coordinates/PDB.py index 8b37dc9ec26..785c8f4bc4a 100644 --- a/package/MDAnalysis/coordinates/PDB.py +++ b/package/MDAnalysis/coordinates/PDB.py @@ -1052,7 +1052,7 @@ def _write_timestep(self, ts, multiframe=False): instead of guessing by default. If the elements are missing, empty records are written out (Issue #2423). Atoms are now checked for a chainID instead of being overwritten - by the last letter of the SegmentID (Issue #3144). + by the last letter of the `segid` (Issue #3144). """ atoms = self.obj.atoms From 2cb56e86ca9de2b8151de19a2e50686dae08d8f6 Mon Sep 17 00:00:00 2001 From: Henry Kobin Date: Sat, 13 Mar 2021 19:00:54 -0800 Subject: [PATCH 08/21] corrections to PDBParser --- package/AUTHORS | 3 ++- package/CHANGELOG | 6 ++++-- package/MDAnalysis/coordinates/PDB.py | 16 ++++++---------- .../MDAnalysisTests/coordinates/test_pdb.py | 11 +++++++---- 4 files changed, 19 insertions(+), 17 deletions(-) diff --git a/package/AUTHORS b/package/AUTHORS index 93dcb47b8a8..c0b38d62cf7 100644 --- a/package/AUTHORS +++ b/package/AUTHORS @@ -154,7 +154,8 @@ Chronological list of authors - Karthikeyan Singaravelan 2021 - Aditya Kamath - + - Henry Kobin + External code ------------- diff --git a/package/CHANGELOG b/package/CHANGELOG index 4609b31347d..b1b889ceaa6 100644 --- a/package/CHANGELOG +++ b/package/CHANGELOG @@ -16,12 +16,14 @@ The rules for this file: ??/??/?? tylerjereddy, richardjgowers, IAlibay, hmacdope, orbeckst, cbouy, lilyminium, daveminh, jbarnoud, yuxuanzhuang, VOD555, ianmkenney, calcraven,xiki-tempula, mieczyslaw, manuel.nuno.melo, PicoCentauri, - hanatok, rmeli, aditya-kamath, tirkarthi, Henry Kobin + hanatok, rmeli, aditya-kamath, tirkarthi, HenryKobin * 2.0.0 Fixes - * Fixed chainID being overwritten by segid during file write. (Issue #3144) + * PDBParser will check for the presence of the chainID attribute of an + atom group and use these values instead of just using the end of segid. + If no chainID attribute is present, then segid will be used. (Issue #3144) * ValueError raised when empty atomgroup is given to DensityAnalysis without a user defined grid. UserWarning displayed when user defined grid is provided. (Issue #3055) diff --git a/package/MDAnalysis/coordinates/PDB.py b/package/MDAnalysis/coordinates/PDB.py index 8b37dc9ec26..8675b2f7fef 100644 --- a/package/MDAnalysis/coordinates/PDB.py +++ b/package/MDAnalysis/coordinates/PDB.py @@ -1051,8 +1051,9 @@ def _write_timestep(self, ts, multiframe=False): Writing now only uses the contents of the elements attribute instead of guessing by default. If the elements are missing, empty records are written out (Issue #2423). - Atoms are now checked for a chainID instead of being overwritten - by the last letter of the SegmentID (Issue #3144). + chainID now comes from the AtomGroup if the attribute is present. + If no chainID is present then the chainID will default to the l + last letter of segid (Issue #3144). """ atoms = self.obj.atoms @@ -1082,18 +1083,15 @@ def get_attr(attrname, default): altlocs = get_attr('altLocs', ' ') resnames = get_attr('resnames', 'UNK') icodes = get_attr('icodes', ' ') - chainids = get_attr('chainIDs', ' ') segids = get_attr('segids', ' ') + chainids = getattr(atoms,'chainIDs', segids) resids = get_attr('resids', 1) occupancies = get_attr('occupancies', 1.0) tempfactors = get_attr('tempfactors', 0.0) atomnames = get_attr('names', 'X') elements = get_attr('elements', ' ') record_types = get_attr('record_types', 'ATOM') - chainids = ( - [id[:4] if (chainids[index] != '') else segids[index][-1:] - for index, id in enumerate(chainids)] - ) # check for a chainID, if no chainID default to segid + # If reindex == False, we use the atom ids for the serial. We do not # want to use a fallback here. @@ -1120,9 +1118,7 @@ def get_attr(attrname, default): vals['occupancy'] = occupancies[i] vals['tempFactor'] = tempfactors[i] vals['segID'] = segids[i][:4] - vals['chainID'] = ( - chainids[i][:4] if (chainids[i][:4] != ' ') else segids[i][-1:] - ) # check to see if there is a valid chainID else segID + vals['chainID'] = chainids[i][-1:] vals['element'] = elements[i][:2].upper() # record_type attribute, if exists, can be ATOM or HETATM diff --git a/testsuite/MDAnalysisTests/coordinates/test_pdb.py b/testsuite/MDAnalysisTests/coordinates/test_pdb.py index 4b3bfa504b2..665ad09d09f 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_pdb.py +++ b/testsuite/MDAnalysisTests/coordinates/test_pdb.py @@ -27,7 +27,7 @@ import numpy as np import pytest from MDAnalysisTests import make_Universe -from MDAnalysisTests.coordinates.base import _SingleFrameReader +from MDAnalysisTests.coordinates.base impaort _SingleFrameReader from MDAnalysisTests.coordinates.reference import (RefAdKSmall, RefAdK) from MDAnalysisTests.datafiles import (PDB, PDB_small, PDB_multiframe, @@ -456,7 +456,7 @@ def get_MODEL_lines(filename): assert int(line[10:14]) == model % 10000 def test_segid_chainid(self, universe2, outfile): - """check if chainID comes from last character of segid (issue #2224)""" + """Check if chainID comes from last character of segid (issue #2224)""" ref_id = 'E' u = universe2 u.atoms.write(outfile) @@ -464,13 +464,16 @@ def test_segid_chainid(self, universe2, outfile): assert u_pdb.segments.chainIDs[0][0] == ref_id def test_segid_chainid_overwrite(self, universe3, outfile): - """check if chainID gets round-tripped after write (issue #3144)""" + """ + Check that chainID attribute is not overwritten by SegmentID + if the chainID attribute is present (issue #3144) + """ ref_id = 'x' u = universe3 u.atoms.chainIDs = ref_id u.atoms.write(outfile) u_pdb = mda.Universe(outfile) - assert u_pdb.atoms.chainIDs[0] == ref_id + assert_equal(u_pdb.atoms.chainIDs[0], ref_id) def test_stringio_outofrange(self, universe3): """ From d2814a15c37297ac43e46cbca28d8cbd9be5a23b Mon Sep 17 00:00:00 2001 From: Henry Kobin Date: Sat, 13 Mar 2021 19:05:02 -0800 Subject: [PATCH 09/21] whitespace correction --- package/MDAnalysis/coordinates/PDB.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package/MDAnalysis/coordinates/PDB.py b/package/MDAnalysis/coordinates/PDB.py index 53248de64c1..7a2cc612be7 100644 --- a/package/MDAnalysis/coordinates/PDB.py +++ b/package/MDAnalysis/coordinates/PDB.py @@ -1083,7 +1083,7 @@ def get_attr(attrname, default): resnames = get_attr('resnames', 'UNK') icodes = get_attr('icodes', ' ') segids = get_attr('segids', ' ') - chainids = getattr(atoms,'chainIDs', segids) + chainids = getattr(atoms, 'chainIDs', segids) resids = get_attr('resids', 1) occupancies = get_attr('occupancies', 1.0) tempfactors = get_attr('tempfactors', 0.0) From 4282daeb54e204902573338647cb241e0a5c775b Mon Sep 17 00:00:00 2001 From: Henry Kobin Date: Sat, 13 Mar 2021 19:25:17 -0800 Subject: [PATCH 10/21] fixed typo --- testsuite/MDAnalysisTests/coordinates/test_pdb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testsuite/MDAnalysisTests/coordinates/test_pdb.py b/testsuite/MDAnalysisTests/coordinates/test_pdb.py index 665ad09d09f..1ef3cd8a1ea 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_pdb.py +++ b/testsuite/MDAnalysisTests/coordinates/test_pdb.py @@ -27,7 +27,7 @@ import numpy as np import pytest from MDAnalysisTests import make_Universe -from MDAnalysisTests.coordinates.base impaort _SingleFrameReader +from MDAnalysisTests.coordinates.base import _SingleFrameReader from MDAnalysisTests.coordinates.reference import (RefAdKSmall, RefAdK) from MDAnalysisTests.datafiles import (PDB, PDB_small, PDB_multiframe, From 004b0111f86c967bd2b007d88b6c10d0d9a58ce1 Mon Sep 17 00:00:00 2001 From: Henry Kobin Date: Sun, 14 Mar 2021 13:35:15 -0700 Subject: [PATCH 11/21] version where chainid defaults to end of segid if invalid --- package/MDAnalysis/coordinates/PDB.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/package/MDAnalysis/coordinates/PDB.py b/package/MDAnalysis/coordinates/PDB.py index 7a2cc612be7..8624fa9b44d 100644 --- a/package/MDAnalysis/coordinates/PDB.py +++ b/package/MDAnalysis/coordinates/PDB.py @@ -1083,7 +1083,7 @@ def get_attr(attrname, default): resnames = get_attr('resnames', 'UNK') icodes = get_attr('icodes', ' ') segids = get_attr('segids', ' ') - chainids = getattr(atoms, 'chainIDs', segids) + chainids = get_attr('chainIDs', ' ') resids = get_attr('resids', 1) occupancies = get_attr('occupancies', 1.0) tempfactors = get_attr('tempfactors', 0.0) @@ -1091,6 +1091,9 @@ def get_attr(attrname, default): elements = get_attr('elements', ' ') record_types = get_attr('record_types', 'ATOM') + # If non alphanumeric occurs in chainids, resort back to using segids . + if not any([id.isalnum() for id in chainids]): + chainids = segids # If reindex == False, we use the atom ids for the serial. We do not # want to use a fallback here. From b52d88fbe8bd0a744d6eb7c3344dd18624df2b14 Mon Sep 17 00:00:00 2001 From: Henry Kobin Date: Tue, 16 Mar 2021 12:19:30 -0700 Subject: [PATCH 12/21] added function to validate chainIDs on an individual basis and updated tests to reflect changes --- package/MDAnalysis/coordinates/PDB.py | 43 ++++++++++++++++--- .../MDAnalysisTests/coordinates/test_pdb.py | 35 ++++++++------- 2 files changed, 58 insertions(+), 20 deletions(-) diff --git a/package/MDAnalysis/coordinates/PDB.py b/package/MDAnalysis/coordinates/PDB.py index 8624fa9b44d..e435e60f725 100644 --- a/package/MDAnalysis/coordinates/PDB.py +++ b/package/MDAnalysis/coordinates/PDB.py @@ -1083,7 +1083,7 @@ def get_attr(attrname, default): resnames = get_attr('resnames', 'UNK') icodes = get_attr('icodes', ' ') segids = get_attr('segids', ' ') - chainids = get_attr('chainIDs', ' ') + chainids = get_attr('chainIDs', 'X') resids = get_attr('resids', 1) occupancies = get_attr('occupancies', 1.0) tempfactors = get_attr('tempfactors', 0.0) @@ -1091,9 +1091,42 @@ def get_attr(attrname, default): elements = get_attr('elements', ' ') record_types = get_attr('record_types', 'ATOM') - # If non alphanumeric occurs in chainids, resort back to using segids . - if not any([id.isalnum() for id in chainids]): - chainids = segids + def validate_chainids(chainids, default): + """Validate each atom's chainID + + chainids - np array of chainIDs + default - default value in case chainID is considered invalid + """ + invalid_length_ids = False + invalid_char_ids = False + missing_ids = False + + for (i, id) in enumerate(chainids): + if id is None: + missing_ids = True + chainids[i] = default + elif len(id) > 1: + invalid_length_ids = True + chainids[i] = default + elif not id.isalnum(): + invalid_char_ids = True + chainids[i] = default + + if invalid_length_ids: + warnings.warn("Found chainIDs with invalid length." + " Corresponding atoms will use value of '{}'" + "".format(default)) + if invalid_char_ids: + warnings.warn("Found chainIDs using unnaccepted character." + " Corresponding atoms will use value of '{}'" + "".format(default)) + if missing_ids: + warnings.warn("Found missing chainIDs." + " Corresponding atoms will use value of '{}'" + "".format(default)) + return chainids + + chainids = validate_chainids(chainids, "X") # If reindex == False, we use the atom ids for the serial. We do not # want to use a fallback here. @@ -1120,7 +1153,7 @@ def get_attr(attrname, default): vals['occupancy'] = occupancies[i] vals['tempFactor'] = tempfactors[i] vals['segID'] = segids[i][:4] - vals['chainID'] = chainids[i][-1:] + vals['chainID'] = chainids[i] vals['element'] = elements[i][:2].upper() # record_type attribute, if exists, can be ATOM or HETATM diff --git a/testsuite/MDAnalysisTests/coordinates/test_pdb.py b/testsuite/MDAnalysisTests/coordinates/test_pdb.py index 1ef3cd8a1ea..5032e1c7e30 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_pdb.py +++ b/testsuite/MDAnalysisTests/coordinates/test_pdb.py @@ -455,25 +455,30 @@ def get_MODEL_lines(filename): # test number (only last 4 digits) assert int(line[10:14]) == model % 10000 - def test_segid_chainid(self, universe2, outfile): - """Check if chainID comes from last character of segid (issue #2224)""" - ref_id = 'E' - u = universe2 - u.atoms.write(outfile) - u_pdb = mda.Universe(outfile) - assert u_pdb.segments.chainIDs[0][0] == ref_id - - def test_segid_chainid_overwrite(self, universe3, outfile): + def test_chainid_validated(self, universe3, outfile): """ - Check that chainID attribute is not overwritten by SegmentID - if the chainID attribute is present (issue #3144) + Check that an atom's chainID is set to 'X' if the chainID + does not confirm to standards (issue #2224) """ - ref_id = 'x' + default_id = 'X' + bad_id = '@' + u = universe3 + u.atoms.chainIDs = bad_id + u.atoms.write(outfile) + u_pdb = mda.Universe(outfile) + assert_equal(u_pdb.segments.chainIDs[0][0], default_id) + bad_id = 'AA' + u = universe3 + u.atoms.chainIDs = bad_id + u.atoms.write(outfile) + u_pdb = mda.Universe(outfile) + assert_equal(u_pdb.segments.chainIDs[0][0], default_id) + bad_id = ' ' u = universe3 - u.atoms.chainIDs = ref_id + u.atoms.chainIDs = bad_id u.atoms.write(outfile) u_pdb = mda.Universe(outfile) - assert_equal(u_pdb.atoms.chainIDs[0], ref_id) + assert_equal(u_pdb.segments.chainIDs[0][0], default_id) def test_stringio_outofrange(self, universe3): """ @@ -1084,7 +1089,7 @@ def test_atomname_alignment(self, writtenstuff): assert_equal(written[:16], reference) def test_atomtype_alignment(self, writtenstuff): - result_line = ("ATOM 1 H5T GUA A 1 7.974 6.430 9.561" + result_line = ("ATOM 1 H5T GUA X 1 7.974 6.430 9.561" " 1.00 0.00 RNAA \n") assert_equal(writtenstuff[9], result_line) From f144c55c25b3d5c47cce42d6dd09ac220518e7ea Mon Sep 17 00:00:00 2001 From: Henry Kobin Date: Tue, 16 Mar 2021 12:25:10 -0700 Subject: [PATCH 13/21] updated CHANGELOG --- package/CHANGELOG | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/package/CHANGELOG b/package/CHANGELOG index b1b889ceaa6..34201e8a6ee 100644 --- a/package/CHANGELOG +++ b/package/CHANGELOG @@ -23,7 +23,9 @@ The rules for this file: Fixes * PDBParser will check for the presence of the chainID attribute of an atom group and use these values instead of just using the end of segid. - If no chainID attribute is present, then segid will be used. (Issue #3144) + If no chainID attribute is present, then a default value will be + provided. If the chainID for an atom is invalid (longer than one character, + not alpha-numeric, blank) it will be replaced with a default. (Issue #3144) * ValueError raised when empty atomgroup is given to DensityAnalysis without a user defined grid. UserWarning displayed when user defined grid is provided. (Issue #3055) From 4506f3398666342460d4acbf54e558a3725218ef Mon Sep 17 00:00:00 2001 From: Henry Kobin Date: Tue, 30 Mar 2021 12:22:36 -0700 Subject: [PATCH 14/21] changed using the term id to chainid to avoid python default conflicts --- package/MDAnalysis/coordinates/PDB.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/package/MDAnalysis/coordinates/PDB.py b/package/MDAnalysis/coordinates/PDB.py index e435e60f725..5c7b14aeefd 100644 --- a/package/MDAnalysis/coordinates/PDB.py +++ b/package/MDAnalysis/coordinates/PDB.py @@ -1051,8 +1051,8 @@ def _write_timestep(self, ts, multiframe=False): Writing now only uses the contents of the elements attribute instead of guessing by default. If the elements are missing, empty records are written out (Issue #2423). - Atoms are now checked for a chainID instead of being overwritten - by the last letter of the `segid` (Issue #3144). + Atoms are now checked for a valid chainID instead of being + overwritten by the last letter of the `segid` (Issue #3144). """ atoms = self.obj.atoms @@ -1083,7 +1083,7 @@ def get_attr(attrname, default): resnames = get_attr('resnames', 'UNK') icodes = get_attr('icodes', ' ') segids = get_attr('segids', ' ') - chainids = get_attr('chainIDs', 'X') + chainids = get_attr('chainIDs', ' ') resids = get_attr('resids', 1) occupancies = get_attr('occupancies', 1.0) tempfactors = get_attr('tempfactors', 0.0) @@ -1101,14 +1101,14 @@ def validate_chainids(chainids, default): invalid_char_ids = False missing_ids = False - for (i, id) in enumerate(chainids): - if id is None: + for (i, chainid) in enumerate(chainids): + if chainid is None: missing_ids = True chainids[i] = default - elif len(id) > 1: + elif len(chainid) > 1: invalid_length_ids = True chainids[i] = default - elif not id.isalnum(): + elif not chainid.isalnum(): invalid_char_ids = True chainids[i] = default From cbe3624783508f64913f0cd7125dd3cdb6e1eacb Mon Sep 17 00:00:00 2001 From: Henry Kobin Date: Tue, 30 Mar 2021 12:23:03 -0700 Subject: [PATCH 15/21] if no segids present then default to blank value --- package/MDAnalysis/topology/PDBParser.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/package/MDAnalysis/topology/PDBParser.py b/package/MDAnalysis/topology/PDBParser.py index 468d04dc985..f551a31d308 100644 --- a/package/MDAnalysis/topology/PDBParser.py +++ b/package/MDAnalysis/topology/PDBParser.py @@ -288,10 +288,6 @@ def _parseatoms(self): warnings.warn("Serial numbers went over 100,000. " "Higher serials have been guessed") - # If segids not present, try to use chainids - if not any(segids): - segids = chainids - n_atoms = len(serials) attrs = [] @@ -360,7 +356,7 @@ def _parseatoms(self): attrs.append(Segids(segids)) else: n_segments = 1 - attrs.append(Segids(np.array(['SYSTEM'], dtype=object))) + attrs.append(Segids(np.array([' '], dtype=object))) segidx = None top = Topology(n_atoms, n_residues, n_segments, From d505a14609eaca1530fdc4b14713dac25f4f22e2 Mon Sep 17 00:00:00 2001 From: Henry Kobin Date: Tue, 30 Mar 2021 12:24:05 -0700 Subject: [PATCH 16/21] added to changelog on PDBParser.py --- package/MDAnalysis/topology/PDBParser.py | 1 + 1 file changed, 1 insertion(+) diff --git a/package/MDAnalysis/topology/PDBParser.py b/package/MDAnalysis/topology/PDBParser.py index f551a31d308..c8275f563e8 100644 --- a/package/MDAnalysis/topology/PDBParser.py +++ b/package/MDAnalysis/topology/PDBParser.py @@ -184,6 +184,7 @@ class PDBParser(TopologyReaderBase): Bonds attribute is not added if no bonds are present in PDB file. If elements are invalid or partially missing, empty elements records are now assigned (Issue #2422). + If no segids present default to blank value.(Issue #3144) """ format = ['PDB', 'ENT'] From 14a3836ccb4f745759360190d40280862c71a476 Mon Sep 17 00:00:00 2001 From: Henry Kobin Date: Tue, 6 Apr 2021 18:44:56 -0700 Subject: [PATCH 17/21] segids default to blank instead of chainIDs or 'SYSTEM' --- package/MDAnalysis/topology/PDBParser.py | 3 ++- testsuite/MDAnalysisTests/coordinates/test_pdb.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/package/MDAnalysis/topology/PDBParser.py b/package/MDAnalysis/topology/PDBParser.py index c8275f563e8..7aeb17a8433 100644 --- a/package/MDAnalysis/topology/PDBParser.py +++ b/package/MDAnalysis/topology/PDBParser.py @@ -184,7 +184,8 @@ class PDBParser(TopologyReaderBase): Bonds attribute is not added if no bonds are present in PDB file. If elements are invalid or partially missing, empty elements records are now assigned (Issue #2422). - If no segids present default to blank value.(Issue #3144) + If segids are not present they will default to blank instead + of SYSTEM.(Issue #3144). """ format = ['PDB', 'ENT'] diff --git a/testsuite/MDAnalysisTests/coordinates/test_pdb.py b/testsuite/MDAnalysisTests/coordinates/test_pdb.py index 5032e1c7e30..d50fc1254c1 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_pdb.py +++ b/testsuite/MDAnalysisTests/coordinates/test_pdb.py @@ -300,7 +300,7 @@ def test_writer_no_icodes(self, u_no_names, outfile): def test_writer_no_segids(self, u_no_names, outfile): u_no_names.atoms.write(outfile) u = mda.Universe(outfile) - expected = np.array(['SYSTEM'] * u_no_names.atoms.n_atoms) + expected = np.array([' '] * u_no_names.atoms.n_atoms) assert_equal([atom.segid for atom in u.atoms], expected) def test_writer_no_occupancies(self, u_no_names, outfile): From 0d0bd971eaa9440159ea7da5aca0fcb9e18aec0d Mon Sep 17 00:00:00 2001 From: Henry Kobin Date: Tue, 6 Apr 2021 20:28:37 -0700 Subject: [PATCH 18/21] reverted changes to pdbparser --- package/MDAnalysis/topology/PDBParser.py | 8 +++++--- testsuite/MDAnalysisTests/coordinates/test_pdb.py | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/package/MDAnalysis/topology/PDBParser.py b/package/MDAnalysis/topology/PDBParser.py index 7aeb17a8433..468d04dc985 100644 --- a/package/MDAnalysis/topology/PDBParser.py +++ b/package/MDAnalysis/topology/PDBParser.py @@ -184,8 +184,6 @@ class PDBParser(TopologyReaderBase): Bonds attribute is not added if no bonds are present in PDB file. If elements are invalid or partially missing, empty elements records are now assigned (Issue #2422). - If segids are not present they will default to blank instead - of SYSTEM.(Issue #3144). """ format = ['PDB', 'ENT'] @@ -290,6 +288,10 @@ def _parseatoms(self): warnings.warn("Serial numbers went over 100,000. " "Higher serials have been guessed") + # If segids not present, try to use chainids + if not any(segids): + segids = chainids + n_atoms = len(serials) attrs = [] @@ -358,7 +360,7 @@ def _parseatoms(self): attrs.append(Segids(segids)) else: n_segments = 1 - attrs.append(Segids(np.array([' '], dtype=object))) + attrs.append(Segids(np.array(['SYSTEM'], dtype=object))) segidx = None top = Topology(n_atoms, n_residues, n_segments, diff --git a/testsuite/MDAnalysisTests/coordinates/test_pdb.py b/testsuite/MDAnalysisTests/coordinates/test_pdb.py index d50fc1254c1..35a2c158be4 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_pdb.py +++ b/testsuite/MDAnalysisTests/coordinates/test_pdb.py @@ -300,7 +300,7 @@ def test_writer_no_icodes(self, u_no_names, outfile): def test_writer_no_segids(self, u_no_names, outfile): u_no_names.atoms.write(outfile) u = mda.Universe(outfile) - expected = np.array([' '] * u_no_names.atoms.n_atoms) + expected = np.array(['X'] * u_no_names.atoms.n_atoms) assert_equal([atom.segid for atom in u.atoms], expected) def test_writer_no_occupancies(self, u_no_names, outfile): From 0782a230aa1a98033beea86d50ae33f6b7444774 Mon Sep 17 00:00:00 2001 From: Henry Kobin Date: Wed, 7 Apr 2021 17:24:03 -0700 Subject: [PATCH 19/21] parametrized test for checking invalid chainid on write of pdb file and corrected check for empty string in pdb.py --- package/MDAnalysis/coordinates/PDB.py | 2 +- .../MDAnalysisTests/coordinates/test_pdb.py | 19 ++++--------------- 2 files changed, 5 insertions(+), 16 deletions(-) diff --git a/package/MDAnalysis/coordinates/PDB.py b/package/MDAnalysis/coordinates/PDB.py index 5c7b14aeefd..91d584fb344 100644 --- a/package/MDAnalysis/coordinates/PDB.py +++ b/package/MDAnalysis/coordinates/PDB.py @@ -1102,7 +1102,7 @@ def validate_chainids(chainids, default): missing_ids = False for (i, chainid) in enumerate(chainids): - if chainid is None: + if chainid == "": missing_ids = True chainids[i] = default elif len(chainid) > 1: diff --git a/testsuite/MDAnalysisTests/coordinates/test_pdb.py b/testsuite/MDAnalysisTests/coordinates/test_pdb.py index 35a2c158be4..41d52aa0aa1 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_pdb.py +++ b/testsuite/MDAnalysisTests/coordinates/test_pdb.py @@ -455,27 +455,16 @@ def get_MODEL_lines(filename): # test number (only last 4 digits) assert int(line[10:14]) == model % 10000 - def test_chainid_validated(self, universe3, outfile): + @pytest.mark.parametrize("bad_chainid", + ['@','','AA']) + def test_chainid_validated(self, universe3, outfile, bad_chainid): """ Check that an atom's chainID is set to 'X' if the chainID does not confirm to standards (issue #2224) """ default_id = 'X' - bad_id = '@' u = universe3 - u.atoms.chainIDs = bad_id - u.atoms.write(outfile) - u_pdb = mda.Universe(outfile) - assert_equal(u_pdb.segments.chainIDs[0][0], default_id) - bad_id = 'AA' - u = universe3 - u.atoms.chainIDs = bad_id - u.atoms.write(outfile) - u_pdb = mda.Universe(outfile) - assert_equal(u_pdb.segments.chainIDs[0][0], default_id) - bad_id = ' ' - u = universe3 - u.atoms.chainIDs = bad_id + u.atoms.chainIDs = bad_chainid u.atoms.write(outfile) u_pdb = mda.Universe(outfile) assert_equal(u_pdb.segments.chainIDs[0][0], default_id) From 9659693ff3e497850586003d8b1655aa5959c678 Mon Sep 17 00:00:00 2001 From: Henry Kobin Date: Wed, 7 Apr 2021 17:26:51 -0700 Subject: [PATCH 20/21] pep8 --- testsuite/MDAnalysisTests/coordinates/test_pdb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testsuite/MDAnalysisTests/coordinates/test_pdb.py b/testsuite/MDAnalysisTests/coordinates/test_pdb.py index 41d52aa0aa1..92b970094d4 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_pdb.py +++ b/testsuite/MDAnalysisTests/coordinates/test_pdb.py @@ -456,7 +456,7 @@ def get_MODEL_lines(filename): assert int(line[10:14]) == model % 10000 @pytest.mark.parametrize("bad_chainid", - ['@','','AA']) + ['@', '', 'AA']) def test_chainid_validated(self, universe3, outfile, bad_chainid): """ Check that an atom's chainID is set to 'X' if the chainID From 9f0998c21ee39a063113b4a3a3c57ec9d724d347 Mon Sep 17 00:00:00 2001 From: Henry Kobin Date: Wed, 7 Apr 2021 20:18:43 -0700 Subject: [PATCH 21/21] Update package/MDAnalysis/coordinates/PDB.py Co-authored-by: Lily Wang <31115101+lilyminium@users.noreply.github.com> --- package/MDAnalysis/coordinates/PDB.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package/MDAnalysis/coordinates/PDB.py b/package/MDAnalysis/coordinates/PDB.py index 91d584fb344..badf47e8e9d 100644 --- a/package/MDAnalysis/coordinates/PDB.py +++ b/package/MDAnalysis/coordinates/PDB.py @@ -1083,7 +1083,7 @@ def get_attr(attrname, default): resnames = get_attr('resnames', 'UNK') icodes = get_attr('icodes', ' ') segids = get_attr('segids', ' ') - chainids = get_attr('chainIDs', ' ') + chainids = get_attr('chainIDs', '') resids = get_attr('resids', 1) occupancies = get_attr('occupancies', 1.0) tempfactors = get_attr('tempfactors', 0.0)