From 2031e90f62c2f858704e07a4c36e46b422f183e8 Mon Sep 17 00:00:00 2001 From: Jonathan Barnoud Date: Thu, 6 Aug 2020 15:33:34 +0100 Subject: [PATCH 1/9] Add a reindex argument to the PDBWritter --- package/MDAnalysis/coordinates/PDB.py | 50 ++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 8 deletions(-) diff --git a/package/MDAnalysis/coordinates/PDB.py b/package/MDAnalysis/coordinates/PDB.py index 8b90c1bda9e..e2d9507a8e4 100644 --- a/package/MDAnalysis/coordinates/PDB.py +++ b/package/MDAnalysis/coordinates/PDB.py @@ -533,6 +533,9 @@ class PDBWriter(base.WriterBase): ChainID now comes from the last character of segid, as stated in the documentation. An indexing issue meant it previously used the first charater (Issue #2224) + .. versionchanged:: 2.0.0 + Add the "reindex" argument. + """ fmt = { 'ATOM': ( @@ -596,7 +599,7 @@ class PDBWriter(base.WriterBase): def __init__(self, filename, bonds="conect", n_atoms=None, start=0, step=1, remarks="Created by PDBWriter", - convert_units=True, multiframe=None): + convert_units=True, multiframe=None, reindex=True): """Create a new PDBWriter Parameters @@ -624,6 +627,9 @@ def __init__(self, filename, bonds="conect", n_atoms=None, start=0, step=1, ``False``: write a single frame to the file; ``True``: create a multi frame PDB file in which frames are written as MODEL_ ... ENDMDL_ records. If ``None``, then the class default is chosen. [``None``] + reindex: bool (optional) + If ``True`` (default), the atom serial is set to be consecutive + numbers starting at 1. Else, use the atom id. """ # n_atoms = None : dummy keyword argument @@ -637,6 +643,7 @@ def __init__(self, filename, bonds="conect", n_atoms=None, start=0, step=1, self.convert_units = convert_units self._multiframe = self.multiframe if multiframe is None else multiframe self.bonds = bonds + self._reindex = reindex if start < 0: raise ValueError("'Start' must be a positive value") @@ -792,26 +799,40 @@ def _write_pdb_bonds(self): if not self.obj or not hasattr(self.obj.universe, 'bonds'): return - mapping = {index: i for i, index in enumerate(self.obj.atoms.indices)} + if not self._reindex and not hasattr(self.obj.atoms, 'ids'): + return bondset = set(itertools.chain(*(a.bonds for a in self.obj.atoms))) + if self._reindex: + index_attribute = 'index' + mapping = {index: i for i, index in enumerate(self.obj.atoms.indices, start=1)} + atoms = np.sort(self.obj.atoms.indices) + else: + index_attribute = 'id' + mapping = {id_: id_ for id_ in self.obj.atoms.ids} + atoms = np.sort(self.obj.atoms.ids) if self.bonds == "conect": # Write out only the bonds that were defined in CONECT records - bonds = ((bond[0].index, bond[1].index) for bond in bondset if not bond.is_guessed) + bonds = ( + (getattr(bond[0], index_attribute), getattr(bond[1], index_attribute)) + for bond in bondset if not bond.is_guessed + ) elif self.bonds == "all": - bonds = ((bond[0].index, bond[1].index) for bond in bondset) + bonds = ( + (getattr(bond[0], index_attribute), getattr(bond[1], index_attribute)) + for bond in bondset + ) else: raise ValueError("bonds has to be either None, 'conect' or 'all'") con = collections.defaultdict(list) for a1, a2 in bonds: + print(a1, a2) if not (a1 in mapping and a2 in mapping): continue con[a2].append(a1) con[a1].append(a2) - atoms = np.sort(self.obj.atoms.indices) - conect = ([mapping[a]] + sorted([mapping[at] for at in con[a]]) for a in atoms if a in con) @@ -1042,9 +1063,22 @@ def get_attr(attrname, default): atomnames = get_attr('names', 'X') record_types = get_attr('record_types', 'ATOM') + # If reindex == False, we use the atom ids for the serial. We do not + # want to use a fallback here. + if not self._reindex: + try: + atom_ids = atoms.ids + except AttributeError: + raise NoDataError( + 'The "id" topology attribute is not set. ' + 'Either set the attribute or use reindex=True.' + ) + else: + atom_ids = np.arange(len(atoms)) + 1 + for i, atom in enumerate(atoms): vals = {} - vals['serial'] = util.ltruncate_int(i + 1, 5) # check for overflow here? + vals['serial'] = util.ltruncate_int(atom_ids[i], 5) # check for overflow here? vals['name'] = self._deduce_PDB_atom_name(atomnames[i], resnames[i]) vals['altLoc'] = altlocs[i][:1] vals['resName'] = resnames[i][:4] @@ -1153,7 +1187,7 @@ def CONECT(self, conect): """Write CONECT_ record. """ - conect = ["{0:5d}".format(entry + 1) for entry in conect] + conect = ["{0:5d}".format(entry) for entry in conect] conect = "".join(conect) self.pdbfile.write(self.fmt['CONECT'].format(conect)) From 5f989e4781a01460ec9622abe6b4ba6ce946575b Mon Sep 17 00:00:00 2001 From: Jonathan Barnoud Date: Fri, 16 Oct 2020 03:07:01 +0100 Subject: [PATCH 2/9] Tests for PDB reindex --- .../MDAnalysisTests/coordinates/test_pdb.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/testsuite/MDAnalysisTests/coordinates/test_pdb.py b/testsuite/MDAnalysisTests/coordinates/test_pdb.py index 9cd3fbc5989..0986e3d369f 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_pdb.py +++ b/testsuite/MDAnalysisTests/coordinates/test_pdb.py @@ -513,6 +513,29 @@ def test_abnormal_record_type(self, universe5, tmpdir, outfile): with pytest.raises(ValueError, match=expected_msg): u.atoms.write(outfile) + def test_no_reindex(self, universe, tmpdir): + """ + When setting the `reindex` keyword to False, the atom are + not reindexed. + """ + pdb_path = str(tmpdir / 'output.pdb') + universe.atoms.ids = universe.atoms.ids + 23 + universe.atoms.write(pdb_path, reindex=False) + read_universe = mda.Universe(pdb_path) + assert np.all(read_universe.atoms.ids == universe.atoms.ids) + + def test_reindex(self, universe, tmpdir): + """ + When setting the `reindex` keyword to True, the atom are + reindexed. + """ + pdb_path = str(tmpdir / 'output.pdb') + universe.atoms.ids = universe.atoms.ids + 23 + universe.atoms.write(pdb_path, reindex=True) + read_universe = mda.Universe(pdb_path) + # AG.ids is 1-based, while AG.indices is 0-based, hence the +1 + assert np.all(read_universe.atoms.ids == universe.atoms.indices + 1) + class TestMultiPDBReader(object): @staticmethod From 73518c3c34232489f71aed69d6aefaa984a67e27 Mon Sep 17 00:00:00 2001 From: Jonathan Barnoud Date: Fri, 16 Oct 2020 03:08:41 +0100 Subject: [PATCH 3/9] Remove extra print --- package/MDAnalysis/coordinates/PDB.py | 1 - 1 file changed, 1 deletion(-) diff --git a/package/MDAnalysis/coordinates/PDB.py b/package/MDAnalysis/coordinates/PDB.py index e2d9507a8e4..ff801da357e 100644 --- a/package/MDAnalysis/coordinates/PDB.py +++ b/package/MDAnalysis/coordinates/PDB.py @@ -827,7 +827,6 @@ def _write_pdb_bonds(self): con = collections.defaultdict(list) for a1, a2 in bonds: - print(a1, a2) if not (a1 in mapping and a2 in mapping): continue con[a2].append(a1) From 4f6d221b4521e972ed21962642ffca4edf70ed51 Mon Sep 17 00:00:00 2001 From: Jonathan Barnoud Date: Fri, 16 Oct 2020 03:38:10 +0100 Subject: [PATCH 4/9] Test PDB CONECT with reindex=False --- .../MDAnalysisTests/coordinates/test_pdb.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/testsuite/MDAnalysisTests/coordinates/test_pdb.py b/testsuite/MDAnalysisTests/coordinates/test_pdb.py index 0986e3d369f..486463a454c 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_pdb.py +++ b/testsuite/MDAnalysisTests/coordinates/test_pdb.py @@ -524,6 +524,22 @@ def test_no_reindex(self, universe, tmpdir): read_universe = mda.Universe(pdb_path) assert np.all(read_universe.atoms.ids == universe.atoms.ids) + def test_no_reindex_bonds(self, universe, tmpdir): + """ + When setting the `reindex` keyword to False, the connect + record match the non-reindexed atoms. + """ + pdb_path = str(tmpdir / 'output.pdb') + universe.atoms.ids = universe.atoms.ids + 23 + universe.atoms.write(pdb_path, reindex=False, bonds='all') + with open(pdb_path) as infile: + for line in infile: + if line.startswith('CONECT'): + assert line.strip() == "CONECT 23 24 25 26 27" + break + else: + raise AssertError('No CONECT record fond in the output.') + def test_reindex(self, universe, tmpdir): """ When setting the `reindex` keyword to True, the atom are From 7d2c04159d4402e54e59b5ce24b9ddf1b2423bea Mon Sep 17 00:00:00 2001 From: Jonathan Barnoud Date: Fri, 16 Oct 2020 03:41:17 +0100 Subject: [PATCH 5/9] Update CHANGELOG --- package/CHANGELOG | 2 ++ 1 file changed, 2 insertions(+) diff --git a/package/CHANGELOG b/package/CHANGELOG index 9844b14fc4d..c7ff9a04482 100644 --- a/package/CHANGELOG +++ b/package/CHANGELOG @@ -62,6 +62,8 @@ Fixes would create a test artifact (Issue #2979, PR #2981) Enhancements + * The PDB writer gives more control over how to write the atom ids + (Issue #1072, PR #2886) * Refactored analysis.helanal into analysis.helix_analysis (Issue #2452, PR #2622) * Improved MSD code to accept `AtomGroup` and reject `UpdatingAtomGroup` From 4ef36fe7f1d580a4bad5a590dd7316f07d924e1f Mon Sep 17 00:00:00 2001 From: Jonathan Barnoud Date: Fri, 16 Oct 2020 12:36:44 +0100 Subject: [PATCH 6/9] More precise version change for PDB reindex --- package/MDAnalysis/coordinates/PDB.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/package/MDAnalysis/coordinates/PDB.py b/package/MDAnalysis/coordinates/PDB.py index ff801da357e..678a753d126 100644 --- a/package/MDAnalysis/coordinates/PDB.py +++ b/package/MDAnalysis/coordinates/PDB.py @@ -534,7 +534,8 @@ class PDBWriter(base.WriterBase): An indexing issue meant it previously used the first charater (Issue #2224) .. versionchanged:: 2.0.0 - Add the "reindex" argument. + Add the `redindex` argument. Setting this keyword to ``True`` + (the default) preserves the behavior in earlier versions of MDAnalysis. """ fmt = { From e009799e4009ed48764e6a8bf99b26d16954542c Mon Sep 17 00:00:00 2001 From: Jonathan Barnoud Date: Fri, 16 Oct 2020 12:39:15 +0100 Subject: [PATCH 7/9] Fix long lines --- package/MDAnalysis/coordinates/PDB.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/package/MDAnalysis/coordinates/PDB.py b/package/MDAnalysis/coordinates/PDB.py index 678a753d126..68b8e75e52b 100644 --- a/package/MDAnalysis/coordinates/PDB.py +++ b/package/MDAnalysis/coordinates/PDB.py @@ -806,7 +806,10 @@ def _write_pdb_bonds(self): bondset = set(itertools.chain(*(a.bonds for a in self.obj.atoms))) if self._reindex: index_attribute = 'index' - mapping = {index: i for i, index in enumerate(self.obj.atoms.indices, start=1)} + mapping = { + index: i + for i, index in enumerate(self.obj.atoms.indices, start=1) + } atoms = np.sort(self.obj.atoms.indices) else: index_attribute = 'id' @@ -815,12 +818,18 @@ def _write_pdb_bonds(self): if self.bonds == "conect": # Write out only the bonds that were defined in CONECT records bonds = ( - (getattr(bond[0], index_attribute), getattr(bond[1], index_attribute)) + ( + getattr(bond[0], index_attribute), + getattr(bond[1], index_attribute), + ) for bond in bondset if not bond.is_guessed ) elif self.bonds == "all": bonds = ( - (getattr(bond[0], index_attribute), getattr(bond[1], index_attribute)) + ( + getattr(bond[0], index_attribute), + getattr(bond[1], index_attribute), + ) for bond in bondset ) else: From 57605b0b70fd93dac6ac421430bfbde66fa09aa8 Mon Sep 17 00:00:00 2001 From: Jonathan Barnoud Date: Sat, 17 Oct 2020 08:55:03 +0100 Subject: [PATCH 8/9] Increase test coverage for #2886 Also silence some warnings for the tests in the PR. --- .../MDAnalysisTests/coordinates/test_pdb.py | 52 ++++++++++++++----- 1 file changed, 40 insertions(+), 12 deletions(-) diff --git a/testsuite/MDAnalysisTests/coordinates/test_pdb.py b/testsuite/MDAnalysisTests/coordinates/test_pdb.py index 486463a454c..3d4b2c9737a 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_pdb.py +++ b/testsuite/MDAnalysisTests/coordinates/test_pdb.py @@ -42,6 +42,8 @@ assert_array_almost_equal, assert_almost_equal) +IGNORE_NO_INFORMATION_WARNING = 'ignore:Found no information for attr:UserWarning' + class TestPDBReader(_SingleFrameReader): __test__ = True @@ -218,6 +220,22 @@ def universe_and_expected_dims(self, request): def outfile(self, tmpdir): return str(tmpdir.mkdir("PDBWriter").join('primitive-pdb-writer' + self.ext)) + @pytest.fixture + def u_no_ids(self): + # The test universe does not have atom ids, but it has everything + # else the PDB writer expects to avoid issuing warnings. + universe = make_Universe( + [ + 'names', 'resids', 'resnames', 'altLocs', + 'segids', 'occupancies', 'tempfactors', + ], + trajectory=True, + ) + universe.add_TopologyAttr('icodes', [' '] * len(universe.residues)) + universe.add_TopologyAttr('record_types', ['ATOM'] * len(universe.atoms)) + universe.dimensions = [10, 10, 10, 90, 90, 90] + return universe + @pytest.fixture def u_no_resnames(self): return make_Universe(['names', 'resids'], trajectory=True) @@ -513,26 +531,26 @@ def test_abnormal_record_type(self, universe5, tmpdir, outfile): with pytest.raises(ValueError, match=expected_msg): u.atoms.write(outfile) - def test_no_reindex(self, universe, tmpdir): + @pytest.mark.filterwarnings(IGNORE_NO_INFORMATION_WARNING) + def test_no_reindex(self, universe, outfile): """ When setting the `reindex` keyword to False, the atom are not reindexed. """ - pdb_path = str(tmpdir / 'output.pdb') universe.atoms.ids = universe.atoms.ids + 23 - universe.atoms.write(pdb_path, reindex=False) - read_universe = mda.Universe(pdb_path) + universe.atoms.write(outfile, reindex=False) + read_universe = mda.Universe(outfile) assert np.all(read_universe.atoms.ids == universe.atoms.ids) - def test_no_reindex_bonds(self, universe, tmpdir): + @pytest.mark.filterwarnings(IGNORE_NO_INFORMATION_WARNING) + def test_no_reindex_bonds(self, universe, outfile): """ When setting the `reindex` keyword to False, the connect record match the non-reindexed atoms. """ - pdb_path = str(tmpdir / 'output.pdb') universe.atoms.ids = universe.atoms.ids + 23 - universe.atoms.write(pdb_path, reindex=False, bonds='all') - with open(pdb_path) as infile: + universe.atoms.write(outfile, reindex=False, bonds='all') + with open(outfile) as infile: for line in infile: if line.startswith('CONECT'): assert line.strip() == "CONECT 23 24 25 26 27" @@ -540,18 +558,28 @@ def test_no_reindex_bonds(self, universe, tmpdir): else: raise AssertError('No CONECT record fond in the output.') - def test_reindex(self, universe, tmpdir): + @pytest.mark.filterwarnings(IGNORE_NO_INFORMATION_WARNING) + def test_reindex(self, universe, outfile): """ When setting the `reindex` keyword to True, the atom are reindexed. """ - pdb_path = str(tmpdir / 'output.pdb') universe.atoms.ids = universe.atoms.ids + 23 - universe.atoms.write(pdb_path, reindex=True) - read_universe = mda.Universe(pdb_path) + universe.atoms.write(outfile, reindex=True) + read_universe = mda.Universe(outfile) # AG.ids is 1-based, while AG.indices is 0-based, hence the +1 assert np.all(read_universe.atoms.ids == universe.atoms.indices + 1) + def test_no_reindex_missing_ids(self, u_no_ids, outfile): + """ + When setting `reindex` to False, if there is no AG.ids, + then an exception is raised. + """ + # Making sure AG.ids is indeed missing + assert not hasattr(u_no_ids.atoms, 'ids') + with pytest.raises(mda.exceptions.NoDataError): + u_no_ids.atoms.write(outfile, reindex=False) + class TestMultiPDBReader(object): @staticmethod From 7626052fd4d4554acda2dced2958335f1ffc6f7b Mon Sep 17 00:00:00 2001 From: Jonathan Barnoud Date: Sat, 17 Oct 2020 08:56:53 +0100 Subject: [PATCH 9/9] Remove useless branch in PDB.py There was a test in PDBWriter._write_pdb_bonds about writing the bonds for an atomgroup that does not have AG.ids but asking to not reindex. The test was silencing the problem and would lead to the bonds not being written. However, that part of the code would never be reached because writing the atoms would have raised an exception before that method is called. This commit removes this test. Now, if that part of the code is called (which should not happen), an exception is raised when the method tries to access AG.ids. --- package/MDAnalysis/coordinates/PDB.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/package/MDAnalysis/coordinates/PDB.py b/package/MDAnalysis/coordinates/PDB.py index 68b8e75e52b..3dd15e1a380 100644 --- a/package/MDAnalysis/coordinates/PDB.py +++ b/package/MDAnalysis/coordinates/PDB.py @@ -800,9 +800,6 @@ def _write_pdb_bonds(self): if not self.obj or not hasattr(self.obj.universe, 'bonds'): return - if not self._reindex and not hasattr(self.obj.atoms, 'ids'): - return - bondset = set(itertools.chain(*(a.bonds for a in self.obj.atoms))) if self._reindex: index_attribute = 'index'