diff --git a/package/CHANGELOG b/package/CHANGELOG index 652c485e043..c12b68350ce 100644 --- a/package/CHANGELOG +++ b/package/CHANGELOG @@ -20,6 +20,10 @@ The rules for this file: * 2.0.0 Fixes + * PDB parser now assigns empty records for partially missing/invalid elements + instead of not assigning the elements attribute at all (Issue #2422) + * PDB writer now uses elements attribute instead of guessing elements from + atom name (Issue #2423) * Cleanup and parametrization of test_atomgroup.py (Issue #2995) * The methods provided by topology attributes now appear in the documentation (Issue #1845) diff --git a/package/MDAnalysis/coordinates/PDB.py b/package/MDAnalysis/coordinates/PDB.py index 3dd15e1a380..a9a377b7e86 100644 --- a/package/MDAnalysis/coordinates/PDB.py +++ b/package/MDAnalysis/coordinates/PDB.py @@ -1033,6 +1033,9 @@ def _write_timestep(self, ts, multiframe=False): When only :attr:`record_types` attribute is present, instead of using ATOM_ for both ATOM_ and HETATM_, HETATM_ record types are properly written out (Issue #1753). + Writing now only uses the contents of the elements attribute + instead of guessing by default. If the elements are missing, + empty records are written out (Issue #2423). """ atoms = self.obj.atoms @@ -1067,6 +1070,7 @@ def get_attr(attrname, default): occupancies = get_attr('occupancies', 1.0) tempfactors = get_attr('tempfactors', 0.0) atomnames = get_attr('names', 'X') + elements = get_attr('elements', ' ') record_types = get_attr('record_types', 'ATOM') # If reindex == False, we use the atom ids for the serial. We do not @@ -1095,7 +1099,7 @@ def get_attr(attrname, default): vals['occupancy'] = occupancies[i] vals['tempFactor'] = tempfactors[i] vals['segID'] = segids[i][:4] - vals['element'] = guess_atom_element(atomnames[i].strip())[:2] + vals['element'] = elements[i][:2].upper() # record_type attribute, if exists, can be ATOM or HETATM try: diff --git a/package/MDAnalysis/topology/PDBParser.py b/package/MDAnalysis/topology/PDBParser.py index 38a03f3686f..468d04dc985 100644 --- a/package/MDAnalysis/topology/PDBParser.py +++ b/package/MDAnalysis/topology/PDBParser.py @@ -35,15 +35,13 @@ :mod:`~MDAnalysis.topology.ExtendedPDBParser`) that can handle residue numbers up to 99,999. -TODO: - Add attributes to guess elements for non-physical or missing elements .. Note:: The parser processes atoms and their names. Masses are guessed and set to 0 if unknown. Partial charges are not set. Elements are parsed if they are - valid. + valid. If partially missing or incorrect, empty records are assigned. See Also -------- @@ -183,7 +181,9 @@ class PDBParser(TopologyReaderBase): .. versionchanged:: 1.0.0 Added parsing of valid Elements .. versionchanged:: 2.0.0 - Bonds attribute is not added if no bonds are present in PDB file + Bonds attribute is not added if no bonds are present in PDB file. + If elements are invalid or partially missing, empty elements records + are now assigned (Issue #2422). """ format = ['PDB', 'ENT'] @@ -220,7 +220,6 @@ def _parseatoms(self): icodes = [] tempfactors = [] occupancies = [] - atomtypes = [] resids = [] resnames = [] @@ -283,7 +282,6 @@ def _parseatoms(self): tempfactors.append(float_or_default(line[60:66], 1.0)) # AKA bfactor segids.append(line[66:76].strip()) - atomtypes.append(line[76:78].strip()) # Warn about wrapped serials if self._wrapped_serials: @@ -311,28 +309,33 @@ def _parseatoms(self): # Guessed attributes # masses from types if they exist # OPT: We do this check twice, maybe could refactor to avoid this - if not any(atomtypes): + if not any(elements): atomtypes = guess_types(names) attrs.append(Atomtypes(atomtypes, guessed=True)) + warnings.warn("Element information is missing, elements attribute " + "will not be populated. If needed these can be " + "guessed using MDAnalysis.topology.guessers.") else: - attrs.append(Atomtypes(np.array(atomtypes, dtype=object))) + # Feed atomtypes as raw element column, but validate elements + atomtypes = elements + attrs.append(Atomtypes(np.array(elements, dtype=object))) + + validated_elements = [] + for elem in elements: + if elem.capitalize() in SYMB2Z: + validated_elements.append(elem.capitalize()) + else: + wmsg = (f"Unknown element {elem} found for some atoms. " + f"These have been given an empty element record. " + f"If needed they can be guessed using " + f"MDAnalysis.topology.guessers.") + warnings.warn(wmsg) + validated_elements.append('') + attrs.append(Elements(np.array(validated_elements, dtype=object))) masses = guess_masses(atomtypes) attrs.append(Masses(masses, guessed=True)) - # Getting element information from element column. - if all(elements): - element_set = set(i.capitalize() for i in set(elements)) - if all(element in SYMB2Z for element in element_set): - element_list = [i.capitalize() for i in elements] - attrs.append(Elements(np.array(element_list, dtype=object))) - else: - warnings.warn("Invalid elements found in the PDB file, " - "elements attributes will not be populated.") - else: - warnings.warn("Element information is absent or missing for a few " - "atoms. Elements attributes will not be populated.") - # Residue level stuff from here resids = np.array(resids, dtype=np.int32) resnames = np.array(resnames, dtype=object) diff --git a/testsuite/MDAnalysisTests/analysis/test_hole2.py b/testsuite/MDAnalysisTests/analysis/test_hole2.py index c4c63d161cd..380eceba95e 100644 --- a/testsuite/MDAnalysisTests/analysis/test_hole2.py +++ b/testsuite/MDAnalysisTests/analysis/test_hole2.py @@ -301,16 +301,11 @@ def test_context_manager(self, universe, tmpdir): def test_output_level(self, tmpdir, universe): with tmpdir.as_cwd(): - with pytest.warns(UserWarning) as rec: + with pytest.warns(UserWarning, match='needs to be < 3'): h = hole2.HoleAnalysis(universe, output_level=100) h.run(start=self.start, stop=self.stop, random_seed=self.random_seed) - assert len(rec) == 5 - - assert any('needs to be < 3' in r.message.args[0] for r in rec) - assert any('has no dt information' in r.message.args[0] for r in rec) # 2x - assert any('Unit cell dimensions not found.' in r.message.args[0] for r in rec) # 2x # no profiles assert len(h.profiles) == 0 diff --git a/testsuite/MDAnalysisTests/coordinates/test_pdb.py b/testsuite/MDAnalysisTests/coordinates/test_pdb.py index 3d4b2c9737a..445d8ec35a0 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_pdb.py +++ b/testsuite/MDAnalysisTests/coordinates/test_pdb.py @@ -45,6 +45,16 @@ IGNORE_NO_INFORMATION_WARNING = 'ignore:Found no information for attr:UserWarning' +@pytest.fixture +def dummy_universe_without_elements(): + n_atoms = 5 + u = make_Universe(size=(n_atoms, 1, 1), trajectory=True) + u.add_TopologyAttr('resnames', ['RES']) + u.add_TopologyAttr('names', ['C1', 'O2', 'N3', 'S4', 'NA']) + u.dimensions = [42, 42, 42, 90, 90, 90] + return u + + class TestPDBReader(_SingleFrameReader): __test__ = True @@ -1028,7 +1038,7 @@ def test_atomname_alignment(self, writtenstuff): def test_atomtype_alignment(self, writtenstuff): result_line = ("ATOM 1 H5T GUA A 1 7.974 6.430 9.561" - " 1.00 0.00 RNAA H\n") + " 1.00 0.00 RNAA \n") assert_equal(writtenstuff[9], result_line) @@ -1161,6 +1171,63 @@ def test_partially_missing_cryst(): assert_array_almost_equal(u.dimensions, 0.0) +@pytest.mark.filterwarnings(IGNORE_NO_INFORMATION_WARNING) +def test_write_no_atoms_elements(dummy_universe_without_elements): + """ + If no element symbols are provided, the PDB writer guesses. + """ + destination = StringIO() + with mda.coordinates.PDB.PDBWriter(destination) as writer: + writer.write(dummy_universe_without_elements.atoms) + content = destination.getvalue() + element_symbols = [ + line[76:78].strip() + for line in content.splitlines() + if line[:6] == 'ATOM ' + ] + expectation = ['', '', '', '', ''] + assert element_symbols == expectation + + +@pytest.mark.filterwarnings(IGNORE_NO_INFORMATION_WARNING) +def test_write_atom_elements(dummy_universe_without_elements): + """ + If element symbols are provided, they are used when writing the file. + + See `Issue 2423 `_. + """ + elems = ['S', 'O', '', 'C', 'Na'] + expectation = ['S', 'O', '', 'C', 'NA'] + dummy_universe_with_elements = dummy_universe_without_elements + dummy_universe_with_elements.add_TopologyAttr('elements', elems) + destination = StringIO() + with mda.coordinates.PDB.PDBWriter(destination) as writer: + writer.write(dummy_universe_without_elements.atoms) + content = destination.getvalue() + element_symbols = [ + line[76:78].strip() + for line in content.splitlines() + if line[:6] == 'ATOM ' + ] + assert element_symbols == expectation + + +def test_elements_roundtrip(tmpdir): + """ + Roundtrip test for PDB elements reading/writing. + """ + u = mda.Universe(CONECT) + elements = u.atoms.elements + + outfile = os.path.join(str(tmpdir), 'elements.pdb') + with mda.coordinates.PDB.PDBWriter(outfile) as writer: + writer.write(u.atoms) + + u_written = mda.Universe(outfile) + + assert_equal(elements, u_written.atoms.elements) + + def test_cryst_meaningless_warning(): # issue 2599 # FIXME: This message might change with Issue #2698 diff --git a/testsuite/MDAnalysisTests/topology/test_pdb.py b/testsuite/MDAnalysisTests/topology/test_pdb.py index 3f7f76a7272..28e2abfaf29 100644 --- a/testsuite/MDAnalysisTests/topology/test_pdb.py +++ b/testsuite/MDAnalysisTests/topology/test_pdb.py @@ -23,7 +23,6 @@ from io import StringIO import pytest -import warnings import numpy as np from numpy.testing import assert_equal import MDAnalysis as mda @@ -66,10 +65,12 @@ ("10267", 10267) ] + @pytest.mark.parametrize('hybrid, integer', hybrid36) def test_hy36decode(hybrid, integer): assert mda.topology.PDBParser.hy36decode(5, hybrid) == integer + class PDBBase(ParserBase): expected_attrs = ['ids', 'names', 'record_types', 'resids', 'resnames', 'altLocs', 'icodes', 'occupancies', @@ -205,6 +206,7 @@ def test_PDB_record_types(): ENDMDL """ + def test_PDB_no_resid(): u = mda.Universe(StringIO(PDB_noresid), format='PDB') @@ -213,6 +215,7 @@ def test_PDB_no_resid(): # should have default resid of 1 assert u.residues[0].resid == 1 + PDB_hex = """\ REMARK For testing reading of hex atom numbers REMARK This has MODELs then hex atom numbers entries @@ -226,6 +229,7 @@ def test_PDB_no_resid(): END """ + def test_PDB_hex(): u = mda.Universe(StringIO(PDB_hex), format='PDB') assert len(u.atoms) == 5 @@ -298,37 +302,25 @@ def test_PDB_elements(): assert_equal(u.atoms.elements, element_list) -PDB_missing_ele = """\ -REMARK For testing warnings in case of absent element information -ATOM 1 N ASN A 1 -8.901 4.127 -0.555 1.00 0.00 -ATOM 2 CA ASN A 1 -8.608 3.135 -1.618 1.00 0.00 -ATOM 3 C ASN A 1 -7.117 2.964 -1.897 1.00 0.00 -ATOM 4 O ASN A 1 -6.634 1.849 -1.758 1.00 0.00 -TER 5 -HETATM 6 CU CU A 2 03.000 00.000 00.000 1.00 00.00 -HETATM 7 FE FE A 3 00.000 03.000 00.000 1.00 00.00 -HETATM 8 Mg Mg A 4 03.000 03.000 03.000 1.00 00.00 -TER 9 -""" - +def test_missing_elements_noattribute(): + """Check that: -def test_missing_elements_warnings(): - """The test checks whether it returns the appropriate warning on - encountering missing elements. + 1) a warning is raised if elements are missing + 2) the elements attribute is not set """ - with pytest.warns(UserWarning) as record: - u = mda.Universe(StringIO(PDB_missing_ele), format='PDB') - - assert len(record) == 1 - assert record[0].message.args[0] == "Element information is absent or "\ - "missing for a few atoms. Elements attributes will not be populated." + wmsg = ("Element information is missing, elements attribute will not be " + "populated") + with pytest.warns(UserWarning, match=wmsg): + u = mda.Universe(PDB_small) + with pytest.raises(AttributeError): + _ = u.atoms.elements PDB_wrong_ele = """\ REMARK For testing warnings of wrong elements REMARK This file represent invalid elements in the elements column ATOM 1 N ASN A 1 -8.901 4.127 -0.555 1.00 0.00 N -ATOM 2 CA ASN A 1 -8.608 3.135 -1.618 1.00 0.00 C +ATOM 2 CA ASN A 1 -8.608 3.135 -1.618 1.00 0.00 ATOM 3 C ASN A 1 -7.117 2.964 -1.897 1.00 0.00 C ATOM 4 O ASN A 1 -6.634 1.849 -1.758 1.00 0.00 O ATOM 5 X ASN A 1 -9.437 3.396 -2.889 1.00 0.00 XX @@ -344,12 +336,12 @@ def test_wrong_elements_warnings(): """The test checks whether there are invalid elements in the elements column which have been parsed and returns an appropriate warning. """ - with pytest.warns(UserWarning) as record: + with pytest.warns(UserWarning, match='Unknown element XX found'): u = mda.Universe(StringIO(PDB_wrong_ele), format='PDB') - - assert len(record) == 2 - assert record[1].message.args[0] == "Invalid elements found in the PDB "\ - "file, elements attributes will not be populated." + + expected = np.array(['N', '', 'C', 'O', '', 'Cu', 'Fe', 'Mg'], + dtype=object) + assert_equal(u.atoms.elements, expected) def test_nobonds_error():