diff --git a/package/AUTHORS b/package/AUTHORS index 26b02cfba1f..3dd116e9068 100644 --- a/package/AUTHORS +++ b/package/AUTHORS @@ -252,6 +252,8 @@ Chronological list of authors - Lexi Xu - BHM-Bob G - Yu-Yuan (Stuart) Yang + - James Rowe + External code ------------- diff --git a/package/CHANGELOG b/package/CHANGELOG index faeaf76b598..df2ed59c545 100644 --- a/package/CHANGELOG +++ b/package/CHANGELOG @@ -28,7 +28,8 @@ Fixes (Issue #4948, PR #5001) Enhancements - + * Improve parsing of topology information from LAMMPS dump files to allow + reading of mass, charge and element attributes. (Issue #3449, PR #4995) Changes Deprecations diff --git a/package/MDAnalysis/topology/LAMMPSParser.py b/package/MDAnalysis/topology/LAMMPSParser.py index c7cdcea30b1..6c3899b401b 100644 --- a/package/MDAnalysis/topology/LAMMPSParser.py +++ b/package/MDAnalysis/topology/LAMMPSParser.py @@ -98,12 +98,15 @@ Bonds, Charges, Dihedrals, + Elements, Impropers, Masses, Resids, Resnums, Segids, ) +from ..guesser.tables import SYMB2Z +from ..guesser.tables import masses as mass_table logger = logging.getLogger("MDAnalysis.topology.LAMMPS") @@ -602,14 +605,36 @@ def _parse_box(self, header): return unitcell +# Define the headers that define topology information from the lammps file +# Any headers outside of this set will be ignored +DUMP_HEADERS = { + "id": {"attr_class": Atomids, "default": None, "dtype": np.int32}, + "mol": {"attr_class": [Resids, Resnums], "default": 1, "dtype": int}, + "type": {"attr_class": Atomtypes, "default": 1, "dtype": object}, + "mass": {"attr_class": Masses, "default": 1.0, "dtype": np.float64}, + "element": {"attr_class": Elements, "default": None, "dtype": object}, + "q": {"attr_class": Charges, "default": None, "dtype": np.float32}, +} + + class LammpsDumpParser(TopologyReaderBase): - """Parses Lammps ascii dump files in 'atom' format. + """Parses Lammps ascii dump files. - Sets all masses to 1.0. + id, mol, type, mass, element and q columns are read + and used to set the corresponding topology attributes. + + All other columns are ignored. + + If masses are not provided they will attempt to be guessed + from element names. If no element names are provided then all masses + will be set to 1.0. - .. versionchanged:: 2.0.0 .. versionadded:: 0.19.0 + .. versionchanged:: 2.0.0 + Allow for a more flexible column layout + .. versionchanged:: 2.10.0 + Allow reading of mass, charge and element attributes """ format = "LAMMPSDUMP" @@ -627,33 +652,86 @@ def parse(self, **kwargs): fin.readline() # y fin.readline() # z - indices = np.zeros(natoms, dtype=int) - types = np.zeros(natoms, dtype=object) - + # Next line contains the headers for the atom data atomline = fin.readline() # ITEM ATOMS attrs = atomline.split()[2:] # attributes on coordinate line - col_ids = {attr: i for i, attr in enumerate(attrs)} # column ids - + col_ids = { + attr: i for i, attr in enumerate(attrs) if attr in DUMP_HEADERS + } # column ids + if "id" not in col_ids: + raise ValueError("No id column found in dump file") + if "mass" not in col_ids: + if "element" in col_ids: + warnings.warn( + "No mass column found in dump file. " + "Using guessed masses from element info." + ) + else: + warnings.warn("Guessed all Masses to 1.0") + if "type" not in col_ids: + warnings.warn("Set all atom types to 1") + + atom_data = {} + # Initialize the atom data arrays + for header in DUMP_HEADERS: + if header in col_ids: + atom_data[header] = np.zeros( + natoms, dtype=DUMP_HEADERS[header]["dtype"] + ) + elif DUMP_HEADERS[header]["default"] is not None: + atom_data[header] = np.full( + natoms, + DUMP_HEADERS[header]["default"], + dtype=DUMP_HEADERS[header]["dtype"], + ) + # Read the atom data for i in range(natoms): fields = fin.readline().split() + for header, col_id in col_ids.items(): + atom_data[header][i] = fields[col_id] + + # Check for valid elements and set masses by element if not given + if "element" in col_ids: + validated_elements = [] + for elem in atom_data["element"]: + if elem.capitalize() in SYMB2Z: + validated_elements.append(elem.capitalize()) + else: + wmsg = ( + f"Unknown element {elem} found for some atoms. " + f"These have been given an empty element record. " + ) + warnings.warn(wmsg) + validated_elements.append("") + atom_data["element"] = np.array( + validated_elements, dtype=DUMP_HEADERS["element"]["dtype"] + ) + if "mass" not in col_ids: + for i, elem in enumerate(validated_elements): + try: + atom_data["mass"][i] = mass_table[elem] + except KeyError: + atom_data["mass"][i] = 1.0 - indices[i] = fields[col_ids["id"]] - types[i] = fields[col_ids["type"]] - - order = np.argsort(indices) - indices = indices[order] - types = types[order] + # Reorder the atom data by id + order = np.argsort(atom_data["id"]) attrs = [] - attrs.append(Atomids(indices)) - attrs.append(Atomtypes(types)) - attrs.append(Masses(np.ones(natoms, dtype=np.float64), guessed=True)) - warnings.warn("Guessed all Masses to 1.0") - attrs.append(Resids(np.array([1], dtype=int))) - attrs.append(Resnums(np.array([1], dtype=int))) - attrs.append(Segids(np.array(["SYSTEM"], dtype=object))) + for key, value in atom_data.items(): + if key == "mol": + # Get the number of unique residues + # and the assignment of each atom to a residue + residx, resids = squash_by(value[order])[:2] + n_residues = len(resids) + for attr_class in DUMP_HEADERS[key]["attr_class"]: + attrs.append(attr_class(resids)) + else: + attrs.append(DUMP_HEADERS[key]["attr_class"](value[order])) - return Topology(natoms, 1, 1, attrs=attrs) + attrs.append(Segids(np.array(["SYSTEM"], dtype=object))) + return Topology( + natoms, n_residues, 1, attrs=attrs, atom_resindex=residx + ) @functools.total_ordering diff --git a/testsuite/MDAnalysisTests/data/lammps/mass_q_elem.lammpstrj b/testsuite/MDAnalysisTests/data/lammps/mass_q_elem.lammpstrj new file mode 100644 index 00000000000..ba6ebaec432 --- /dev/null +++ b/testsuite/MDAnalysisTests/data/lammps/mass_q_elem.lammpstrj @@ -0,0 +1,39 @@ +ITEM: TIMESTEP +0 +ITEM: NUMBER OF ATOMS +30 +ITEM: BOX BOUNDS pp pp pp +-1.6350000000000001e-01 3.8836500000000001e+01 +-1.7050000000000001e-01 3.8829500000000003e+01 +-1.1550000000000001e-01 9.4884500000000003e+01 +ITEM: ATOMS id mol type x y z element q proc mass +1 1 9 6.10782 11.4384 0.798271 C -0.27 0 12.011 +2 1 6 5.73938 10.7721 1.60738 H 0.09 0 1.008 +3 1 6 5.36179 11.4598 -0.0247166 H 0.09 0 1.008 +4 1 6 6.22277 12.4656 1.20551 H 0.09 0 1.008 +5 1 7 7.41283 10.9555 0.293156 C 0.51 0 12.011 +6 1 15 8.43683 11.6139 0.447366 O -0.51 0 15.9994 +25 2 5 11.6777 7.51021 91.0484 H 0.09 0 1.008 +7 1 14 7.41086 9.75757 94.689 N -0.47 0 14.007 +8 1 1 6.55183 9.26307 94.5816 H 0.31 0 1.008 +11 1 3 9.37844 9.17237 94.7817 H 0.09 0 1.008 +16 2 5 10.6148 7.68449 93.1683 H 0.09 0 1.008 +10 1 3 8.30811 8.18016 93.7248 H 0.09 0 1.008 +9 1 8 8.58568 9.17251 94.0482 C -0.02 0 12.011 +12 1 7 9.07397 9.91809 92.8201 C 0.51 0 12.011 +13 1 15 8.35715 10.7673 92.2861 O -0.51 0 15.9994 +14 2 13 10.274 9.63057 92.3198 N -0.29 0 14.007 +15 2 12 11.1655 8.59754 92.8563 C 0 0 12.011 +17 2 5 11.7158 9.02714 93.7206 H 0.09 0 1.008 +18 2 10 10.7877 10.2091 91.0804 C 0.02 0 12.011 +19 2 2 10.8053 11.2774 91.238 H 0.09 0 1.008 +20 2 11 12.2025 9.60943 90.953 C -0.18 0 12.011 +21 2 5 12.9179 10.2786 91.4772 H 0.09 0 1.008 +22 2 5 12.5386 9.49027 89.9008 H 0.09 0 1.008 +23 2 11 12.127 8.282 91.7093 C -0.18 0 12.011 +24 2 5 13.1195 7.93535 92.0685 H 0.09 0 1.008 +26 2 7 9.90917 9.92876 89.854 C 0.51 0 12.011 +27 2 15 9.25139 8.88177 89.8474 O -0.51 0 15.9994 +28 3 13 9.84845 10.7872 88.8235 N -0.29 0 14.007 +29 3 12 10.561 12.0708 88.7803 C 0 0 12.011 +30 3 5 10.318 12.7002 89.6629 H 0.09 0 1.008 \ No newline at end of file diff --git a/testsuite/MDAnalysisTests/data/lammps/nomass_elemx.lammpstrj b/testsuite/MDAnalysisTests/data/lammps/nomass_elemx.lammpstrj new file mode 100644 index 00000000000..93c7a5d30af --- /dev/null +++ b/testsuite/MDAnalysisTests/data/lammps/nomass_elemx.lammpstrj @@ -0,0 +1,14 @@ +ITEM: TIMESTEP +0 +ITEM: NUMBER OF ATOMS +5 +ITEM: BOX BOUNDS pp pp pp +-1.6350000000000001e-01 3.8836500000000001e+01 +-1.7050000000000001e-01 3.8829500000000003e+01 +-1.1550000000000001e-01 9.4884500000000003e+01 +ITEM: ATOMS id mol type x y z q element +1 1 9 6.10782 11.4384 0.798271 -0.27 C +2 1 6 5.73938 10.7721 1.60738 0.09 X +3 1 6 5.36179 11.4598 -0.0247166 0.09 H +4 1 6 6.22277 12.4656 1.20551 0.09 H +5 1 7 7.41283 10.9555 0.293156 0.51 C \ No newline at end of file diff --git a/testsuite/MDAnalysisTests/datafiles.py b/testsuite/MDAnalysisTests/datafiles.py index e9ef5a04682..7cdd9166ecd 100644 --- a/testsuite/MDAnalysisTests/datafiles.py +++ b/testsuite/MDAnalysisTests/datafiles.py @@ -262,6 +262,8 @@ "LAMMPSdata_additional_columns", # structure for the additional column lammpstrj "LAMMPSDUMP", "LAMMPSDUMP_long", # lammpsdump file with a few zeros sprinkled in the first column first frame + "LAMMPSDUMP_allinfo", # lammpsdump file with resids, masses, charges and element symbols + "LAMMPSDUMP_nomass_elemx", # lammps dump file with no masses, but with element symbols and one symbol 'X' "LAMMPSDUMP_allcoords", # lammpsdump file with all coordinate conventions (x,xs,xu,xsu) present, from LAMMPS rdf example "LAMMPSDUMP_nocoords", # lammpsdump file with no coordinates "LAMMPSDUMP_triclinic", # lammpsdump file to test triclinic dimension parsing, albite with most atoms deleted @@ -753,6 +755,10 @@ LAMMPSdata_PairIJ = (_data_ref / "lammps/pairij_coeffs.data.bz2").as_posix() LAMMPSDUMP = (_data_ref / "lammps/wat.lammpstrj.bz2").as_posix() LAMMPSDUMP_long = (_data_ref / "lammps/wat.lammpstrj_long.bz2").as_posix() +LAMMPSDUMP_allinfo = (_data_ref / "lammps/mass_q_elem.lammpstrj").as_posix() +LAMMPSDUMP_nomass_elemx = ( + _data_ref / "lammps/nomass_elemx.lammpstrj" +).as_posix() LAMMPSDUMP_allcoords = ( _data_ref / "lammps/spce_all_coords.lammpstrj.bz2" ).as_posix() diff --git a/testsuite/MDAnalysisTests/topology/test_lammpsdata.py b/testsuite/MDAnalysisTests/topology/test_lammpsdata.py index c5f087b89b2..d384941237a 100644 --- a/testsuite/MDAnalysisTests/topology/test_lammpsdata.py +++ b/testsuite/MDAnalysisTests/topology/test_lammpsdata.py @@ -36,6 +36,8 @@ LAMMPSdata_deletedatoms, LAMMPSDUMP, LAMMPSDUMP_long, + LAMMPSDUMP_allinfo, + LAMMPSDUMP_nomass_elemx, LAMMPSdata_PairIJ, ) @@ -306,52 +308,180 @@ def test_interpret_atom_style_missing(): ) -class TestDumpParser(ParserBase): +class LammpsDumpBase(ParserBase): + """Tests the reading of lammps dump files for topology information.""" + expected_attrs = ["types", "masses"] - expected_n_atoms = 24 - expected_n_residues = 1 expected_n_segments = 1 parser = mda.topology.LAMMPSParser.LammpsDumpParser - ref_filename = LAMMPSDUMP def test_creates_universe(self): u = mda.Universe(self.ref_filename, format="LAMMPSDUMP") assert isinstance(u, mda.Universe) - assert len(u.atoms) == 24 + assert len(u.atoms) == self.expected_n_atoms - def test_masses_warning(self): - # masses are mandatory, but badly guessed - # check that user is alerted - with self.parser(self.ref_filename) as p: - with pytest.warns(UserWarning, match="Guessed all Masses to 1.0"): - p.parse() + def test_masses(self, top): + assert_allclose(top.masses.values[:7], self.expected_masses) def test_guessed_attributes(self, filename): u = mda.Universe(filename, format="LAMMPSDUMP") for attr in self.guessed_attrs: assert hasattr(u.atoms, attr) - def test_id_ordering(self): + def test_id_ordering(self, top): # ids are nonsequential in file, but should get rearranged - u = mda.Universe(self.ref_filename, format="LAMMPSDUMP") - # the 4th in file has id==13, but should have been sorted - assert u.atoms[3].id == 4 + # Check that the ids are in order + assert_equal(top.ids.values, np.arange(1, self.expected_n_atoms + 1)) - def test_guessed_masses(self, filename): - u = mda.Universe(filename, format="LAMMPSDUMP") - expected = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] - assert_allclose(u.atoms.masses[:7], expected) + def test_types(self, top): + assert (top.types.values[:7] == self.expected_types).all() - def test_guessed_types(self, filename): - u = mda.Universe(filename, format="LAMMPSDUMP") - expected = ["2", "1", "1", "2", "1", "1", "2"] - assert (u.atoms.types[:7] == expected).all() + def test_elements(self, top): + if self.expected_elements: + assert (top.elements.values[:7] == self.expected_elements).all() + else: + assert not hasattr(top, "elements") + + def test_charges(self, top): + if self.expected_charges: + assert_allclose(top.charges.values[:7], self.expected_charges) + else: + assert not hasattr(top, "charges") + + def test_resids(self, top): + assert_equal(top.resids.values, self.expected_resids) + + +# Test that the parser works for simple dump files with no mass information +class TestDumpParserBasic(LammpsDumpBase): + expected_n_atoms = 24 + expected_n_residues = 1 + expected_resids = [1] + expected_masses = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] + expected_types = ["2", "1", "1", "2", "1", "1", "2"] + # 0 denotes that charges are not to be expected + expected_charges = 0 + expected_elements = 0 + ref_filename = LAMMPSDUMP + + def test_masses_warning(self): + # masses are mandatory, but badly guessed + # check that user is alerted + with self.parser(self.ref_filename) as p: + with pytest.warns(UserWarning, match="Guessed all Masses to 1.0"): + p.parse() # this tests that topology can still be constructed if non-standard or uneven # column present. -class TestDumpParserLong(TestDumpParser): - +class TestDumpParserLong(LammpsDumpBase): + expected_n_atoms = 24 + expected_n_residues = 1 + expected_resids = [1] + expected_masses = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] + expected_types = ["2", "1", "1", "2", "1", "1", "2"] + expected_charges = 0 + expected_elements = 0 ref_filename = LAMMPSDUMP_long + + def test_masses_warning(self): + # masses are mandatory, but badly guessed + # check that user is alerted + with self.parser(self.ref_filename) as p: + with pytest.warns(UserWarning, match="Guessed all Masses to 1.0"): + p.parse() + + +class TestDumpParserFull(LammpsDumpBase): + ref_filename = LAMMPSDUMP_allinfo + expected_attrs = ["types", "masses", "charges", "elements"] + expected_n_atoms = 30 + expected_n_residues = 3 + expected_resids = [1, 2, 3] + expected_masses = [12.011, 1.008, 1.008, 1.008, 12.011, 15.9994, 14.007] + expected_types = ["9", "6", "6", "6", "7", "15", "14"] + expected_charges = [-0.27, 0.09, 0.09, 0.09, 0.51, -0.51, -0.47] + expected_elements = ["C", "H", "H", "H", "C", "O", "N"] + + +# Test whether mass information is guessed correctly +# when element information is available +# and whether the user is warned about unknown elements +class TestDumpParserElementX(LammpsDumpBase): + ref_filename = LAMMPSDUMP_nomass_elemx + expected_attrs = ["types", "masses", "charges", "elements"] + expected_n_atoms = 5 + expected_n_residues = 1 + expected_resids = [1] + expected_masses = [12.011, 1, 1.008, 1.008, 12.011] + expected_types = ["9", "6", "6", "6", "7"] + expected_charges = [-0.27, 0.09, 0.09, 0.09, 0.51] + expected_elements = ["C", "", "H", "H", "C"] + + def test_elementwarn(self): + with self.parser(self.ref_filename) as p: + with pytest.warns( + UserWarning, + match="Unknown element X found for some atoms. " + "These have been given an empty element record.", + ): + p.parse() + + def test_masses_warning(self): + # masses are mandatory, but badly guessed + # check that user is alerted + with self.parser(self.ref_filename) as p: + with pytest.warns( + UserWarning, + match="No mass column found in dump file. " + "Using guessed masses from element info.", + ): + p.parse() + + +LAMMPS_DUMP_NOATOMIDS = """\ +ITEM: TIMESTEP +0 +ITEM: NUMBER OF ATOMS +2 +ITEM: BOX BOUNDS pp pp pp +-1.6350000000000001e-01 3.8836500000000001e+01 +-1.7050000000000001e-01 3.8829500000000003e+01 +-1.1550000000000001e-01 9.4884500000000003e+01 +ITEM: ATOMS mol type x y z q mass +1 9 6.10782 11.4384 0.798271 -0.27 12.011 +1 6 5.73938 10.7721 1.60738 0.09 1.008 +""" + + +def test_noatomid_failure(): + with pytest.raises(ValueError, match="No id column found in dump file"): + mda.Universe(StringIO(LAMMPS_DUMP_NOATOMIDS), format="LAMMPSDUMP") + + +LAMMPS_DUMP_NOTYPES = """\ +ITEM: TIMESTEP +0 +ITEM: NUMBER OF ATOMS +2 +ITEM: BOX BOUNDS pp pp pp +-1.6350000000000001e-01 3.8836500000000001e+01 +-1.7050000000000001e-01 3.8829500000000003e+01 +-1.1550000000000001e-01 9.4884500000000003e+01 +ITEM: ATOMS id mol x y z q mass +1 1 6.10782 11.4384 0.798271 -0.27 12.011 +2 1 5.73938 10.7721 1.60738 0.09 1.008 +""" + + +def test_notypes(): + u = mda.Universe(StringIO(LAMMPS_DUMP_NOTYPES), format="LAMMPSDUMP") + assert len(u.atoms) == 2 + assert_equal(u.atoms.types, np.ones(2, dtype=object)) + + +def test_notypes_warn(): + with pytest.warns(UserWarning, match="Set all atom types to 1"): + mda.Universe(StringIO(LAMMPS_DUMP_NOTYPES), format="LAMMPSDUMP")