diff --git a/package/CHANGELOG b/package/CHANGELOG index d4c412247e0..ce83b67aed8 100644 --- a/package/CHANGELOG +++ b/package/CHANGELOG @@ -40,7 +40,8 @@ Fixes was thrown when finding D-H pairs via the topology if `hydrogens` was an empty AtomGroup (Issue #2848) * Fixed the DMSParser, allowing the creation of multiple segids sharing - residues with identical resids (Issue #1387, PR #2872) + residues with identical resids (Issue #1387, PR #2872) + * H5MD files are now pickleable with H5PYPicklable (Issue #2890, PR #2894) Enhancements * Refactored analysis.helanal into analysis.helix_analysis diff --git a/package/MDAnalysis/coordinates/H5MD.py b/package/MDAnalysis/coordinates/H5MD.py index bf33eec169b..a49594282d5 100644 --- a/package/MDAnalysis/coordinates/H5MD.py +++ b/package/MDAnalysis/coordinates/H5MD.py @@ -31,7 +31,7 @@ interface of the HDF5 library to improve parallel reads and writes. The HDF5 library and `h5py`_ must be installed; otherwise, H5MD files -cannot be read by MDAnalysis. If `h5py`_ is not installed, a +cannot be read by MDAnalysis. If `h5py`_ is not installed, a :exc:`RuntimeError` is raised. Units @@ -94,10 +94,10 @@ Building parallel h5py and HDF5 on Linux ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Building a working parallel HDF5/h5py/mpi4py environment can be +Building a working parallel HDF5/h5py/mpi4py environment can be challenging and is often specific to your local computing resources, e.g., the supercomputer that you're running on typically already has -its preferred MPI installation. As a starting point we provide +its preferred MPI installation. As a starting point we provide instructions that worked in a specific, fairly generic environment. These instructions successfully built parallel HDF5/h5py @@ -133,8 +133,8 @@ CC=mpicc HDF5_DIR=$HDF5_PATH python setup.py build python setup.py install -If you have questions or want to share how you managed to build -parallel hdf5/h5py/mpi4py please let everyone know on the +If you have questions or want to share how you managed to build +parallel hdf5/h5py/mpi4py please let everyone know on the `MDAnalysis forums`_. .. _`H5MD`: https://nongnu.org/h5md/index.html @@ -180,6 +180,9 @@ .. automethod:: H5MDReader._reopen +.. autoclass:: H5PYPicklable + :members: + """ import numpy as np @@ -190,6 +193,15 @@ import h5py except ImportError: HAS_H5PY = False + + # Allow building documentation even if h5py is not installed + import types + + class MockH5pyFile: + pass + h5py = types.ModuleType("h5py") + h5py.File = MockH5pyFile + else: HAS_H5PY = True @@ -228,23 +240,23 @@ class H5MDReader(base.ReaderBase): See `h5md documentation `_ for a detailed overview of the H5MD file format. - - The reader attempts to convert units in the trajectory file to - the standard MDAnalysis units (:mod:`MDAnalysis.units`) if + + The reader attempts to convert units in the trajectory file to + the standard MDAnalysis units (:mod:`MDAnalysis.units`) if `convert_units` is set to ``True``. - + Additional data in the *observables* group of the H5MD file are loaded into the :attr:`Timestep.data` dictionary. - + Only 3D-periodic boxes or no periodicity are supported; for no periodicity, :attr:`Timestep.dimensions` will return ``None``. - + Although H5MD can store varying numbers of particles per time step as produced by, e.g., GCMC simulations, MDAnalysis can currently only process a fixed number of particles per step. If the number of particles changes a :exc:`ValueError` is raised. - The :class:`H5MDReader` reads .h5md files with the following + The :class:`H5MDReader` reads .h5md files with the following HDF5 hierarchy: .. code-block:: text @@ -277,21 +289,21 @@ class H5MDReader(base.ReaderBase): \-- (position) \-- [step] , gives frame \-- [time] , gives time - +-- units + +-- unit \-- [value] , gives numpy arrary of positions with shape (n_atoms, 3) +-- unit \-- (velocity) \-- [step] , gives frame \-- [time] , gives time - +-- units + +-- unit \-- [value] , gives numpy arrary of velocities with shape (n_atoms, 3) +-- unit \-- (force) \-- [step] , gives frame \-- [time] , gives time - +-- units + +-- unit \-- [value] , gives numpy arrary of forces with shape (n_atoms, 3) +-- unit @@ -557,13 +569,15 @@ def open_trajectory(self): if self._comm is not None: # can only pass comm argument to h5py.File if driver='mpio' assert self._driver == 'mpio' - self._file = h5py.File(self.filename, 'r', # pragma: no cover - driver=self._driver, - comm=self._comm) + self._file = H5PYPicklable(name=self.filename, # pragma: no cover + mode='r', + driver=self._driver, + comm=self._comm) elif self._driver is not None: - self._file = h5py.File(self.filename, 'r', driver=self._driver) + self._file = H5PYPicklable(name=self.filename, mode='r', + driver=self._driver) else: - self._file = h5py.File(self.filename, 'r') + self._file = H5PYPicklable(name=self.filename, mode='r') # pulls first key out of 'particles' # allows for arbitrary name of group1 in 'particles' self._particle_group = self._file['particles'][ @@ -727,3 +741,88 @@ def has_forces(self): @has_forces.setter def has_forces(self, value: bool): self._has['force'] = value + + def __getstate__(self): + state = self.__dict__.copy() + del state['_particle_group'] + return state + + def __setstate__(self, state): + self.__dict__ = state + self._particle_group = self._file['particles'][ + list(self._file['particles'])[0]] + self[self.ts.frame] + + +class H5PYPicklable(h5py.File): + """H5PY file object (read-only) that can be pickled. + + This class provides a file-like object (as returned by + :class:`h5py.File`) that, + unlike standard Python file objects, + can be pickled. Only read mode is supported. + + When the file is pickled, filename, mode, driver, and comm of + :class:`h5py.File` in the file are saved. On unpickling, the file + is opened by filename, mode, driver. This means that for a successful + unpickle, the original file still has to be accessible with its filename. + + Parameters + ---------- + filename : str or file-like + a filename given a text or byte string. + driver : str (optional) + H5PY file driver used to open H5MD file + + Example + ------- + :: + + f = H5PYPicklable('filename', 'r') + print(f['particles/trajectory/position/value'][0]) + f.close() + + can also be used as context manager:: + + with H5PYPicklable('filename', 'r'): + print(f['particles/trajectory/position/value'][0]) + + Note + ---- + Pickling of an `h5py.File` opened with `driver="mpio"` and an MPI + communicator is currently not supported + + See Also + --------- + :class:`MDAnalysis.lib.picklable_file_io.FileIOPicklable` + :class:`MDAnalysis.lib.picklable_file_io.BufferIOPicklable` + :class:`MDAnalysis.lib.picklable_file_io.TextIOPicklable` + :class:`MDAnalysis.lib.picklable_file_io.GzipPicklable` + :class:`MDAnalysis.lib.picklable_file_io.BZ2Picklable` + + + .. versionadded:: 2.0.0 + """ + + def __getstate__(self): + driver = self.driver + # Current issues: Need a way to retrieve MPI communicator object + # from self and pickle MPI.Comm object. Parallel driver is excluded + # from test because h5py calls for an MPI configuration when driver is + # 'mpio', so this will need to be patched in the test function. + if driver == 'mpio': # pragma: no cover + raise TypeError("Parallel pickling of `h5py.File` with" # pragma: no cover + " 'mpio' driver is currently not supported.") + + return {'name': self.filename, + 'mode': self.mode, + 'driver': driver} + + def __setstate__(self, state): + self.__init__(name=state['name'], + mode=state['mode'], + driver=state['driver']) + + def __getnewargs__(self): + """Override the h5py getnewargs to skip its error message""" + return () diff --git a/package/doc/sphinx/source/documentation_pages/coordinates/pickle_readers.rst b/package/doc/sphinx/source/documentation_pages/coordinates/pickle_readers.rst index 0866590297f..fbcc881ab0c 100644 --- a/package/doc/sphinx/source/documentation_pages/coordinates/pickle_readers.rst +++ b/package/doc/sphinx/source/documentation_pages/coordinates/pickle_readers.rst @@ -1,4 +1,4 @@ -.. Contains the formatted docstrings for the serialization of universe located +.. Contains the formatted docstrings for the serialization of universe located .. mainly in 'MDAnalysis/libs/pickle_file_io.py' .. _serialization: @@ -12,7 +12,7 @@ and what developers should do to serialize a new reader. To make sure every Trajectory reader can be successfully serialized, we implement picklable I/O classes (see :ref:`implemented-fileio`). -When the file is pickled, filename and other necessary attributes of the open +When the file is pickled, filename and other necessary attributes of the open file handle are saved. On unpickling, the file is opened by filename. This means that for a successful unpickle, the original file still has to be accessible with its filename. To retain the current frame of the trajectory, @@ -25,12 +25,12 @@ How to serialize a new reader File Access ^^^^^^^^^^^ -If the new reader uses :func:`util.anyopen()` +If the new reader uses :func:`util.anyopen()` (e.g. :class:`MDAnalysis.coordinates.PDB.PDBReader`), the reading handler can be pickled without modification. If the new reader uses I/O classes from other package (e.g. :class:`MDAnalysis.coordinates.GSD.GSDReader`), -and cannot be pickled natively, create a new picklable class inherited from +and cannot be pickled natively, create a new picklable class inherited from the file class in that package (e.g. :class:`MDAnalysis.coordinates.GSD.GSDPicklable`), adding :func:`__getstate__`, @@ -40,10 +40,10 @@ to allow file handler serialization. To seek or not to seek ^^^^^^^^^^^^^^^^^^^^^^ -Some I/O classes support :func:`seek` and :func:`tell` functions to allow the file +Some I/O classes support :func:`seek` and :func:`tell` functions to allow the file to be pickled with an offset. It is normally not needed for MDAnalysis with random access. But if error occurs during testing, find a way to make the offset work. -Maybe this I/O class supports frame indexing? Maybe the file handler inside this I/O +Maybe this I/O class supports frame indexing? Maybe the file handler inside this I/O class supports offset? For example, in :class:`MDAnalysis.coordinates.TRZ.TRZReader`, @@ -99,3 +99,4 @@ Currently implemented picklable IO Formats * :class:`MDAnalysis.coordinates.GSD.GSDPicklable` * :class:`MDAnalysis.coordinates.TRJ.NCDFPicklable` * :class:`MDAnalysis.coordinates.chemfiles.ChemfilesPicklable` +* :class:`MDAnalysis.coordinates.H5MD.H5PYPicklable` diff --git a/testsuite/MDAnalysisTests/utils/test_pickleio.py b/testsuite/MDAnalysisTests/utils/test_pickleio.py index b005f478c12..bbc7a5a5f23 100644 --- a/testsuite/MDAnalysisTests/utils/test_pickleio.py +++ b/testsuite/MDAnalysisTests/utils/test_pickleio.py @@ -50,6 +50,10 @@ from MDAnalysis.coordinates.chemfiles import ( ChemfilesPicklable ) +from MDAnalysis.coordinates.H5MD import HAS_H5PY +from MDAnalysis.coordinates.H5MD import ( + H5PYPicklable +) from MDAnalysis.tests.datafiles import ( PDB, @@ -58,7 +62,9 @@ MMTF_gz, GMS_ASYMOPT, GSD, - NCDF + NCDF, + TPR_xvf, + H5MD_xvf ) @@ -200,3 +206,19 @@ def test_Chemfiles_with_write_mode(tmpdir): with pytest.raises(ValueError, match=r"Only read mode"): chemfiles_io = ChemfilesPicklable(tmpdir.mkdir("xyz").join('t.xyz'), mode='w') + + +@pytest.mark.skipif(not HAS_H5PY, reason="h5py not installed") +def test_H5MD_pickle(): + h5md_io = H5PYPicklable(H5MD_xvf, 'r') + h5md_io_pickled = pickle.loads(pickle.dumps(h5md_io)) + assert_equal(h5md_io['particles/trajectory/position/value'][0], + h5md_io_pickled['particles/trajectory/position/value'][0]) + + +@pytest.mark.skipif(not HAS_H5PY, reason="h5py not installed") +def test_H5MD_pickle_with_driver(): + h5md_io = H5PYPicklable(H5MD_xvf, 'r', driver='core') + h5md_io_pickled = pickle.loads(pickle.dumps(h5md_io)) + assert_equal(h5md_io['particles/trajectory/position/value'][0], + h5md_io_pickled['particles/trajectory/position/value'][0])