MDAnalysis · orbeckst · Aug 12, 2020 · Aug 10, 2020 · Aug 10, 2020 · Aug 10, 2020
diff --git a/package/CHANGELOG b/package/CHANGELOG
@@ -40,7 +40,8 @@ Fixes
     was thrown when finding D-H pairs via the topology if `hydrogens` was an
     empty AtomGroup (Issue #2848)
   * Fixed the DMSParser, allowing the creation of multiple segids sharing
-    residues with identical resids (Issue #1387, PR #2872)    
+    residues with identical resids (Issue #1387, PR #2872)
+  * H5MD files are now pickleable with H5PYPicklable (Issue #2890, PR #2894)
 
 Enhancements
   * Refactored analysis.helanal into analysis.helix_analysis

diff --git a/package/MDAnalysis/coordinates/H5MD.py b/package/MDAnalysis/coordinates/H5MD.py
@@ -31,7 +31,7 @@
 interface of the HDF5 library to improve parallel reads and writes.
 
 The HDF5 library and `h5py`_ must be installed; otherwise, H5MD files
-cannot be read by MDAnalysis. If `h5py`_ is not installed, a 
+cannot be read by MDAnalysis. If `h5py`_ is not installed, a
 :exc:`RuntimeError` is raised.
 
 Units
@@ -94,10 +94,10 @@
 
 Building parallel h5py and HDF5 on Linux
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-Building a working parallel HDF5/h5py/mpi4py environment can be 
+Building a working parallel HDF5/h5py/mpi4py environment can be
 challenging and is often specific to your local computing resources,
 e.g., the supercomputer that you're running on typically already has
-its preferred MPI installation. As a starting point we provide 
+its preferred MPI installation. As a starting point we provide
 instructions that worked in a specific, fairly generic environment.
 
 These instructions successfully built parallel HDF5/h5py
@@ -133,8 +133,8 @@
           CC=mpicc HDF5_DIR=$HDF5_PATH python setup.py build
           python setup.py install
 
-If you have questions or want to share how you managed to build 
-parallel hdf5/h5py/mpi4py please let everyone know on the 
+If you have questions or want to share how you managed to build
+parallel hdf5/h5py/mpi4py please let everyone know on the
 `MDAnalysis forums`_.
 
 .. _`H5MD`: https://nongnu.org/h5md/index.html
@@ -180,6 +180,9 @@
 
    .. automethod:: H5MDReader._reopen
 
+.. autoclass:: H5PYPicklable
+   :members:
+
 """
 
 import numpy as np
@@ -190,6 +193,15 @@
     import h5py
 except ImportError:
     HAS_H5PY = False
+
+    # Allow building documentation even if h5py is not installed
+    import types
+
+    class MockH5pyFile:
+        pass
+    h5py = types.ModuleType("h5py")
+    h5py.File = MockH5pyFile
+
 else:
     HAS_H5PY = True
 
@@ -228,23 +240,23 @@ class H5MDReader(base.ReaderBase):
 
     See `h5md documentation <https://nongnu.org/h5md/h5md.html>`_
     for a detailed overview of the H5MD file format.
-    
-    The reader attempts to convert units in the trajectory file to 
-    the standard MDAnalysis units (:mod:`MDAnalysis.units`) if 
+
+    The reader attempts to convert units in the trajectory file to
+    the standard MDAnalysis units (:mod:`MDAnalysis.units`) if
     `convert_units` is set to ``True``.
-    
+
     Additional data in the *observables* group of the H5MD file are
     loaded into the :attr:`Timestep.data` dictionary.
-    
+
     Only 3D-periodic boxes or no periodicity are supported; for no
     periodicity, :attr:`Timestep.dimensions` will return ``None``.
-    
+
     Although H5MD can store varying numbers of particles per time step
     as produced by, e.g., GCMC simulations, MDAnalysis can currently
     only process a fixed number of particles per step. If the number
     of particles changes a :exc:`ValueError` is raised.
 
-    The :class:`H5MDReader` reads .h5md files with the following 
+    The :class:`H5MDReader` reads .h5md files with the following
     HDF5 hierarchy:
 
     .. code-block:: text
@@ -277,21 +289,21 @@ class H5MDReader(base.ReaderBase):
                 \-- (position)
                     \-- [step] <int>, gives frame
                     \-- [time] <float>, gives time
-                        +-- units <str>
+                        +-- unit <str>
                     \-- [value] <float>, gives numpy arrary of positions
                                          with shape (n_atoms, 3)
                         +-- unit <str>
                 \-- (velocity)
                     \-- [step] <int>, gives frame
                     \-- [time] <float>, gives time
-                        +-- units <str>
+                        +-- unit <str>
                     \-- [value] <float>, gives numpy arrary of velocities
                                          with shape (n_atoms, 3)
                         +-- unit <str>
                 \-- (force)
                     \-- [step] <int>, gives frame
                     \-- [time] <float>, gives time
-                        +-- units <str>
+                        +-- unit <str>
                     \-- [value] <float>, gives numpy arrary of forces
                                          with shape (n_atoms, 3)
                         +-- unit <str>
@@ -557,13 +569,15 @@ def open_trajectory(self):
             if self._comm is not None:
                 # can only pass comm argument to h5py.File if driver='mpio'
                 assert self._driver == 'mpio'
-                self._file = h5py.File(self.filename, 'r',  # pragma: no cover
-                                       driver=self._driver,
-                                       comm=self._comm)
+                self._file = H5PYPicklable(name=self.filename,  # pragma: no cover
+                                           mode='r',
+                                           driver=self._driver,
+                                           comm=self._comm)
             elif self._driver is not None:
-                self._file = h5py.File(self.filename, 'r', driver=self._driver)
+                self._file = H5PYPicklable(name=self.filename, mode='r',
+                                           driver=self._driver)
             else:
-                self._file = h5py.File(self.filename, 'r')
+                self._file = H5PYPicklable(name=self.filename, mode='r')
         # pulls first key out of 'particles'
         # allows for arbitrary name of group1 in 'particles'
         self._particle_group = self._file['particles'][
@@ -727,3 +741,88 @@ def has_forces(self):
     @has_forces.setter
     def has_forces(self, value: bool):
         self._has['force'] = value
+
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        del state['_particle_group']
+        return state
+
+    def __setstate__(self, state):
+        self.__dict__ = state
+        self._particle_group = self._file['particles'][
+                               list(self._file['particles'])[0]]
+        self[self.ts.frame]
+
+
+class H5PYPicklable(h5py.File):
+    """H5PY file object (read-only) that can be pickled.
+
+    This class provides a file-like object (as returned by
+    :class:`h5py.File`) that,
+    unlike standard Python file objects,
+    can be pickled. Only read mode is supported.
+
+    When the file is pickled, filename, mode, driver, and comm of
+    :class:`h5py.File` in the file are saved. On unpickling, the file
+    is opened by filename, mode, driver. This means that for a successful
+    unpickle, the original file still has to be accessible with its filename.
+
+    Parameters
+    ----------
+    filename : str or file-like
+        a filename given a text or byte string.
+    driver : str (optional)
+        H5PY file driver used to open H5MD file
+
+    Example
+    -------
+    ::
+
+        f = H5PYPicklable('filename', 'r')
+        print(f['particles/trajectory/position/value'][0])
+        f.close()
+
+    can also be used as context manager::
+
+        with H5PYPicklable('filename', 'r'):
+            print(f['particles/trajectory/position/value'][0])
+
+    Note
+    ----
+    Pickling of an `h5py.File` opened with `driver="mpio"` and an MPI
+    communicator is currently not supported
+
+    See Also
+    ---------
+    :class:`MDAnalysis.lib.picklable_file_io.FileIOPicklable`
+    :class:`MDAnalysis.lib.picklable_file_io.BufferIOPicklable`
+    :class:`MDAnalysis.lib.picklable_file_io.TextIOPicklable`
+    :class:`MDAnalysis.lib.picklable_file_io.GzipPicklable`
+    :class:`MDAnalysis.lib.picklable_file_io.BZ2Picklable`
+
+
+    .. versionadded:: 2.0.0
+    """
+
+    def __getstate__(self):
+        driver = self.driver
+        # Current issues: Need a way to retrieve MPI communicator object
+        # from self and pickle MPI.Comm object. Parallel driver is excluded
+        # from test because h5py calls for an MPI configuration when driver is
+        # 'mpio', so this will need to be patched in the test function.
+        if driver == 'mpio':  # pragma: no cover
+            raise TypeError("Parallel pickling of `h5py.File` with"  # pragma: no cover
+                            " 'mpio' driver is currently not supported.")
+
+        return {'name': self.filename,
+                'mode': self.mode,
+                'driver': driver}
+
+    def __setstate__(self, state):
+        self.__init__(name=state['name'],
+                      mode=state['mode'],
+                      driver=state['driver'])
+
+    def __getnewargs__(self):
+        """Override the h5py getnewargs to skip its error message"""
+        return ()
diff --git a/package/doc/sphinx/source/documentation_pages/coordinates/pickle_readers.rst b/package/doc/sphinx/source/documentation_pages/coordinates/pickle_readers.rst
@@ -1,4 +1,4 @@
-.. Contains the formatted docstrings for the serialization of universe located 
+.. Contains the formatted docstrings for the serialization of universe located
 .. mainly in 'MDAnalysis/libs/pickle_file_io.py'
 .. _serialization:
 
@@ -12,7 +12,7 @@ and what developers should do to serialize a new reader.
 
 To make sure every Trajectory reader can be successfully
 serialized, we implement picklable I/O classes (see :ref:`implemented-fileio`).
-When the file is pickled, filename and other necessary attributes of the open 
+When the file is pickled, filename and other necessary attributes of the open
 file handle are saved. On unpickling, the file is opened by filename.
 This means that for a successful unpickle, the original file still has to
 be accessible with its filename. To retain the current frame of the trajectory,
@@ -25,12 +25,12 @@ How to serialize a new reader
 
 File Access
 ^^^^^^^^^^^
-If the new reader uses :func:`util.anyopen()` 
+If the new reader uses :func:`util.anyopen()`
 (e.g. :class:`MDAnalysis.coordinates.PDB.PDBReader`),
 the reading handler can be pickled without modification.
 If the new reader uses I/O classes from other package
 (e.g. :class:`MDAnalysis.coordinates.GSD.GSDReader`),
-and cannot be pickled natively, create a new picklable class inherited from 
+and cannot be pickled natively, create a new picklable class inherited from
 the file class in that package
 (e.g. :class:`MDAnalysis.coordinates.GSD.GSDPicklable`),
 adding :func:`__getstate__`,
@@ -40,10 +40,10 @@ to allow file handler serialization.
 
 To seek or not to seek
 ^^^^^^^^^^^^^^^^^^^^^^
-Some I/O classes support :func:`seek` and :func:`tell` functions to allow the file 
+Some I/O classes support :func:`seek` and :func:`tell` functions to allow the file
 to be pickled with an offset. It is normally not needed for MDAnalysis with
 random access. But if error occurs during testing, find a way to make the offset work.
-Maybe this I/O class supports frame indexing? Maybe the file handler inside this I/O 
+Maybe this I/O class supports frame indexing? Maybe the file handler inside this I/O
 class supports offset?
 
 For example, in :class:`MDAnalysis.coordinates.TRZ.TRZReader`,
@@ -99,3 +99,4 @@ Currently implemented picklable IO Formats
 * :class:`MDAnalysis.coordinates.GSD.GSDPicklable`
 * :class:`MDAnalysis.coordinates.TRJ.NCDFPicklable`
 * :class:`MDAnalysis.coordinates.chemfiles.ChemfilesPicklable`
+* :class:`MDAnalysis.coordinates.H5MD.H5PYPicklable`
diff --git a/testsuite/MDAnalysisTests/utils/test_pickleio.py b/testsuite/MDAnalysisTests/utils/test_pickleio.py
@@ -50,6 +50,10 @@
     from MDAnalysis.coordinates.chemfiles import (
         ChemfilesPicklable
     )
+from MDAnalysis.coordinates.H5MD import HAS_H5PY
+from MDAnalysis.coordinates.H5MD import (
+    H5PYPicklable
+)
 
 from MDAnalysis.tests.datafiles import (
     PDB,
@@ -58,7 +62,9 @@
     MMTF_gz,
     GMS_ASYMOPT,
     GSD,
-    NCDF
+    NCDF,
+    TPR_xvf,
+    H5MD_xvf
 )
 
 
@@ -200,3 +206,19 @@ def test_Chemfiles_with_write_mode(tmpdir):
     with pytest.raises(ValueError, match=r"Only read mode"):
         chemfiles_io = ChemfilesPicklable(tmpdir.mkdir("xyz").join('t.xyz'),
                                           mode='w')
+
+
+@pytest.mark.skipif(not HAS_H5PY, reason="h5py not installed")
+def test_H5MD_pickle():
+    h5md_io = H5PYPicklable(H5MD_xvf, 'r')
+    h5md_io_pickled = pickle.loads(pickle.dumps(h5md_io))
+    assert_equal(h5md_io['particles/trajectory/position/value'][0],
+                 h5md_io_pickled['particles/trajectory/position/value'][0])
+
+
+@pytest.mark.skipif(not HAS_H5PY, reason="h5py not installed")
+def test_H5MD_pickle_with_driver():
+    h5md_io = H5PYPicklable(H5MD_xvf, 'r', driver='core')
+    h5md_io_pickled = pickle.loads(pickle.dumps(h5md_io))
+    assert_equal(h5md_io['particles/trajectory/position/value'][0],
+                 h5md_io_pickled['particles/trajectory/position/value'][0])