Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion package/CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@ Fixes
was thrown when finding D-H pairs via the topology if `hydrogens` was an
empty AtomGroup (Issue #2848)
* Fixed the DMSParser, allowing the creation of multiple segids sharing
residues with identical resids (Issue #1387, PR #2872)
residues with identical resids (Issue #1387, PR #2872)
* H5MD files are now pickleable with H5PYPicklable (Issue #2890, PR #2894)

Enhancements
* Refactored analysis.helanal into analysis.helix_analysis
Expand Down
139 changes: 119 additions & 20 deletions package/MDAnalysis/coordinates/H5MD.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
interface of the HDF5 library to improve parallel reads and writes.

The HDF5 library and `h5py`_ must be installed; otherwise, H5MD files
cannot be read by MDAnalysis. If `h5py`_ is not installed, a
cannot be read by MDAnalysis. If `h5py`_ is not installed, a
:exc:`RuntimeError` is raised.

Units
Expand Down Expand Up @@ -94,10 +94,10 @@

Building parallel h5py and HDF5 on Linux
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Building a working parallel HDF5/h5py/mpi4py environment can be
Building a working parallel HDF5/h5py/mpi4py environment can be
challenging and is often specific to your local computing resources,
e.g., the supercomputer that you're running on typically already has
its preferred MPI installation. As a starting point we provide
its preferred MPI installation. As a starting point we provide
instructions that worked in a specific, fairly generic environment.

These instructions successfully built parallel HDF5/h5py
Expand Down Expand Up @@ -133,8 +133,8 @@
CC=mpicc HDF5_DIR=$HDF5_PATH python setup.py build
python setup.py install

If you have questions or want to share how you managed to build
parallel hdf5/h5py/mpi4py please let everyone know on the
If you have questions or want to share how you managed to build
parallel hdf5/h5py/mpi4py please let everyone know on the
`MDAnalysis forums`_.

.. _`H5MD`: https://nongnu.org/h5md/index.html
Expand Down Expand Up @@ -180,6 +180,9 @@

.. automethod:: H5MDReader._reopen

.. autoclass:: H5PYPicklable
:members:

"""

import numpy as np
Expand All @@ -190,6 +193,15 @@
import h5py
except ImportError:
HAS_H5PY = False

# Allow building documentation even if h5py is not installed
import types

class MockH5pyFile:
pass
h5py = types.ModuleType("h5py")
h5py.File = MockH5pyFile

else:
HAS_H5PY = True

Expand Down Expand Up @@ -228,23 +240,23 @@ class H5MDReader(base.ReaderBase):

See `h5md documentation <https://nongnu.org/h5md/h5md.html>`_
for a detailed overview of the H5MD file format.
The reader attempts to convert units in the trajectory file to
the standard MDAnalysis units (:mod:`MDAnalysis.units`) if

The reader attempts to convert units in the trajectory file to
the standard MDAnalysis units (:mod:`MDAnalysis.units`) if
`convert_units` is set to ``True``.

Additional data in the *observables* group of the H5MD file are
loaded into the :attr:`Timestep.data` dictionary.

Only 3D-periodic boxes or no periodicity are supported; for no
periodicity, :attr:`Timestep.dimensions` will return ``None``.

Although H5MD can store varying numbers of particles per time step
as produced by, e.g., GCMC simulations, MDAnalysis can currently
only process a fixed number of particles per step. If the number
of particles changes a :exc:`ValueError` is raised.

The :class:`H5MDReader` reads .h5md files with the following
The :class:`H5MDReader` reads .h5md files with the following
HDF5 hierarchy:

.. code-block:: text
Expand Down Expand Up @@ -277,21 +289,21 @@ class H5MDReader(base.ReaderBase):
\-- (position)
\-- [step] <int>, gives frame
\-- [time] <float>, gives time
+-- units <str>
+-- unit <str>
\-- [value] <float>, gives numpy arrary of positions
with shape (n_atoms, 3)
+-- unit <str>
\-- (velocity)
\-- [step] <int>, gives frame
\-- [time] <float>, gives time
+-- units <str>
+-- unit <str>
\-- [value] <float>, gives numpy arrary of velocities
with shape (n_atoms, 3)
+-- unit <str>
\-- (force)
\-- [step] <int>, gives frame
\-- [time] <float>, gives time
+-- units <str>
+-- unit <str>
\-- [value] <float>, gives numpy arrary of forces
with shape (n_atoms, 3)
+-- unit <str>
Expand Down Expand Up @@ -557,13 +569,15 @@ def open_trajectory(self):
if self._comm is not None:
# can only pass comm argument to h5py.File if driver='mpio'
assert self._driver == 'mpio'
self._file = h5py.File(self.filename, 'r', # pragma: no cover
driver=self._driver,
comm=self._comm)
self._file = H5PYPicklable(name=self.filename, # pragma: no cover
mode='r',
driver=self._driver,
comm=self._comm)
elif self._driver is not None:
self._file = h5py.File(self.filename, 'r', driver=self._driver)
self._file = H5PYPicklable(name=self.filename, mode='r',
driver=self._driver)
else:
self._file = h5py.File(self.filename, 'r')
self._file = H5PYPicklable(name=self.filename, mode='r')
# pulls first key out of 'particles'
# allows for arbitrary name of group1 in 'particles'
self._particle_group = self._file['particles'][
Expand Down Expand Up @@ -727,3 +741,88 @@ def has_forces(self):
@has_forces.setter
def has_forces(self, value: bool):
self._has['force'] = value

def __getstate__(self):
state = self.__dict__.copy()
del state['_particle_group']
return state

def __setstate__(self, state):
self.__dict__ = state
self._particle_group = self._file['particles'][
list(self._file['particles'])[0]]
self[self.ts.frame]


class H5PYPicklable(h5py.File):
"""H5PY file object (read-only) that can be pickled.

This class provides a file-like object (as returned by
:class:`h5py.File`) that,
unlike standard Python file objects,
can be pickled. Only read mode is supported.

When the file is pickled, filename, mode, driver, and comm of
:class:`h5py.File` in the file are saved. On unpickling, the file
is opened by filename, mode, driver. This means that for a successful
unpickle, the original file still has to be accessible with its filename.

Parameters
----------
filename : str or file-like
a filename given a text or byte string.
driver : str (optional)
H5PY file driver used to open H5MD file

Example
-------
::

f = H5PYPicklable('filename', 'r')
print(f['particles/trajectory/position/value'][0])
f.close()

can also be used as context manager::

with H5PYPicklable('filename', 'r'):
print(f['particles/trajectory/position/value'][0])

Note
----
Pickling of an `h5py.File` opened with `driver="mpio"` and an MPI
communicator is currently not supported

See Also
---------
:class:`MDAnalysis.lib.picklable_file_io.FileIOPicklable`
:class:`MDAnalysis.lib.picklable_file_io.BufferIOPicklable`
:class:`MDAnalysis.lib.picklable_file_io.TextIOPicklable`
:class:`MDAnalysis.lib.picklable_file_io.GzipPicklable`
:class:`MDAnalysis.lib.picklable_file_io.BZ2Picklable`


.. versionadded:: 2.0.0
"""

def __getstate__(self):
driver = self.driver
# Current issues: Need a way to retrieve MPI communicator object
# from self and pickle MPI.Comm object. Parallel driver is excluded
# from test because h5py calls for an MPI configuration when driver is
# 'mpio', so this will need to be patched in the test function.
if driver == 'mpio': # pragma: no cover
raise TypeError("Parallel pickling of `h5py.File` with" # pragma: no cover
" 'mpio' driver is currently not supported.")

return {'name': self.filename,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am confused: If the comm is not included in __getstate__ then how can it be used in __setstate__?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see that you had it included in b29e675 but then removed again.

Did you test that a MPI.COMM_WORLD could be pickled on its own, i.e.,

from mpi4py import MPI
import pickle

comm = MPI.COMM_WORLD
comm_p = pickle.loads(pickle.dumps(comm))

assert comm == comm_p

or something similar?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That does not seem to work
image

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don’t have a working mpi4py in my desktop. But it seems that it cannot be pickled natively.
This might help meanwhile https://bitbucket.org/mpi4py/mpi4py/issues/104/pickling-of-mpi-comm

'mode': self.mode,
'driver': driver}

def __setstate__(self, state):
self.__init__(name=state['name'],
mode=state['mode'],
driver=state['driver'])

def __getnewargs__(self):
"""Override the h5py getnewargs to skip its error message"""
return ()
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
.. Contains the formatted docstrings for the serialization of universe located
.. Contains the formatted docstrings for the serialization of universe located
.. mainly in 'MDAnalysis/libs/pickle_file_io.py'
.. _serialization:

Expand All @@ -12,7 +12,7 @@ and what developers should do to serialize a new reader.

To make sure every Trajectory reader can be successfully
serialized, we implement picklable I/O classes (see :ref:`implemented-fileio`).
When the file is pickled, filename and other necessary attributes of the open
When the file is pickled, filename and other necessary attributes of the open
file handle are saved. On unpickling, the file is opened by filename.
This means that for a successful unpickle, the original file still has to
be accessible with its filename. To retain the current frame of the trajectory,
Expand All @@ -25,12 +25,12 @@ How to serialize a new reader

File Access
^^^^^^^^^^^
If the new reader uses :func:`util.anyopen()`
If the new reader uses :func:`util.anyopen()`
(e.g. :class:`MDAnalysis.coordinates.PDB.PDBReader`),
the reading handler can be pickled without modification.
If the new reader uses I/O classes from other package
(e.g. :class:`MDAnalysis.coordinates.GSD.GSDReader`),
and cannot be pickled natively, create a new picklable class inherited from
and cannot be pickled natively, create a new picklable class inherited from
the file class in that package
(e.g. :class:`MDAnalysis.coordinates.GSD.GSDPicklable`),
adding :func:`__getstate__`,
Expand All @@ -40,10 +40,10 @@ to allow file handler serialization.

To seek or not to seek
^^^^^^^^^^^^^^^^^^^^^^
Some I/O classes support :func:`seek` and :func:`tell` functions to allow the file
Some I/O classes support :func:`seek` and :func:`tell` functions to allow the file
to be pickled with an offset. It is normally not needed for MDAnalysis with
random access. But if error occurs during testing, find a way to make the offset work.
Maybe this I/O class supports frame indexing? Maybe the file handler inside this I/O
Maybe this I/O class supports frame indexing? Maybe the file handler inside this I/O
class supports offset?

For example, in :class:`MDAnalysis.coordinates.TRZ.TRZReader`,
Expand Down Expand Up @@ -99,3 +99,4 @@ Currently implemented picklable IO Formats
* :class:`MDAnalysis.coordinates.GSD.GSDPicklable`
* :class:`MDAnalysis.coordinates.TRJ.NCDFPicklable`
* :class:`MDAnalysis.coordinates.chemfiles.ChemfilesPicklable`
* :class:`MDAnalysis.coordinates.H5MD.H5PYPicklable`
24 changes: 23 additions & 1 deletion testsuite/MDAnalysisTests/utils/test_pickleio.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,10 @@
from MDAnalysis.coordinates.chemfiles import (
ChemfilesPicklable
)
from MDAnalysis.coordinates.H5MD import HAS_H5PY
from MDAnalysis.coordinates.H5MD import (
H5PYPicklable
)

from MDAnalysis.tests.datafiles import (
PDB,
Expand All @@ -58,7 +62,9 @@
MMTF_gz,
GMS_ASYMOPT,
GSD,
NCDF
NCDF,
TPR_xvf,
H5MD_xvf
)


Expand Down Expand Up @@ -200,3 +206,19 @@ def test_Chemfiles_with_write_mode(tmpdir):
with pytest.raises(ValueError, match=r"Only read mode"):
chemfiles_io = ChemfilesPicklable(tmpdir.mkdir("xyz").join('t.xyz'),
mode='w')


@pytest.mark.skipif(not HAS_H5PY, reason="h5py not installed")
def test_H5MD_pickle():
h5md_io = H5PYPicklable(H5MD_xvf, 'r')
h5md_io_pickled = pickle.loads(pickle.dumps(h5md_io))
assert_equal(h5md_io['particles/trajectory/position/value'][0],
h5md_io_pickled['particles/trajectory/position/value'][0])


@pytest.mark.skipif(not HAS_H5PY, reason="h5py not installed")
def test_H5MD_pickle_with_driver():
h5md_io = H5PYPicklable(H5MD_xvf, 'r', driver='core')
h5md_io_pickled = pickle.loads(pickle.dumps(h5md_io))
assert_equal(h5md_io['particles/trajectory/position/value'][0],
h5md_io_pickled['particles/trajectory/position/value'][0])