MDAnalysis · orbeckst · Aug 19, 2019 · Aug 9, 2019 · Aug 10, 2019 · Aug 11, 2019
diff --git a/package/CHANGELOG b/package/CHANGELOG
@@ -65,6 +65,7 @@ Enhancements
   * add high order water bridge support to water bridge analysis. (PR #2087)
 
 Changes
+  * added support for scale_factor in NCDFReader (Issue #2323)
   * added official support for Python 3.7 (PR #1963)
   * stopped official support of Python 3.4 (#2066, PR #2174)
   * In lib.mdamath, calculations within triclinic_box(), triclinic_vectors(),
@@ -76,6 +77,7 @@ Changes
   * changed the water bridge analysis output format (PR #2087)
 
 Fixes
+  * fixed lack of check for scaling of NCDFReader velocities (Issue #2323)
   * fixed PDBReader and PDBWriter newlines for PDB header (Issue #2324)
   * fixes ProgressMeter issues with older Jupyter Lab versions (Issue #2078)
   * fixes ProgressMeter behaviour for non-AnalysisBase methods (Issue #2084)

diff --git a/package/MDAnalysis/coordinates/TRJ.py b/package/MDAnalysis/coordinates/TRJ.py
@@ -154,7 +154,7 @@
 """
 from __future__ import (absolute_import, division, print_function,
                         unicode_literals)
-
+import scipy.io.netcdf
 import numpy as np
 import warnings
 import errno
@@ -167,14 +167,14 @@
 
 logger = logging.getLogger("MDAnalysis.coordinates.AMBER")
 
-import scipy.io.netcdf
 
 try:
     import netCDF4
 except ImportError:
     netCDF4 = None
     logger.warning("netCDF4 is not available. Writing AMBER ncdf files will be slow.")
 
+
 class Timestep(base.Timestep):
     """AMBER trajectory Timestep.
 
@@ -417,12 +417,13 @@ class NCDFReader(base.ReaderBase):
     Periodic unit cell information is detected and used to populate the
     :attr:`Timestep.dimensions` attribute. (If no unit cell is available in
     the trajectory, then :attr:`Timestep.dimensions` will return
-    ``[0,0,0,0,0,0]``.)
+    ``[0,0,0,0,0,0]``).
 
     Current limitations:
 
     * only trajectories with time in ps and lengths in Angstroem are processed
-    * scale_factors are not supported (and not checked)
+    * scale_factors are supported on read but are currently not kept/used when
+      writing
 
     The NCDF reader uses :mod:`scipy.io.netcdf` and therefore :mod:`scipy` must
     be installed. It supports the *mmap* keyword argument (when reading):
@@ -448,6 +449,17 @@ class NCDFReader(base.ReaderBase):
        kwarg `delta` renamed to `dt`, for uniformity with other Readers.
     .. versionchanged:: 0.17.0
        Uses :mod:`scipy.io.netcdf` and supports the *mmap* kwarg.
+    .. versionchanged:: 0.20.0
+       Now reads scale_factors for all expected AMBER convention variables.
+       Timestep variables now adhere standard MDAnalysis units, with lengths
+       of angstrom, time of ps, velocity of angstrom/ps and force of
+       kJ/(mol*Angstrom). It is noted that with 0.19.2 and earlier versions,
+       velocities would have often been reported in values of angstrom/AKMA
+       time units instead (Issue #2323).
+
+    .. TODO:
+       * Remove support for `degrees` units in MDAnalysis version > 1.0
+         (Issue #2327, PR #2326).
 
     """
 
@@ -458,74 +470,168 @@ class NCDFReader(base.ReaderBase):
              'length': 'Angstrom',
              'velocity': 'Angstrom/ps',
              'force': 'kcal/(mol*Angstrom)'}
+
     _Timestep = Timestep
 
     def __init__(self, filename, n_atoms=None, mmap=None, **kwargs):
+
         self._mmap = mmap
 
         super(NCDFReader, self).__init__(filename, **kwargs)
 
         self.trjfile = scipy.io.netcdf.netcdf_file(self.filename,
                                                    mmap=self._mmap)
 
-        if not ('AMBER' in self.trjfile.Conventions.decode('utf-8').split(',') or
-                'AMBER' in self.trjfile.Conventions.decode('utf-8').split()):
+        # AMBER NetCDF files should always have a convention
+        try:
+            conventions = self.trjfile.Conventions
+            if not ('AMBER' in conventions.decode('utf-8').split(',') or
+                    'AMBER' in conventions.decode('utf-8').split()):
+                errmsg = ("NCDF trajectory {0} does not conform to AMBER "
+                          "specifications, "
+                          "http://ambermd.org/netcdf/nctraj.xhtml "
+                          "('AMBER' must be one of the token in attribute "
+                          "Conventions)".format(self.filename))
+                logger.fatal(errmsg)
+                raise TypeError(errmsg)
+        except AttributeError:
+            errmsg = "NCDF trajectory {0} is missing Conventions".format(
+                      self.filename)
+            logger.fatal(errmsg)
+            raise ValueError(errmsg)
+
+        # AMBER NetCDF files should also have a ConventionVersion
+        try:
+            ConventionVersion = self.trjfile.ConventionVersion.decode('utf-8')
+            if not ConventionVersion == self.version:
+                wmsg = ("NCDF trajectory format is {0!s} but the reader "
+                        "implements format {1!s}".format(
+                         ConventionVersion, self.version))
+                warnings.warn(wmsg)
+                logger.warning(wmsg)
+        except AttributeError:
+            errmsg = "NCDF trajectory {0} is missing ConventionVersion".format(
+                      self.filename)
+            raise ValueError(errmsg)
+
+        # The AMBER NetCDF standard enforces 64 bit offsets
+        if not self.trjfile.version_byte == 2:
             errmsg = ("NCDF trajectory {0} does not conform to AMBER "
-                      "specifications, http://ambermd.org/netcdf/nctraj.xhtml "
-                      "('AMBER' must be one of the tokens in attribute "
-                      "Conventions)".format(self.filename))
+                      "specifications, as detailed in "
+                      "https://ambermd.org/netcdf/nctraj.xhtml "
+                      "(NetCDF file does not use 64 bit offsets "
+                      "[version_byte = 2])".format(self.filename))
             logger.fatal(errmsg)
             raise TypeError(errmsg)
-        if not self.trjfile.ConventionVersion.decode('utf-8') == self.version:
-            wmsg = ("NCDF trajectory format is {0!s} but the reader "
-                    "implements format {1!s}".format(
-                        self.trjfile.ConventionVersion, self.version))
+
+        # The AMBER NetCDF standard enforces 3D coordinates
+        try:
+            if not self.trjfile.dimensions['spatial'] == 3:
+                errmsg = "Incorrect spatial value for NCDF trajectory file"
+                raise TypeError(errmsg)
+        except KeyError:
+            errmsg = "NCDF trajectory does not contain spatial dimension"
+            raise ValueError(errmsg)
+
+        # AMBER NetCDF specs require program and programVersion. Warn users
+        # if those attributes do not exist
+        if not (hasattr(self.trjfile, 'program') and
+                hasattr(self.trjfile, 'programVersion')):
+            wmsg = ("NCDF trajectory {0} may not fully adhere to AMBER "
+                    "standards as either the `program` or `programVersion` "
+                    "attributes are missing".format(self.filename))
             warnings.warn(wmsg)
             logger.warning(wmsg)
 
-        self.n_atoms = self.trjfile.dimensions['atom']
-        self.n_frames = self.trjfile.dimensions['frame']
-        # example trajectory when read with scipy.io.netcdf has
-        # dimensions['frame'] == None (indicating a record dimension that can
-        # grow) whereas if read with netCDF4 I get len(dimensions['frame']) ==
-        # 10: in any case, we need to get the number of frames from somewhere
-        # such as the time variable:
-        if self.n_frames is None:
-            self.n_frames = self.trjfile.variables['time'].shape[0]
+        try:
+            self.n_atoms = self.trjfile.dimensions['atom']
+            if n_atoms is not None and n_atoms != self.n_atoms:
+                errmsg = ("Supplied n_atoms ({0}) != natom from ncdf ({1}). "
+                          "Note: n_atoms can be None and then the ncdf value "
+                          "is used!".format(n_atoms, self.n_atoms))
+                raise ValueError(errmsg)
+        except KeyError:
+            errmsg = ("NCDF trajectory {0} does not contain atom "
+                      "information".format(self.filename))
+            raise ValueError(errmsg)
+
+        try:
+            self.n_frames = self.trjfile.dimensions['frame']
+            # example trajectory when read with scipy.io.netcdf has
+            # dimensions['frame'] == None (indicating a record dimension that
+            # can grow) whereas if read with netCDF4 I get
+            # len(dimensions['frame']) ==  10: in any case, we need to get
+            # the number of frames from somewhere such as the time variable:
+            if self.n_frames is None:
+                self.n_frames = self.trjfile.variables['time'].shape[0]
+        except KeyError:
+            errmsg = ("NCDF trajectory {0} does not contain frame "
+                      "information".format(self.filename))
+            raise ValueError(errmsg)
 
         try:
             self.remarks = self.trjfile.title
         except AttributeError:
             self.remarks = ""
         # other metadata (*= requd):
-        # - program*              sander
-        # - programVersion*       9.0
         # - application           AMBER
         #
 
         # checks for not-implemented features (other units would need to be
         # hacked into MDAnalysis.units)
-        if self.trjfile.variables['time'].units.decode('utf-8') != "picosecond":
-            raise NotImplementedError(
-                "NETCDFReader currently assumes that the trajectory was written "
-                "with a time unit of picoseconds and not {0}.".format(
-                    self.trjfile.variables['time'].units))
-        if self.trjfile.variables['coordinates'].units.decode('utf-8') != "angstrom":
-            raise NotImplementedError(
-                "NETCDFReader currently assumes that the trajectory was written "
-                "with a length unit of Angstroem and not {0}.".format(
-                    self.trjfile.variables['coordinates'].units))
-        if hasattr(self.trjfile.variables['coordinates'], 'scale_factor'):
-            raise NotImplementedError("scale_factors are not implemented")
-        if n_atoms is not None and n_atoms != self.n_atoms:
-            raise ValueError("Supplied n_atoms ({0}) != natom from ncdf ({1}). "
-                             "Note: n_atoms can be None and then the ncdf value "
-                             "is used!".format(n_atoms, self.n_atoms))
+        self._verify_units(self.trjfile.variables['time'].units, 'picosecond')
+        self._verify_units(self.trjfile.variables['coordinates'].units,
+                           'angstrom')
+
+        # Check for scale_factor attributes for all data variables and
+        # store this to multiply through later (Issue #2323)
+        self.scale_factors = {'time': 1.0,
+                              'cell_lengths': 1.0,
+                              'cell_angles': 1.0,
+                              'coordinates': 1.0,
+                              'velocities': 1.0,
+                              'forces': 1.0}
+
+        for variable in self.trjfile.variables:
+            if hasattr(self.trjfile.variables[variable], 'scale_factor'):
+                if variable in self.scale_factors:
+                    scale_factor = self.trjfile.variables[variable].scale_factor
+                    self.scale_factors[variable] = scale_factor
+                else:
+                    errmsg = ("scale_factors for variable {0} are "
+                              "not implemented".format(variable))
+                    raise NotImplementedError(errmsg)
 
         self.has_velocities = 'velocities' in self.trjfile.variables
+        if self.has_velocities:
+            self._verify_units(self.trjfile.variables['velocities'].units,
+                               'angstrom/picosecond')
+
         self.has_forces = 'forces' in self.trjfile.variables
+        if self.has_forces:
+            self._verify_units(self.trjfile.variables['forces'].units,
+                               'kilocalorie/mole/angstrom')
 
         self.periodic = 'cell_lengths' in self.trjfile.variables
+        if self.periodic:
+            self._verify_units(self.trjfile.variables['cell_lengths'].units,
+                               'angstrom')
+            # Currently the MDA writer outputs 'degrees' rather than the
+            # singular form 'degree' agreed in the conventions. As discussed
+            # in PR #2326 from MDA 1.0 only and 'degree' will be accepted.
+            cell_angle_units = self.trjfile.variables['cell_angles'].units
+            if (cell_angle_units.decode('utf-8') == 'degrees'):
+                wmsg = ("DEPRECATED (1.0): NCDF trajectory {0} uses units of "
+                        "`degrees` for the `cell_angles` variable instead of "
+                        "`degree`. Support for non-AMBER convention units is "
+                        "now deprecated and will end in MDAnalysis version "
+                        "1.0. Afterwards, reading this file will raise an "
+                        "error.")
+                warnings.warn(wmsg, category=DeprecationWarning)
+                logger.warning(wmsg)
+            else:
+                self._verify_units(cell_angle_units, 'degree')
+
         self._current_frame = 0
 
         self.ts = self._Timestep(self.n_atoms,
@@ -537,6 +643,14 @@ def __init__(self, filename, n_atoms=None, mmap=None, **kwargs):
         # load first data frame
         self._read_frame(0)
 
+    @staticmethod
+    def _verify_units(eval_unit, expected_units):
+        if eval_unit.decode('utf-8') != expected_units:
+            errmsg = ("NETCDFReader currently assumes that the trajectory "
+                      "was written in units of {0} instead of {1}".format(
+                       eval_unit.decode('utf-8'), expected_units))
+            raise NotImplementedError(errmsg)
+
     @staticmethod
     def parse_n_atoms(filename, **kwargs):
         with scipy.io.netcdf.netcdf_file(filename, mmap=None) as f:
@@ -555,15 +669,21 @@ def _read_frame(self, frame):
             raise IndexError("frame index must be 0 <= frame < {0}".format(
                 self.n_frames))
         # note: self.trjfile.variables['coordinates'].shape == (frames, n_atoms, 3)
-        ts._pos[:] = self.trjfile.variables['coordinates'][frame]
-        ts.time = self.trjfile.variables['time'][frame]
+        ts._pos[:] = (self.trjfile.variables['coordinates'][frame] *
+                      self.scale_factors['coordinates'])
+        ts.time = (self.trjfile.variables['time'][frame] *
+                   self.scale_factors['time'])
         if self.has_velocities:
-            ts._velocities[:] = self.trjfile.variables['velocities'][frame]
+            ts._velocities[:] = (self.trjfile.variables['velocities'][frame] *
+                                 self.scale_factors['velocities'])
         if self.has_forces:
-            ts._forces[:] = self.trjfile.variables['forces'][frame]
+            ts._forces[:] = (self.trjfile.variables['forces'][frame] *
+                             self.scale_factors['forces'])
         if self.periodic:
-            ts._unitcell[:3] = self.trjfile.variables['cell_lengths'][frame]
-            ts._unitcell[3:] = self.trjfile.variables['cell_angles'][frame]
+            ts._unitcell[:3] = (self.trjfile.variables['cell_lengths'][frame] *
+                                self.scale_factors['cell_lengths'])
+            ts._unitcell[3:] = (self.trjfile.variables['cell_angles'][frame] *
+                                self.scale_factors['cell_angles'])
         if self.convert_units:
             self.convert_pos_from_native(ts._pos)  # in-place !
             self.convert_time_from_native(
@@ -728,6 +848,10 @@ class NCDFWriter(base.WriterBase):
        Use fast :mod:`netCDF4` for writing but fall back to slow
        :mod:`scipy.io.netcdf` if :mod:`netCDF4` is not available.
 
+    .. TODO:
+       * Change the `cell_angles` units to `degree`, implement `scale_factor`
+         handling (Issue #2327).
+
     """
 
     format = 'NCDF'