From d9a4c9cf3f5b0490b95b47254fa45d70d58d765a Mon Sep 17 00:00:00 2001
From: Stephan Hoyer
Date: Wed, 26 Feb 2014 22:48:00 -0800
Subject: [PATCH 1/8] Handle mask_and_scale ourselves instead of using netCDF4

This lets us use NaNs instead of masked arrays to indicate missing values.
---
 src/xray/backends.py    |  91 +++------------
 src/xray/common.py      |  10 +-
 src/xray/conventions.py | 253 +++++++++++++++++++++++++++-------------
 src/xray/utils.py       |   2 +-
 src/xray/xarray.py      |  26 +++--
 test/__init__.py        |  11 ++
 test/test_dataset.py    | 105 ++++++++++++-----
 test/test_xarray.py     |   7 +-
 8 files changed, 298 insertions(+), 207 deletions(-)

diff --git a/src/xray/backends.py b/src/xray/backends.py
index 3a67f1affea..2b75db133a0 100644
--- a/src/xray/backends.py
+++ b/src/xray/backends.py
@@ -13,7 +13,8 @@
 from collections import OrderedDict
 
 import xarray
-import conventions
+from conventions import (decode_cf_variable, encode_cf_variable,
+                         is_valid_nc3_name, coerce_nc3_dtype)
 from utils import FrozenOrderedDict, Frozen, datetimeindex2num
 
 
@@ -59,36 +60,6 @@ def sync(self):
         pass
 
 
-def convert_to_cf_variable(array):
-    """Converts an XArray into an XArray suitable for saving as a netCDF
-    variable
-    """
-    data = array.data
-    attributes = array.attributes.copy()
-    if isinstance(data, pd.DatetimeIndex):
-        # DatetimeIndex objects need to be encoded into numeric arrays
-        (data, units, calendar) = datetimeindex2num(data)
-        attributes['units'] = units
-        attributes['calendar'] = calendar
-    elif data.dtype == np.dtype('O'):
-        # Unfortunately, pandas.Index arrays often have dtype=object even if
-        # they were created from an array with a sensible datatype (e.g.,
-        # pandas.Float64Index always has dtype=object for some reason). Because
-        # we allow for doing math with coordinates, these object arrays can
-        # propagate onward to other variables, which is why we don't only apply
-        # this check to XArrays with data that is a pandas.Index.
-        dtype = np.array(data.reshape(-1)[0]).dtype
-        # N.B. the "astype" call will fail if data cannot be cast to the type
-        # of its first element (which is probably the only sensible thing to
-        # do).
-        data = np.asarray(data).astype(dtype)
-    return xarray.XArray(array.dimensions, data, attributes)
-
-
-def convert_scipy_variable(var):
-    return xarray.XArray(var.dimensions, var.data, var._attributes)
-
-
 class ScipyDataStore(AbstractDataStore):
     """
     Stores data using the scipy.io.netcdf package.
@@ -101,7 +72,8 @@ def __init__(self, fobj, *args, **kwdargs):
 
     @property
     def variables(self):
-        return FrozenOrderedDict((k, convert_scipy_variable(v))
+        return FrozenOrderedDict((k, decode_cf_variable(v.dimensions, v.data,
+                                                        v._attributes))
                                  for k, v in self.ds.variables.iteritems())
 
     @property
@@ -119,30 +91,16 @@ def set_dimension(self, name, length):
         self.ds.createDimension(name, length)
 
     def _validate_attr_key(self, key):
-        if not conventions.is_valid_name(key):
+        if not is_valid_nc3_name(key):
            raise ValueError("Not a valid attribute name")
 
    def _cast_attr_value(self, value):
-        # Strings get special handling because netCDF treats them as
-        # character arrays. Everything else gets coerced to a numpy
-        # vector. netCDF treats scalars as 1-element vectors. Arrays of
-        # non-numeric type are not allowed.
        if isinstance(value, basestring):
-            # netcdf attributes should be unicode
            value = unicode(value)
        else:
-            try:
-                value = conventions.coerce_type(np.atleast_1d(np.asarray(value)))
-            except:
-                raise ValueError("Not a valid value for a netCDF attribute")
+            value = coerce_nc3_dtype(np.atleast_1d(value))
            if value.ndim > 1:
-                raise ValueError("netCDF attributes must be vectors " +
-                                 "(1-dimensional)")
-            value = conventions.coerce_type(value)
-            if str(value.dtype) not in conventions.TYPEMAP:
-                # A plain string attribute is okay, but an array of
-                # string objects is not okay!
-                raise ValueError("Can not convert to a valid netCDF type")
+                raise ValueError("netCDF attributes must be 1-dimensional")
        return value

    def set_attribute(self, key, value):
@@ -150,11 +108,8 @@ def set_attribute(self, key, value):
         setattr(self.ds, key, self._cast_attr_value(value))
 
     def set_variable(self, name, variable):
-        variable = convert_to_cf_variable(variable)
-        data = variable.data
-        dtype_convert = {'int64': 'int32', 'float64': 'float32'}
-        if str(data.dtype) in dtype_convert:
-            data = np.asarray(data, dtype=dtype_convert[str(data.dtype)])
+        variable = encode_cf_variable(variable)
+        data = coerce_nc3_dtype(variable.data)
         self.ds.createVariable(name, data.dtype, variable.dimensions)
         scipy_var = self.ds.variables[name]
         scipy_var[:] = data[:]
@@ -169,31 +124,18 @@ def sync(self):
         self.ds.flush()
 
 
-def convert_nc4_variable(var):
-    # we don't want to see scale_factor and add_offset in the attributes
-    # since the netCDF4 package automatically scales the data on read.
-    # If we kept scale_factor and add_offset around and did this:
-    #
-    # foo = ncdf4.Dataset('foo.nc')
-    # ncdf4.dump(foo, 'bar.nc')
-    # bar = ncdf4.Dataset('bar.nc')
-    #
-    # you would find that any packed variables in the original
-    # netcdf file would now have been scaled twice!
-    attr = OrderedDict((k, var.getncattr(k)) for k in var.ncattrs()
-                       if k not in ['scale_factor', 'add_offset'])
-    return xarray.XArray(var.dimensions, var, attr, indexing_mode='orthogonal')
-
-
 class NetCDF4DataStore(AbstractDataStore):
     def __init__(self, filename, *args, **kwdargs):
-        # TODO: set auto_maskandscale=True so we can handle the array
-        # packing/unpacking ourselves (using NaN instead of masked arrays)
         self.ds = nc4.Dataset(filename, *args, **kwdargs)
 
     @property
     def variables(self):
-        return FrozenOrderedDict((k, convert_nc4_variable(v))
+        def convert_variable(var):
+            attr = OrderedDict((k, var.getncattr(k)) for k in var.ncattrs())
+            var.set_auto_maskandscale(False)
+            return decode_cf_variable(var.dimensions, var, attr,
+                                      indexing_mode='orthogonal')
+        return FrozenOrderedDict((k, convert_variable(v))
                                  for k, v in self.ds.variables.iteritems())
 
     @property
@@ -217,7 +159,7 @@ def _cast_data(self, data):
         return data
 
     def set_variable(self, name, variable):
-        variable = convert_to_cf_variable(variable)
+        variable = encode_cf_variable(variable)
         # netCDF4 will automatically assign a fill value
         # depending on the datatype of the variable.  Here
         # we let the package handle the _FillValue attribute
@@ -228,6 +170,7 @@ def set_variable(self, name, variable):
             dimensions=variable.dimensions,
             fill_value=fill_value)
         nc4_var = self.ds.variables[name]
+        nc4_var.set_auto_maskandscale(False)
         nc4_var[:] = variable.data[:]
         nc4_var.setncatts(variable.attributes)
 
diff --git a/src/xray/common.py b/src/xray/common.py
index de4035fef7e..36d7032a122 100644
--- a/src/xray/common.py
+++ b/src/xray/common.py
@@ -35,19 +35,19 @@ def __len__(self):
         return len(self._data)
 
     def __nonzero__(self):
-        return bool(self._data)
+        return bool(self.data)
 
     def __float__(self):
-        return float(self._data)
+        return float(self.data)
 
     def __int__(self):
-        return int(self._data)
+        return int(self.data)
 
     def __complex__(self):
-        return complex(self._data)
+        return complex(self.data)
 
     def __long__(self):
-        return long(self._data)
+        return long(self.data)
 
     # adapted from pandas.NDFrame
     # https://github.com/pydata/pandas/blob/master/pandas/core/generic.py#L699
diff --git a/src/xray/conventions.py b/src/xray/conventions.py
index 637aa5943b4..1d0b022ba44 100644
--- a/src/xray/conventions.py
+++ b/src/xray/conventions.py
@@ -1,41 +1,11 @@
-import numpy as np
 import unicodedata
 
-NULL = '\x00'
-NC_BYTE = '\x00\x00\x00\x01'
-NC_CHAR = '\x00\x00\x00\x02'
-NC_SHORT = '\x00\x00\x00\x03'
-# netCDF-3 only supports 32-bit integers
-NC_INT = '\x00\x00\x00\x04'
-NC_FLOAT = '\x00\x00\x00\x05'
-NC_DOUBLE = '\x00\x00\x00\x06'
-
-# Map between netCDF type and numpy dtype and vice versa. Due to a bug
-# in the __hash__() method of numpy dtype objects (fixed in development
-# release of numpy), we need to explicitly match byteorder for dict
-# lookups to succeed. Here we normalize to native byte order.
-#
-# NC_CHAR is a special case because netCDF represents strings as
-# character arrays. When NC_CHAR is encountered as the type of an
-# attribute value, this TYPEMAP is not consulted and the data is read
-# as a string. However, when NC_CHAR is encountered as the type of a
-# variable, then the data is read is a numpy array of 1-char elements
-# (equivalently, length-1 raw "strings"). There is no support for numpy
-# arrays of multi-character strings.
-TYPEMAP = {
-    # we could use np.dtype's as key/values except __hash__ comparison of
-    # numpy.dtype is broken in older versions of numpy. If you must compare
-    # and cannot upgrade, use __eq__.This bug is
-    # known to be fixed in numpy version 1.3
-    NC_BYTE: 'int8',
-    NC_CHAR: '|S1',
-    NC_SHORT: 'int16',
-    NC_INT: 'int32',
-    NC_FLOAT: 'float32',
-    NC_DOUBLE: 'float64',
-    }
-for k in TYPEMAP.keys():
-    TYPEMAP[TYPEMAP[k]] = k
+import numpy as np
+import pandas as pd
+
+import xarray
+import utils
+
 
 # Special characters that are permitted in netCDF names except in the
 # 0th position of the string
@@ -43,61 +13,45 @@
 
 # The following are reserved names in CDL and may not be used as names of
 # variables, dimension, attributes
-_reserved_names = set([
-    'byte',
-    'char',
-    'short',
-    'ushort',
-    'int',
-    'uint',
-    'int64',
-    'uint64',
-    'float'
-    'real',
-    'double',
-    'bool',
-    'string',
-    ])
+_reserved_names = set(['byte', 'char', 'short', 'ushort', 'int', 'uint',
+                       'int64', 'uint64', 'float', 'real', 'double', 'bool',
+                       'string'])
+
 
 def pretty_print(x, numchars):
     """Given an object x, call x.__str__() and format the returned
     string so that it is numchars long, padding with trailing spaces or
     truncating with ellipses as necessary"""
     s = str(x)
     if len(s) > numchars:
         return s[:(numchars - 3)] + '...'
     else:
         return s
 
 
-def coerce_type(arr):
-    """Coerce a numeric data type to a type that is compatible with
-    netCDF-3
 
-    netCDF-3 can not handle 64-bit integers, but on most platforms
-    Python integers are int64. To work around this discrepancy, this
-    helper function coerces int64 arrays to int32. An exception is
-    raised if this coercion is not safe.
+def coerce_nc3_dtype(arr):
+    """Coerce an array to a data type that can be stored in a netCDF-3 file
 
-    netCDF-3 can not handle booleans, but booleans can be trivially
-    (albeit wastefully) represented as bytes. To work around this
-    discrepancy, this helper function coerces bool arrays to int8.
+    This function performs the following dtype conversions:
+        int64 -> int32
+        float64 -> float32
+        bool -> int8
+
+    Data is checked for equality, or equivalence with the default values of
+    `np.allclose`.
     """
-    # Comparing the char attributes of numpy dtypes is inelegant, but this is
-    # the fastest test of equivalence that is invariant to endianness
-    if arr.dtype.char == 'l':  # np.dtype('int64')
-        cast_arr = arr.astype(
-            np.dtype('int32').newbyteorder(arr.dtype.byteorder))
-        if not (cast_arr == arr).all():
-            raise ValueError("array contains integer values that " +
-                             "are not representable as 32-bit signed integers")
-        return cast_arr
-    elif arr.dtype.char == '?':  # np.dtype('bool')
-        # bool
-        cast_arr = arr.astype(
-            np.dtype('int8').newbyteorder(arr.dtype.byteorder))
-        return cast_arr
-    else:
-        return arr
+    dtype = str(arr.dtype)
+    dtype_map = {'int64': 'int32', 'float64': 'float32', 'bool': 'int8'}
+    if dtype in dtype_map:
+        new_dtype = dtype_map[dtype]
+        cast_arr = arr.astype(new_dtype)
+        if (('int' in dtype and not (cast_arr == arr).all())
+            or ('float' in dtype and not np.allclose(cast_arr, arr))):
+            raise ValueError('could not safely cast array from dtype %s to %s'
+                             % (dtype, new_dtype))
+        arr = cast_arr
+    return arr
+
 
 def _isalnumMUTF8(c):
     """Return True if the given UTF-8 encoded character is alphanumeric
@@ -105,10 +59,11 @@ def _isalnumMUTF8(c):
 
     Input is not checked!
     """
-    return (c.isalnum() or (len(c.encode('utf-8')) > 1))
+    return c.isalnum() or (len(c.encode('utf-8')) > 1)
+
 
-def is_valid_name(s):
-    """Test whether an object can be validly converted to a netCDF
+def is_valid_nc3_name(s):
+    """Test whether an object can be validly converted to a netCDF-3
     dimension, variable or attribute name
 
     Earlier versions of the netCDF C-library reference implementation
@@ -135,5 +90,137 @@ def is_valid_name(s):
         ('/' not in s) and
         (s[-1] != ' ') and
         (_isalnumMUTF8(s[0]) or (s[0] == '_')) and
-        all((_isalnumMUTF8(c) or c in _specialchars for c in s))
-        )
\ No newline at end of file
+        all((_isalnumMUTF8(c) or c in _specialchars for c in s)))
+
+
+class MaskedAndScaledArray(object):
+    """Wrapper around array-like objects to create a new indexable object where
+    values, when accessed, are automatically scaled and masked according to
+    CF conventions for packed and missing data values
+
+    New values are given by the formula:
+        original_values * scale_factor + add_offset
+
+    Values can only be accessed via `__getitem__`:
+
+    >>> x = MaskedAndScaledArray(np.array([-99, -1, 0, 1, 2]), -99, 0.01, 1)
+    >>> x
+    MaskedAndScaledArray(array([-99, -1, 0, 1, 2]), fill_value=-99,
+    scale_factor=0.01, add_offset=1)
+    >>> x[:]
+    array([  nan,  0.99,  1.  ,  1.01,  1.02])
+
+    References
+    ----------
+    http://www.unidata.ucar.edu/software/netcdf/docs/BestPractices.html
+    """
+    def __init__(self, array, fill_value=None, scale_factor=None,
+                 add_offset=None):
+        """
+        Parameters
+        ----------
+        array : array-like
+            Original array of values to wrap
+        fill_value : number, optional
+            All values equal to fill_value in the original array are replaced
+            by NaN.
+        scale_factor : number, optional
+            Multiply entries in the original array by this number.
+        add_offset : number, optional
+            After applying scale_factor, add this number to entries in the
+            original array.
+        """
+        self.array = array
+        self.scale_factor = scale_factor
+        self.add_offset = add_offset
+        self.fill_value = fill_value
+
+    @property
+    def dtype(self):
+        return np.dtype('float')
+
+    @property
+    def shape(self):
+        return self.array.shape
+
+    @property
+    def size(self):
+        return self.array.size
+
+    @property
+    def ndim(self):
+        return self.array.ndim
+
+    def __len__(self):
+        return len(self.array)
+
+    def __array__(self):
+        return self[...]
+
+    def __getitem__(self, key):
+        # cast to float to ensure NaN is meaningful
+        values = np.array(self.array[key], dtype=float, copy=True)
+        if self.fill_value is not None:
+            values[values == self.fill_value] = np.nan
+        if self.scale_factor is not None:
+            values *= self.scale_factor
+        if self.add_offset is not None:
+            values += self.add_offset
+        return values
+
+    def __repr__(self):
+        return ("%s(%r, fill_value=%r, scale_factor=%r, add_offset=%r)" %
+                (type(self).__name__, self.array, self.fill_value,
+                 self.scale_factor, self.add_offset))
+
+
+def encode_cf_variable(array):
+    """Converts an XArray into an XArray suitable for saving as a netCDF
+    variable
+    """
+    data = array.data
+    attributes = array.attributes.copy()
+    if isinstance(data, pd.DatetimeIndex):
+        # DatetimeIndex objects need to be encoded into numeric arrays
+        (data, units, calendar) = utils.datetimeindex2num(data)
+        attributes['units'] = units
+        attributes['calendar'] = calendar
+    elif data.dtype == np.dtype('O'):
+        # Unfortunately, pandas.Index arrays often have dtype=object even if
+        # they were created from an array with a sensible datatype (e.g.,
+        # pandas.Float64Index always has dtype=object for some reason). Because
+        # we allow for doing math with coordinates, these object arrays can
+        # propagate onward to other variables, which is why we don't only apply
+        # this check to XArrays with data that is a pandas.Index.
+        dtype = np.array(data.reshape(-1)[0]).dtype
+        # N.B. the "astype" call will fail if data cannot be cast to the type
+        # of its first element (which is probably the only sensible thing to
+        # do).
+        data = np.asarray(data).astype(dtype)
+
+    # unscale/mask
+    if any(k in attributes for k in ['add_offset', 'scale_factor']):
+        data = np.array(data, dtype=float, copy=True)
+        if 'add_offset' in attributes:
+            data -= attributes['add_offset']
+        if 'scale_factor' in attributes:
+            data /= attributes['scale_factor']
+
+    # restore original dtype
+    if 'encoded_dtype' in attributes:
+        data = data.astype(attributes.pop('encoded_dtype'))
+
+    return xarray.XArray(array.dimensions, data, attributes)
+
+
+def decode_cf_variable(dimensions, data, attributes, indexing_mode='numpy'):
+    attributes = attributes.copy()
+    attributes['encoded_dtype'] = data.dtype
+
+    mask_and_scale_attrs = ['_FillValue', 'scale_factor', 'add_offset']
+    if any(k in attributes for k in mask_and_scale_attrs):
+        data = MaskedAndScaledArray(data, attributes.get('_FillValue'),
+                                    attributes.get('scale_factor'),
+                                    attributes.get('add_offset'))
+
+    return xarray.XArray(dimensions, data, attributes, indexing_mode)
diff --git a/src/xray/utils.py b/src/xray/utils.py
index d2a3f7c15b8..b2b0f290257 100644
--- a/src/xray/utils.py
+++ b/src/xray/utils.py
@@ -169,7 +169,7 @@ def xarray_equal(v1, v2, rtol=1e-05, atol=1e-08):
     """True if two objects have the same dimensions, attributes and data;
     otherwise False.
 
-    This function is necessary because `v1 == v2` for variables and dataviews
+    This function is necessary because `v1 == v2` for XArrays and DatasetArrays
     does element-wise comparisons (like numpy.ndarrays).
     """
     if (v1.dimensions == v2.dimensions
diff --git a/src/xray/xarray.py b/src/xray/xarray.py
index f4c3fa718c0..e06a5a8e1c4 100644
--- a/src/xray/xarray.py
+++ b/src/xray/xarray.py
@@ -324,7 +324,8 @@ def reduce(self, func, dimension=None, axis=None, **kwargs):
             for dim in dimension:
                 var = var._reduce(func, dim, **kwargs)
         else:
-            var = type(self)([], func(self.data, **kwargs), self.attributes)
+            var = type(self)([], func(self.data, **kwargs),
+                             self._math_safe_attributes())
             var._append_to_cell_methods(': '.join(self.dimensions)
                                         + ': ' + func.__name__)
         return var
@@ -342,7 +343,7 @@ def _reduce(self, f, dim, **kwargs):
         dims = tuple(dim for i, dim in enumerate(self.dimensions)
                      if axis not in [i, i - self.ndim])
         data = f(self.data, axis=axis, **kwargs)
-        new_var = type(self)(dims, data, self.attributes)
+        new_var = type(self)(dims, data, self._math_safe_attributes())
         new_var._append_to_cell_methods(self.dimensions[axis]
                                         + ': ' + f.__name__)
         return new_var
@@ -468,12 +469,16 @@ def from_stack(cls, variables, dimension='stacked_dimension',
     def __array_wrap__(self, obj, context=None):
         return type(self)(self.dimensions, obj, self.attributes)
 
+    def _math_safe_attributes(self):
+        return OrderedDict((k, v) for k, v in self.attributes.iteritems()
+                           if k not in ['units', 'encoded_dtype'])
+
     @staticmethod
     def _unary_op(f):
         @functools.wraps(f)
         def func(self, *args, **kwargs):
             return type(self)(self.dimensions, f(self.data, *args, **kwargs),
-                              self.attributes)
+                              self._math_safe_attributes())
         return func
 
     @staticmethod
@@ -486,11 +491,11 @@ def func(self, other):
             new_data = (f(self_data, other_data)
                         if not reflexive
                         else f(other_data, self_data))
-            if hasattr(other, 'attributes'):
-                new_attr = utils.ordered_dict_intersection(self.attributes,
-                                                           other.attributes)
-            else:
-                new_attr = self.attributes
+            new_attr = self._math_safe_attributes()
+            # TODO: reconsider handling of conflicting attributes
+            if hasattr(other, '_math_safe_attributes'):
+                new_attr = utils.ordered_dict_intersection(
+                    new_attr, other._math_safe_attributes())
             return type(self)(dims, new_data, new_attr)
         return func
 
@@ -503,8 +508,9 @@ def func(self, other):
                 raise ValueError('dimensions cannot change for in-place '
                                  'operations')
             self.data = f(self_data, other_data)
-            if hasattr(other, 'attributes'):
-                utils.remove_incompatible_items(self.attributes, other)
+            if hasattr(other, '_math_safe_attributes'):
+                utils.remove_incompatible_items(
+                    self.attributes, other._math_safe_attributes())
             return self
         return func
 
diff --git a/test/__init__.py b/test/__init__.py
index b4133911543..0195431894f 100644
--- a/test/__init__.py
+++ b/test/__init__.py
@@ -15,6 +15,17 @@ def assertXArrayNotEqual(self, v1, v2):
     def assertArrayEqual(self, a1, a2):
         assert_array_equal(a1, a2)
 
+    def assertDatasetEqual(self, d1, d2):
+        # this method is functionally equivalent to `assert d1 == d2`, but it
+        # checks each aspect of equality separately for easier debugging
+        self.assertEqual(sorted(d1.attributes.items()),
+                         sorted(d2.attributes.items()))
+        self.assertEqual(sorted(d1.variables), sorted(d2.variables))
+        for k in d1:
+            v1 = d1.variables[k]
+            v2 = d2.variables[k]
+            self.assertXArrayEqual(v1, v2)
+
 
 class ReturnItem(object):
     def __getitem__(self, key):
diff --git a/test/test_dataset.py b/test/test_dataset.py
index b1f63ca462b..fff724df923 100644
--- a/test/test_dataset.py
+++ b/test/test_dataset.py
@@ -1,10 +1,10 @@
 from collections import OrderedDict
-from copy import deepcopy
 from cStringIO import StringIO
 import os.path
 import unittest
 import tempfile
 
+import netCDF4 as nc4
 import numpy as np
 import pandas as pd
 
@@ -21,8 +21,8 @@
 _testvar = sorted(_vars.keys())[0]
 _testdim = sorted(_dims.keys())[0]
 
-def create_test_data(store=None):
-    obj = Dataset() if store is None else Dataset.load_store(store)
+def create_test_data():
+    obj = Dataset()
     obj['time'] = ('time', pd.date_range('2000-01-01', periods=1000))
     for k, d in sorted(_dims.items()):
         obj[k] = (k, np.arange(d))
@@ -37,7 +37,7 @@ def get_store(self):
         return backends.InMemoryDataStore()
 
     def test_repr(self):
-        data = create_test_data(self.get_store())
+        data = create_test_data()
         self.assertEqual('', repr(data))
@@ -51,7 +51,7 @@ def test_init(self):
             Dataset({'a': var1, 'x': var3})
 
     def test_groupby(self):
-        data = create_test_data(self.get_store())
+        data = create_test_data()
         for n, (t, sub) in enumerate(list(data.groupby('dim1'))[:3]):
             self.assertEqual(data['dim1'][n], t)
             self.assertXArrayEqual(data['var1'][n], sub['var1'])
@@ -142,7 +142,7 @@ def test_attributes(self):
         self.assertRaises(ValueError, b.attributes.__setitem__, 'foo', dict())
 
     def test_indexed_by(self):
-        data = create_test_data(self.get_store())
+        data = create_test_data()
         slicers = {'dim1': slice(None, None, 2), 'dim2': slice(0, 2)}
         ret = data.indexed_by(**slicers)
 
@@ -181,7 +181,7 @@ def test_indexed_by(self):
         self.assertItemsEqual({'dim2': 5, 'dim3': 10}, ret.dimensions)
 
     def test_labeled_by(self):
-        data = create_test_data(self.get_store())
+        data = create_test_data()
         int_slicers = {'dim1': slice(None, None, 2), 'dim2': slice(0, 2)}
         loc_slicers = {'dim1': slice(None, None, 2), 'dim2': slice(0, 1)}
         self.assertEqual(data.indexed_by(**int_slicers),
@@ -199,7 +199,7 @@ def test_labeled_by(self):
             time=pd.date_range('2000-01-01', periods=3)))
 
     def test_variable_indexing(self):
-        data = create_test_data(self.get_store())
+        data = create_test_data()
         v = data['var1']
         d1 = data['dim1']
         d2 = data['dim2']
@@ -212,7 +212,7 @@ def test_variable_indexing(self):
         self.assertXArrayEqual(v[:3, :2], v[range(3), range(2)])
 
     def test_select(self):
-        data = create_test_data(self.get_store())
+        data = create_test_data()
         ret = data.select(_testvar)
         self.assertXArrayEqual(data[_testvar], ret[_testvar])
         self.assertTrue(_vars.keys()[1] not in ret.variables)
@@ -223,7 +223,7 @@ def test_unselect(self):
         pass
 
     def test_copy(self):
-        data = create_test_data(self.get_store())
+        data = create_test_data()
         var = data.variables[_testvar]
         var.attributes['foo'] = 'hello world'
         var_copy = var.__deepcopy__()
@@ -239,7 +239,7 @@ def test_copy(self):
         self.assertNotEqual(id(var.attributes), id(var_copy.attributes))
 
     def test_rename(self):
-        data = create_test_data(self.get_store())
+        data = create_test_data()
         newnames = {'var1': 'renamed_var1', 'dim2': 'renamed_dim2'}
         renamed = data.renamed(newnames)
 
@@ -264,7 +264,7 @@ def test_rename(self):
         self.assertTrue('dim2' not in renamed.dimensions)
 
     def test_merge(self):
-        data = create_test_data(self.get_store())
+        data = create_test_data()
         ds1 = data.select('var1')
         ds2 = data.select('var3')
         expected = data.select('var1', 'var3')
@@ -276,7 +276,7 @@ def test_merge(self):
             ds1.merge(ds2.renamed({'var3': 'var1'}))
 
     def test_getitem(self):
-        data = create_test_data(self.get_store())
+        data = create_test_data()
         data['time'] = ('time', np.arange(1000, dtype=np.int32),
                         {'units': 'days since 2000-01-01'})
         self.assertIsInstance(data['var1'], DatasetArray)
@@ -292,7 +292,7 @@ def test_getitem(self):
     def test_setitem(self):
         # assign a variable
         var = XArray(['dim1'], np.random.randn(100))
-        data1 = create_test_data(self.get_store())
+        data1 = create_test_data()
         data1['A'] = var
         data2 = data1.copy()
         data2['A'] = var
@@ -311,7 +311,7 @@ def test_write_store(self):
         store = self.get_store()
         expected.dump_to_store(store)
         actual = Dataset.load_store(store)
-        self.assertEquals(expected, actual)
+        self.assertDatasetEqual(expected, actual)
 
     def test_to_dataframe(self):
         x = np.random.randn(10)
@@ -337,27 +337,68 @@ def test_to_dataframe(self):
         self.assertTrue(expected.equals(actual))
 
 
+def create_masked_and_scaled_data():
+    x = np.array([np.nan, np.nan, 10, 10.1, 10.2])
+    attr = {'_FillValue': -1, 'add_offset': 10, 'scale_factor': 0.1,
+            'encoded_dtype': np.int16}
+    return Dataset({'x': ('t', x, attr)})
+
+
 class NetCDF4DataTest(DataTest):
     def get_store(self):
         f, self.tmp_file = tempfile.mkstemp(suffix='.nc')
         os.close(f)
         return backends.NetCDF4DataStore(self.tmp_file, mode='w')
 
+    def tearDown(self):
+        if hasattr(self, 'tmp_file') and os.path.exists(self.tmp_file):
+            os.remove(self.tmp_file)
+
     def test_dump_and_open_dataset(self):
-        data = create_test_data(self.get_store())
+        for data in [create_test_data(), create_masked_and_scaled_data()]:
+            f, tmp_file = tempfile.mkstemp(suffix='.nc')
+            os.close(f)
+            data.dump(tmp_file)
+
+            expected = data.copy()
+            actual = open_dataset(tmp_file)
+            self.assertDatasetEqual(expected, actual)
+            os.remove(tmp_file)
+
+    def test_mask_and_scale(self):
         f, tmp_file = tempfile.mkstemp(suffix='.nc')
         os.close(f)
-        data.dump(tmp_file)
 
-        expected = data.copy()
-        actual = open_dataset(tmp_file)
-        self.assertEquals(expected, actual)
+        nc = nc4.Dataset(tmp_file, mode='w')
+        nc.createDimension('t', 5)
+        nc.createVariable('x', 'int16', ('t',), fill_value=-1)
+        v = nc.variables['x']
+        v.set_auto_maskandscale(False)
+        v.add_offset = 10
+        v.scale_factor = 0.1
+        v[:] = np.array([-1, -1, 0, 1, 2])
+        nc.close()
+
+        # first make sure netCDF4 reads the masked and scaled data correctly
+        nc = nc4.Dataset(tmp_file, mode='r')
+        expected = np.ma.array([-1, -1, 10, 10.1, 10.2],
+                               mask=[True, True, False, False, False])
+        actual = nc.variables['x'][:]
+        self.assertArrayEqual(expected, actual)
+
+        def replace_nan(x):
+            # replace NaN with a sentinel value because xarray_equal is not
+            # NaN safe
+            x[np.isnan(x)] = -9999
+            return x
+
+        # now check xray
+        ds = open_dataset(tmp_file)
+        expected = create_masked_and_scaled_data()
+        self.assertDatasetEqual(replace_nan(expected['x']).dataset,
+                                replace_nan(ds['x']).dataset)
         os.remove(tmp_file)
 
-    def tearDown(self):
-        if hasattr(self, 'tmp_file') and os.path.exists(self.tmp_file):
-            os.remove(self.tmp_file)
-
 
 class ScipyDataTest(DataTest):
     def get_store(self):
@@ -365,12 +406,11 @@ def get_store(self):
         return backends.ScipyDataStore(fobj, 'w')
 
     def test_dump_and_open_dataset(self):
-        data = create_test_data(self.get_store())
-        serialized = data.dumps()
-
-        expected = data.copy()
-        actual = open_dataset(StringIO(serialized))
-        self.assertEquals(expected, actual)
+        for data in [create_test_data(), create_masked_and_scaled_data()]:
+            serialized = data.dumps()
+            expected = data.copy()
+            actual = open_dataset(StringIO(serialized))
+            self.assertDatasetEqual(expected, actual)
 
     def test_open_and_reopen_existing(self):
         data = open_dataset(os.path.join(_test_data_path, 'example_1.nc'))
@@ -378,7 +418,10 @@ def test_open_and_reopen_existing(self):
 
         expected = data.copy()
         actual = open_dataset(StringIO(serialized))
-        self.assertEquals(expected, actual)
+        # TODO: remove these print statements:
+        print data
+        print actual
+        self.assertDatasetEqual(expected, actual)
 
     def test_repr(self):
         # scipy.io.netcdf does not keep track of dimension order :(
diff --git a/test/test_xarray.py b/test/test_xarray.py
index f91642e6404..f5323db3a74 100644
--- a/test/test_xarray.py
+++ b/test/test_xarray.py
@@ -123,10 +123,11 @@ def test_1d_math(self):
         self.assertArrayEqual((x * v).data, x ** 2)
         self.assertArrayEqual(v - y, v - 1)
         self.assertArrayEqual(y - v, 1 - v)
-        # verify attributes
+        # verify math-safe attributes
         v2 = XArray(['x'], x, {'units': 'meters'})
-        self.assertXArrayEqual(v2, +v2)
-        self.assertXArrayEqual(v2, 0 + v2)
+        self.assertXArrayEqual(v, +v2)
+        v3 = XArray(['x'], x, {'something': 'else'})
+        self.assertXArrayEqual(v3, +v3)
         # binary ops with all variables
         self.assertArrayEqual(v + v, 2 * v)
         w = XArray(['x'], y, {'foo': 'bar'})

From 75bce5bb9c31e38e3599f3200e839832a90df521 Mon Sep 17 00:00:00 2001
From: Stephan Hoyer
Date: Thu, 27 Feb 2014 12:11:58 -0800
Subject: [PATCH 2/8] Save serialization info to XArray.encoding

---
 src/xray/backends.py     | 27 +++++++++--------
 src/xray/conventions.py  | 54 ++++++++++++++++++++-------------
 src/xray/utils.py        | 30 +++++++++++++++----
 src/xray/xarray.py       | 65 ++++++++++++++++++++++++----------------
 test/test_conventions.py | 25 ++++++++++++++++
 test/test_dataset.py     |  9 ++----
 6 files changed, 142 insertions(+), 68 deletions(-)
 create mode 100644 test/test_conventions.py

diff --git a/src/xray/backends.py b/src/xray/backends.py
index 2b75db133a0..fa2bb1ac897 100644
--- a/src/xray/backends.py
+++ b/src/xray/backends.py
@@ -67,8 +67,11 @@ class ScipyDataStore(AbstractDataStore):
     be initialized with a StringIO object, allow for
     serialization.
     """
-    def __init__(self, fobj, *args, **kwdargs):
-        self.ds = netcdf.netcdf_file(fobj, *args, **kwdargs)
+    def __init__(self, filename_or_obj, mode='r', mmap=None, version=1,
+                 decode_mask_and_scale=True):
+        self.ds = netcdf.netcdf_file(filename_or_obj, mode=mode, mmap=mmap,
+                                     version=version)
+        self.decode_mask_and_scale = decode_mask_and_scale
 
     @property
     def variables(self):
@@ -125,16 +128,21 @@ def sync(self):
 
 
 class NetCDF4DataStore(AbstractDataStore):
-    def __init__(self, filename, *args, **kwdargs):
-        self.ds = nc4.Dataset(filename, *args, **kwdargs)
+    def __init__(self, filename, mode='r', clobber=True, diskless=False,
+                 persist=False, format='NETCDF4', decode_mask_and_scale=True):
+        self.ds = nc4.Dataset(filename, mode=mode, clobber=clobber,
+                              diskless=diskless, persist=persist,
+                              format=format)
+        self.decode_mask_and_scale = decode_mask_and_scale
 
     @property
     def variables(self):
         def convert_variable(var):
             attr = OrderedDict((k, var.getncattr(k)) for k in var.ncattrs())
             var.set_auto_maskandscale(False)
-            return decode_cf_variable(var.dimensions, var, attr,
-                                      indexing_mode='orthogonal')
+            return decode_cf_variable(
+                var.dimensions, var, attr, indexing_mode='orthogonal',
+                decode_mask_and_scale=self.decode_mask_and_scale)
         return FrozenOrderedDict((k, convert_variable(v))
                                  for k, v in self.ds.variables.iteritems())
 
@@ -153,18 +161,13 @@ def set_dimension(self, name, length):
     def set_attribute(self, key, value):
         self.ds.setncatts({key: value})
 
-    def _cast_data(self, data):
-        if isinstance(data, pd.DatetimeIndex):
-            data = datetimeindex2num(data)
-        return data
-
     def set_variable(self, name, variable):
         variable = encode_cf_variable(variable)
         # netCDF4 will automatically assign a fill value
         # depending on the datatype of the variable.  Here
         # we let the package handle the _FillValue attribute
         # instead of setting it ourselves.
-        fill_value = variable.attributes.pop('_FillValue', None)
+        fill_value = variable.encoding.get('_FillValue')
         self.ds.createVariable(varname=name,
                                datatype=variable.dtype,
                                dimensions=variable.dimensions,
diff --git a/src/xray/conventions.py b/src/xray/conventions.py
index 1d0b022ba44..d14ca39fa93 100644
--- a/src/xray/conventions.py
+++ b/src/xray/conventions.py
@@ -37,16 +37,18 @@ def coerce_nc3_dtype(arr):
         float64 -> float32
         bool -> int8
 
-    Data is checked for equality, or equivalence with the default values of
-    `np.allclose`.
+    Data is checked for equality, or equivalence (non-NaN values) with
+    `np.allclose` with the default keyword arguments.
     """
     dtype = str(arr.dtype)
     dtype_map = {'int64': 'int32', 'float64': 'float32', 'bool': 'int8'}
     if dtype in dtype_map:
         new_dtype = dtype_map[dtype]
+        # TODO: raise a warning whenever casting the data-type instead?
         cast_arr = arr.astype(new_dtype)
         if (('int' in dtype and not (cast_arr == arr).all())
-            or ('float' in dtype and not np.allclose(cast_arr, arr))):
+            or ('float' in dtype and
+                not utils.allclose_or_equiv(cast_arr, arr))):
             raise ValueError('could not safely cast array from dtype %s to %s'
                              % (dtype, new_dtype))
         arr = cast_arr
@@ -199,28 +201,40 @@ def encode_cf_variable(array):
         data = np.asarray(data).astype(dtype)
 
     # unscale/mask
-    if any(k in attributes for k in ['add_offset', 'scale_factor']):
+    encoding = array.encoding
+    if any(k in encoding for k in ['add_offset', 'scale_factor']):
         data = np.array(data, dtype=float, copy=True)
-        if 'add_offset' in attributes:
-            data -= attributes['add_offset']
-        if 'scale_factor' in attributes:
-            data /= attributes['scale_factor']
+        if 'add_offset' in encoding:
+            data -= encoding['add_offset']
+            attributes['add_offset'] = encoding['add_offset']
+        if 'scale_factor' in encoding:
+            data /= encoding['scale_factor']
+            attributes['add_offset'] = encoding['add_offset']
 
     # restore original dtype
-    if 'encoded_dtype' in attributes:
-        data = data.astype(attributes.pop('encoded_dtype'))
+    if 'dtype' in encoding:
+        data = data.astype(encoding['dtype'])
 
-    return xarray.XArray(array.dimensions, data, attributes)
+    return xarray.XArray(array.dimensions, data, attributes, encoding)
 
 
-def decode_cf_variable(dimensions, data, attributes, indexing_mode='numpy'):
-    attributes = attributes.copy()
-    attributes['encoded_dtype'] = data.dtype
+def decode_cf_variable(dimensions, data, attributes, indexing_mode='numpy',
+                       decode_mask_and_scale=True):
+    def pop_to(k, source, dest):
+        v = source.pop(k, None)
+        if v is not None:
+            dest[k] = v
+        return v
 
-    mask_and_scale_attrs = ['_FillValue', 'scale_factor', 'add_offset']
-    if any(k in attributes for k in mask_and_scale_attrs):
-        data = MaskedAndScaledArray(data, attributes.get('_FillValue'),
-                                    attributes.get('scale_factor'),
-                                    attributes.get('add_offset'))
+    encoding = {'dtype': data.dtype}
 
-    return xarray.XArray(dimensions, data, attributes, indexing_mode)
+    if decode_mask_and_scale:
+        fill_value = pop_to('_FillValue', attributes, encoding)
+        scale_factor = pop_to('scale_factor', attributes, encoding)
+        add_offset = pop_to('add_offset', attributes, encoding)
+        if (fill_value is not None or scale_factor is not None
+            or add_offset is not None):
+            data = MaskedAndScaledArray(data, fill_value, scale_factor,
+                                        add_offset)
+
+    return xarray.XArray(dimensions, data, attributes, indexing_mode, encoding)
diff --git a/src/xray/utils.py b/src/xray/utils.py
index b2b0f290257..0e46c5a5830 100644
--- a/src/xray/utils.py
+++ b/src/xray/utils.py
@@ -165,6 +165,24 @@ def datetimeindex2num(dates, units=None, calendar=None):
     return (num, units, calendar)
 
 
+def allclose_or_equiv(arr1, arr2, rtol=1e-5, atol=1e-8):
+    """Like np.allclose, but also allows values to NaN in both arrays
+    """
+    if arr1.shape != arr2.shape:
+        return False
+    nan_indices = np.isnan(arr1)
+    if not (nan_indices == np.isnan(arr2)).all():
+        return False
+    else:
+        if arr1.ndim > 0:
+            arr1 = arr1[~nan_indices]
+            arr2 = arr2[~nan_indices]
+        elif nan_indices:
+            # 0-d arrays can't be indexed, so just check if the value is NaN
+            return True
+        return np.allclose(arr1, arr2, rtol=rtol, atol=atol)
+
+
 def xarray_equal(v1, v2, rtol=1e-05, atol=1e-08):
     """True if two objects have the same dimensions, attributes and data;
     otherwise False.
@@ -182,17 +200,19 @@ def xarray_equal(v1, v2, rtol=1e-05, atol=1e-08):
         # _data is not part of the public interface, so it's okay if its
         # missing
         pass
-    # TODO: replace this with a NaN safe version.
-    # see: pandas.core.common.array_equivalent
+
+    def is_floating(arr):
+        return np.issubdtype(arr.dtype, float)
+
     data1 = v1.data
     data2 = v2.data
     if hasattr(data1, 'equals'):
         # handle pandas.Index objects
         return data1.equals(data2)
-    elif np.issubdtype(data1.dtype, (str, object)):
-        return np.array_equal(data1, data2)
+    elif is_floating(data1) or is_floating(data2):
+        return allclose_or_equiv(data1, data2)
     else:
-        return np.allclose(data1, data2, rtol=rtol, atol=atol)
+        return np.array_equal(data1, data2)
     else:
         return False
diff --git a/src/xray/xarray.py b/src/xray/xarray.py
index e06a5a8e1c4..693d0ae2420 100644
--- a/src/xray/xarray.py
+++ b/src/xray/xarray.py
@@ -38,7 +38,8 @@ class XArray(AbstractArray):
     outside the context of its parent Dataset (if you want such a fully
     described object, use a DatasetArray instead).
     """
-    def __init__(self, dims, data, attributes=None, indexing_mode='numpy'):
+    def __init__(self, dims, data, attributes=None, indexing_mode='numpy',
+                 encoding=None):
         """
         Parameters
         ----------
@@ -56,9 +57,15 @@ def __init__(self, dims, data, attributes=None, indexing_mode='numpy'):
             (with arrays). Two modes are supported: 'numpy' (fancy indexing
             like numpy.ndarray objects) and 'orthogonal' (array indexing
             accesses different dimensions independently, like netCDF4
-            variables). Accessing data from a Array always uses orthogonal
+            variables). Accessing data from an XArray always uses orthogonal
             indexing, so `indexing_mode` tells the variable whether index
             lookups need to be internally converted to numpy-style indexing.
+        encoding : dict_like or None, optional
+            Dictionary specifying how to encode this array's data into a
+            serialized format like netCDF4. Currently used keys (for netCDF)
+            include '_FillValue', 'scale_factor', 'add_offset' and 'dtype'.
+            Well-behaved code to serialize an XArray should ignore
+            unrecognized keys in this dictionary.
         """
         if isinstance(dims, basestring):
             dims = (dims,)
@@ -71,6 +78,7 @@ def __init__(self, dims, data, attributes=None, indexing_mode='numpy'):
             attributes = {}
         self._attributes = OrderedDict(attributes)
         self._indexing_mode = indexing_mode
+        self.encoding = dict({} if encoding is None else encoding)
 
     @property
     def data(self):
@@ -149,7 +157,8 @@ def __getitem__(self, key):
         # return a variable with the same indexing_mode, because data should
         # still be the same type as _data
         return type(self)(dimensions, new_data, self.attributes,
-                          indexing_mode=self._indexing_mode)
+                          indexing_mode=self._indexing_mode,
+                          encoding=self.encoding)
 
     def __setitem__(self, key, value):
         """__setitem__ is overloaded to access the underlying numpy data with
@@ -181,7 +190,8 @@ def _copy(self, deepcopy=False):
         # note:
         # dimensions is already an immutable tuple
         # attributes will be copied when the new Array is created
-        return type(self)(self.dimensions, data, self.attributes)
+        return type(self)(self.dimensions, data, self.attributes,
+                          encoding=self.encoding)
 
     def __copy__(self):
         return self._copy(deepcopy=False)
@@ -274,7 +284,8 @@ def transpose(self, *dimensions):
             dimensions = self.dimensions[::-1]
         axes = [self.dimensions.index(dim) for dim in dimensions]
         data = self.data.transpose(*axes)
-        return type(self)(dimensions, data, self.attributes)
+        return type(self)(dimensions, data, self.attributes,
+                          encoding=self.encoding)
 
     def reduce(self, func, dimension=None, axis=None, **kwargs):
         """Reduce this array by applying `func` along some dimension(s).
@@ -325,7 +336,7 @@ def reduce(self, func, dimension=None, axis=None, **kwargs):
                 var = var._reduce(func, dim, **kwargs)
         else:
             var = type(self)([], func(self.data, **kwargs),
-                             self._math_safe_attributes())
+                             _math_safe_attributes(self.attributes))
             var._append_to_cell_methods(': '.join(self.dimensions)
                                         + ': ' + func.__name__)
         return var
@@ -343,7 +354,8 @@ def _reduce(self, f, dim, **kwargs):
         dims = tuple(dim for i, dim in enumerate(self.dimensions)
                      if axis not in [i, i - self.ndim])
         data = f(self.data, axis=axis, **kwargs)
-        new_var = type(self)(dims, data, self._math_safe_attributes())
+        new_var = type(self)(dims, data,
+                             _math_safe_attributes(self.attributes))
         new_var._append_to_cell_methods(self.dimensions[axis]
                                         + ': ' + f.__name__)
         return new_var
@@ -469,16 +481,12 @@ def from_stack(cls, variables, dimension='stacked_dimension',
     def __array_wrap__(self, obj, context=None):
         return type(self)(self.dimensions, obj, self.attributes)
 
-    def _math_safe_attributes(self):
-        return OrderedDict((k, v) for k, v in self.attributes.iteritems()
-                           if k not in ['units', 'encoded_dtype'])
-
     @staticmethod
     def _unary_op(f):
         @functools.wraps(f)
         def func(self, *args, **kwargs):
             return type(self)(self.dimensions, f(self.data, *args, **kwargs),
-                              self._math_safe_attributes())
+                              _math_safe_attributes(self.attributes))
         return func
 
     @staticmethod
@@ -491,11 +499,11 @@ def func(self, other):
             new_data = (f(self_data, other_data)
                         if not reflexive
                         else f(other_data, self_data))
-            new_attr = self._math_safe_attributes()
+            new_attr = _math_safe_attributes(self.attributes)
             # TODO: reconsider handling of conflicting attributes
-            if hasattr(other, '_math_safe_attributes'):
+            if hasattr(other, 'attributes'):
                 new_attr = utils.ordered_dict_intersection(
-                    new_attr, other._math_safe_attributes())
+                    new_attr, _math_safe_attributes(other.attributes))
             return type(self)(dims, new_data, new_attr)
         return func
 
@@ -508,27 +516,32 @@ def func(self, other):
                 raise ValueError('dimensions cannot change for in-place '
                                  'operations')
             self.data = f(self_data, other_data)
-            if hasattr(other, '_math_safe_attributes'):
+            if hasattr(other, 'attributes'):
                 utils.remove_incompatible_items(
-                    self.attributes, other._math_safe_attributes())
+                    self.attributes, _math_safe_attributes(other.attributes))
             return self
         return func
 
 
 ops.inject_special_operations(XArray)
 
 
+def _math_safe_attributes(attributes):
+    return OrderedDict((k, v) for k, v in attributes.iteritems()
+                       if k not in ['units'])
+
+
 def broadcast_xarrays(first, second):
     """Given two XArrays, return two XArrays with matching dimensions and numpy
     broadcast compatible data.
 
     Parameters
     ----------
-    first, second : Array
-        Array objects to broadcast.
+    first, second : XArray
+        XArray objects to broadcast.
 
     Returns
     -------
-    first_broadcast, second_broadcast : Array
+    first_broadcast, second_broadcast : XArray
         Broadcast arrays. The data on each variable will be a view of the data
         on the corresponding original arrays, but dimensions will be reordered
         and inserted so that both broadcast arrays have the same
@@ -558,21 +571,23 @@ def broadcast_xarrays(first, second):
     # expand first_data's dimensions so it's broadcast compatible after
     # adding second's dimensions at the end
     first_data = first.data[(Ellipsis,) + (None,) * len(second_only_dims)]
-    new_first = XArray(dimensions, first_data, first.attributes)
+    new_first = XArray(dimensions, first_data, first.attributes,
+                       encoding=first.encoding)
     # expand and reorder second_data so the dimensions line up
     first_only_dims = [d for d in dimensions if d not in second.dimensions]
     second_dims = list(second.dimensions) + first_only_dims
     second_data = second.data[(Ellipsis,) + (None,) * len(first_only_dims)]
-    new_second = XArray(second_dims, second_data, first.attributes
-                        ).transpose(*dimensions)
+    new_second = XArray(second_dims, second_data, first.attributes,
+                        encoding=second.encoding).transpose(*dimensions)
     return new_first, new_second
 
 
 def _broadcast_xarray_data(self, other):
     if isinstance(other, dataset.Dataset):
         raise TypeError('datasets do not support mathematical operations')
-    elif all(hasattr(other, attr) for attr in ['dimensions', 'data', 'shape']):
-        # `other` satisfies the xray.Array API
+    elif all(hasattr(other, attr) for attr
+             in ['dimensions', 'data', 'shape', 'encoding']):
+        # `other` satisfies the necessary xray.Array API for broadcast_xarrays
         new_self, new_other = broadcast_xarrays(self, other)
         self_data = new_self.data
         other_data = new_other.data
diff --git a/test/test_conventions.py b/test/test_conventions.py
new file mode 100644
index 00000000000..58daf09eae5
--- /dev/null
+++ b/test/test_conventions.py
@@ -0,0 +1,25 @@
+import numpy as np
+
+from xray.conventions import MaskedAndScaledArray
+from . import TestCase
+
+
+class TestMaskedAndScaledArray(TestCase):
+    def test(self):
+        x = MaskedAndScaledArray(np.arange(3), fill_value=0)
+        self.assertEqual(x.dtype, np.dtype('float'))
+        self.assertEqual(x.shape, (3,))
+        self.assertEqual(x.size, 3)
+        self.assertEqual(x.ndim, 1)
+        self.assertEqual(len(x), 3)
+        self.assertArrayEqual([np.nan, 1, 2], x)
+
+        x = MaskedAndScaledArray(np.arange(3), add_offset=1)
+        self.assertArrayEqual(np.arange(3) + 1, x)
+
+        x = MaskedAndScaledArray(np.arange(3), scale_factor=2)
+        self.assertArrayEqual(2 * np.arange(3), x)
+
+        x = MaskedAndScaledArray(np.array([-99, -1, 0, 1, 2]), -99, 0.01, 1)
+        expected = np.array([np.nan, 0.99, 1, 1.01, 1.02])
+        self.assertArrayEqual(expected, x)
diff --git a/test/test_dataset.py b/test/test_dataset.py
index fff724df923..28029a83590 100644
--- a/test/test_dataset.py
+++ b/test/test_dataset.py
@@ -339,9 +339,9 @@ def test_to_dataframe(self):
 
 def create_masked_and_scaled_data():
     x = np.array([np.nan, np.nan, 10, 10.1, 10.2])
-    attr = {'_FillValue': -1, 'add_offset': 10, 'scale_factor': 0.1,
-            'encoded_dtype': np.int16}
-    return Dataset({'x': ('t', x, attr)})
+    encoding = {'_FillValue': -1, 'add_offset': 10, 'scale_factor': 0.1,
+                'dtype': np.int16}
+    return Dataset({'x': ('t', x, {}, encoding)})
 
 
 class NetCDF4DataTest(DataTest):
@@ -418,9 +418,6 @@ def test_open_and_reopen_existing(self):
 
         expected = data.copy()
         actual = open_dataset(StringIO(serialized))
-        # TODO: remove these print statements:
-        print data
-        print actual
         self.assertDatasetEqual(expected, actual)
 
     def test_repr(self):

From 8cc82580e83762d42c48a32bd378daf472df528b Mon Sep 17 00:00:00 2001
From: Stephan Hoyer
Date: Thu, 27 Feb 2014 12:59:27 -0800
Subject: [PATCH 3/8] Changes in response to github comments

---
 src/xray/backends.py    | 10 +++++-----
 src/xray/conventions.py | 32 +++++++++++++++++++-------------
 src/xray/utils.py       | 17 ++++++++---------
 3 files changed, 32 insertions(+), 27 deletions(-)

diff --git a/src/xray/backends.py b/src/xray/backends.py
index fa2bb1ac897..3d60a30e400 100644
--- a/src/xray/backends.py
+++ b/src/xray/backends.py
@@ -68,10 +68,10 @@ class ScipyDataStore(AbstractDataStore):
     serialization.
     """
     def __init__(self, filename_or_obj, mode='r', mmap=None, version=1,
-                 decode_mask_and_scale=True):
+                 mask_and_scale=True):
         self.ds = netcdf.netcdf_file(filename_or_obj, mode=mode, mmap=mmap,
                                      version=version)
-        self.decode_mask_and_scale = decode_mask_and_scale
+        self.mask_and_scale = mask_and_scale
 
     @property
     def variables(self):
@@ -129,11 +129,11 @@ def sync(self):
 
 class NetCDF4DataStore(AbstractDataStore):
     def __init__(self, filename, mode='r', clobber=True, diskless=False,
-                 persist=False, format='NETCDF4', decode_mask_and_scale=True):
+                 persist=False, format='NETCDF4', mask_and_scale=True):
         self.ds = nc4.Dataset(filename, mode=mode, clobber=clobber,
                               diskless=diskless, persist=persist,
                               format=format)
-        self.decode_mask_and_scale = decode_mask_and_scale
+        self.mask_and_scale = mask_and_scale
 
     @property
     def variables(self):
@@ -142,7 +142,7 @@ def convert_variable(var):
             var.set_auto_maskandscale(False)
             return decode_cf_variable(
                 var.dimensions, var, attr, indexing_mode='orthogonal',
-                decode_mask_and_scale=self.decode_mask_and_scale)
+                mask_and_scale=self.mask_and_scale)
         return FrozenOrderedDict((k, convert_variable(v))
                                  for k, v in self.ds.variables.iteritems())
diff --git a/src/xray/conventions.py b/src/xray/conventions.py
index d14ca39fa93..b645fb1d418 100644
--- a/src/xray/conventions.py
+++ b/src/xray/conventions.py
@@ -17,6 +17,10 @@
                        'int64', 'uint64', 'float', 'real', 'double', 'bool',
                        'string'])
 
+# These data-types aren't supported by netCDF3, so they are automatically
+# coerced instead as indicated by the "coerce_nc3_dtype" function
+_nc3_dtype_coercions = {'int64': 'int32', 'float64': 'float32', 'bool': 'int8'}
+
 
 def pretty_print(x, numchars):
     """Given an object x, call x.__str__() and format the returned
@@ -41,9 +45,8 @@ def coerce_nc3_dtype(arr):
         `np.allclose` with the default keyword arguments.
     """
     dtype = str(arr.dtype)
-    dtype_map = {'int64': 'int32', 'float64': 'float32', 'bool': 'int8'}
-    if dtype in dtype_map:
-        new_dtype = dtype_map[dtype]
+    if dtype in _nc3_dtype_coercions:
+        new_dtype = _nc3_dtype_coercions[dtype]
         # TODO: raise a warning whenever casting the data-type instead?
         cast_arr = arr.astype(new_dtype)
         if (('int' in dtype and not (cast_arr == arr).all())
@@ -199,16 +203,19 @@ def encode_cf_variable(array):
         # do).
         data = np.asarray(data).astype(dtype)
 
+    def get_to(source, dest, k):
+        v = source.get(k)
+        dest[k] = v
+        return v
+
     # unscale/mask
     encoding = array.encoding
     if any(k in encoding for k in ['add_offset', 'scale_factor']):
         data = np.array(data, dtype=float, copy=True)
         if 'add_offset' in encoding:
-            data -= encoding['add_offset']
-            attributes['add_offset'] = encoding['add_offset']
+            data -= get_to(encoding, attributes, 'add_offset')
         if 'scale_factor' in encoding:
-            data /= encoding['scale_factor']
-            attributes['add_offset'] = encoding['add_offset']
+            data /= get_to(encoding, attributes, 'scale_factor')
 
     # restore original dtype
     if 'dtype' in encoding:
@@ -219,8 +225,8 @@ def encode_cf_variable(array):
 
 
 def decode_cf_variable(dimensions, data, attributes, indexing_mode='numpy',
-                       decode_mask_and_scale=True):
-    def pop_to(k, source, dest):
+                       mask_and_scale=True):
+    def pop_to(source, dest, k):
         v = source.pop(k, None)
         if v is not None:
             dest[k] = v
@@ -228,10 +234,10 @@ def pop_to(source, dest, k):
 
     encoding = {'dtype': data.dtype}
 
-    if decode_mask_and_scale:
-        fill_value = pop_to('_FillValue', attributes, encoding)
-        scale_factor = pop_to('scale_factor', attributes, encoding)
-        add_offset = pop_to('add_offset', attributes, encoding)
+    if mask_and_scale:
+        fill_value = pop_to(attributes, encoding, '_FillValue')
+        scale_factor = pop_to(attributes, encoding, 'scale_factor')
+        add_offset = pop_to(attributes, encoding, 'add_offset')
         if (fill_value is not None or scale_factor is not None
             or add_offset is not None):
             data = MaskedAndScaledArray(data, fill_value, scale_factor,
diff --git a/src/xray/utils.py b/src/xray/utils.py
index 0e46c5a5830..c8cf26d26e1 100644
--- a/src/xray/utils.py
+++ b/src/xray/utils.py
@@ -166,21 +166,20 @@ def datetimeindex2num(dates, units=None, calendar=None):
 
 def allclose_or_equiv(arr1, arr2, rtol=1e-5, atol=1e-8):
-    """Like np.allclose, but also allows values to NaN in both arrays
+    """Like np.allclose, but also allows values to be NaN in both arrays
     """
     if arr1.shape != arr2.shape:
         return False
     nan_indices = np.isnan(arr1)
     if not (nan_indices == np.isnan(arr2)).all():
         return False
-    else:
-        if arr1.ndim > 0:
-            arr1 = arr1[~nan_indices]
-            arr2 = arr2[~nan_indices]
-        elif nan_indices:
-            # 0-d arrays can't be indexed, so just check if the value is NaN
-            return True
-        return np.allclose(arr1, arr2, rtol=rtol, atol=atol)
+    if arr1.ndim > 0:
+        arr1 = arr1[~nan_indices]
+        arr2 = arr2[~nan_indices]
+    elif nan_indices:
+        # 0-d arrays can't be indexed, so just check if the value is NaN
+        return True
+    return np.allclose(arr1, arr2, rtol=rtol, atol=atol)

From 4956ffcb8fec8581d6ed2bbd9065353aba5a20ce Mon Sep 17 00:00:00 2001
From: Stephan Hoyer
Date: Thu, 27 Feb 2014 00:25:18 -0800
Subject: [PATCH 4/8] Serialize/de-serialize strings

---
 src/xray/backends.py     |  7 ++++
 src/xray/conventions.py  | 73 +++++++++++++++++++++++++++++++++++++++-
 test/test_conventions.py | 32 +++++++++++++++++-
 test/test_dataset.py     | 48 +++++++++++++------------
 4 files changed, 135 insertions(+), 25 deletions(-)

diff --git a/src/xray/backends.py b/src/xray/backends.py
index 3d60a30e400..9bbfba74d4e 100644
--- a/src/xray/backends.py
+++ b/src/xray/backends.py
@@ -31,6 +31,11 @@ def set_variables(self, variables):
         for vn, v in variables.iteritems():
             self.set_variable(vn, v)
 
+    def set_necessary_dimensions(self, variable):
+        for d, l in zip(variable.dimensions, variable.shape):
+            if d not in self.ds.dimensions:
+                self.set_dimension(d, l)
+
 
 class InMemoryDataStore(AbstractDataStore):
     """
@@ -113,6 +118,7 @@ def set_attribute(self, key, value):
     def set_variable(self, name, variable):
         variable = encode_cf_variable(variable)
         data = coerce_nc3_dtype(variable.data)
+        self.set_necessary_dimensions(variable)
         self.ds.createVariable(name, data.dtype, variable.dimensions)
         scipy_var = self.ds.variables[name]
         scipy_var[:] = data[:]
@@ -163,6 +169,7 @@ def set_attribute(self, key, value):
 
     def set_variable(self, name, variable):
         variable = encode_cf_variable(variable)
+        self.set_necessary_dimensions(variable)
         # netCDF4 will automatically assign a fill value
         # depending on the datatype of the variable.  Here
         # we let the package handle the _FillValue attribute
diff --git a/src/xray/conventions.py b/src/xray/conventions.py
index b645fb1d418..56f830ddd05 100644
--- a/src/xray/conventions.py
+++ b/src/xray/conventions.py
@@ -1,5 +1,6 @@
 import unicodedata
 
+import netCDF4 as nc4
 import numpy as np
 import pandas as pd
 
@@ -179,12 +180,74 @@ def __repr__(self):
                  self.scale_factor, self.add_offset))
 
 
+class CharToStringArray(object):
+    """Wrapper around array-like objects to create a new indexable object where
+    values, when accessed, are automatically concatenated along the last
+    dimension
+
+    >>> CharToStringArray(np.array(['a', 'b', 'c']))[:]
+    array('abc',
+          dtype='|S3')
+    """
+    def __init__(self, array):
+        """
+        Parameters
+        ----------
+        array : array-like
+            Original array of values to wrap.
+        """
+        self.array = array
+
+    @property
+    def dtype(self):
+        return np.dtype(str(self.array.dtype)[:2] + str(self.array.shape[-1]))
+
+    @property
+    def shape(self):
+        return self.array.shape[:-1]
+
+    @property
+    def size(self):
+        return np.prod(self.shape)
+
+    @property
+    def ndim(self):
+        return self.array.ndim - 1
+
+    def __len__(self):
+        if self.ndim > 0:
+            return len(self.array)
+        else:
+            raise TypeError('len() of unsized object')
+
+    def __str__(self):
+        if self.ndim == 0:
+            return str(self[...])
+        else:
+            return repr(self)
+
+    def __repr__(self):
+        return '%s(%r)' % (type(self).__name__, self.array)
+
+    def __array__(self):
+        return self[...]
+
+    def __getitem__(self, key):
+        # require slicing the last dimension completely
+        key = utils.expanded_indexer(key, self.array.ndim)
+        if key[-1] != slice(None):
+            raise IndexError('too many indices')
+        return nc4.chartostring(self.array[key])
+
+
 def encode_cf_variable(array):
     """Converts an XArray into an XArray suitable for saving as a netCDF
     variable
     """
+    dimensions = array.dimensions
     data = array.data
     attributes = array.attributes.copy()
+
     if isinstance(data, pd.DatetimeIndex):
         # DatetimeIndex objects need to be encoded into numeric arrays
         (data, units, calendar) = utils.datetimeindex2num(data)
@@ -208,6 +271,11 @@ def get_to(source, dest, k):
         dest[k] = v
         return v
 
+    # encode strings as character arrays
+    if np.issubdtype(data.dtype, (str, unicode)):
+        data = nc4.stringtochar(data)
+        dimensions = dimensions + ('string%s' % data.shape[-1],)
+
     # unscale/mask
     if any(k in encoding for k in ['add_offset', 'scale_factor']):
         data = np.array(data, dtype=float, copy=True)
@@ -234,7 +302,10 @@ def pop_to(source, dest, k):
 
     encoding = {'dtype': data.dtype}
 
-    if mask_and_scale:
+    if np.issubdtype(data.dtype, (str, unicode)):
+        dimensions = dimensions[:-1]
+        data = CharToStringArray(data)
+    elif mask_and_scale:
         fill_value = pop_to(attributes, encoding, '_FillValue')
         scale_factor = pop_to(attributes, encoding, 'scale_factor')
         add_offset = pop_to(attributes, encoding, 'add_offset')
diff --git a/test/test_conventions.py b/test/test_conventions.py
index 58daf09eae5..6a748bc6406 100644
--- a/test/test_conventions.py
+++ b/test/test_conventions.py
@@ -1,6 +1,6 @@
 import numpy as np
 
-from xray.conventions import MaskedAndScaledArray
+from xray.conventions import MaskedAndScaledArray, CharToStringArray
 from . import TestCase
 
 
@@ -23,3 +23,33 @@ def test(self):
         x = MaskedAndScaledArray(np.array([-99, -1, 0, 1, 2]), -99, 0.01, 1)
         expected = np.array([np.nan, 0.99, 1, 1.01, 1.02])
         self.assertArrayEqual(expected, x)
+
+
+class TestCharToStringArray(TestCase):
+    def test(self):
+        array = np.array(list('abc'))
+        actual = CharToStringArray(array)
+        expected = np.array('abc')
+        self.assertEqual(actual.dtype, expected.dtype)
+        self.assertEqual(actual.shape, expected.shape)
+        self.assertEqual(actual.size, expected.size)
+        self.assertEqual(actual.ndim, expected.ndim)
+        with self.assertRaises(TypeError):
+            len(actual)
+        self.assertArrayEqual(expected, actual)
+        with self.assertRaises(IndexError):
+            actual[:2]
+        self.assertEqual(str(actual), 'abc')
+
+        array = np.array([list('abc'), list('cdf')])
+        actual = CharToStringArray(array)
+        expected = np.array(['abc', 'cdf'])
+        self.assertEqual(actual.dtype, expected.dtype)
+        self.assertEqual(actual.shape, expected.shape)
+        self.assertEqual(actual.size, expected.size)
+        self.assertEqual(actual.ndim, expected.ndim)
+        self.assertEqual(len(actual), len(expected))
+        self.assertArrayEqual(expected, actual)
+        self.assertArrayEqual(expected[:1], actual[:1])
+        with self.assertRaises(IndexError):
+            actual[:, :2]
diff --git a/test/test_dataset.py b/test/test_dataset.py
index 28029a83590..7e476c6e73e 100644
--- a/test/test_dataset.py
+++ b/test/test_dataset.py
@@ -36,6 +36,9 @@ class DataTest(TestCase):
     def get_store(self):
         return backends.InMemoryDataStore()
 
+    def roundtrip(self, data):
+        return data.copy()
+
     def test_repr(self):
         data = create_test_data()
         self.assertEqual('
Date: Thu, 27 Feb 2014 14:23:18 -0800
Subject: [PATCH 5/8] More comprehensive Dataset read/write tests

Miscellaneous bugs in mask/scale serialization also fixed.
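
For reference, the mask/scale round-trip that these tests exercise
amounts to roughly the following sketch (illustrative only; the real
logic lives in MaskedAndScaledArray and encode_cf_variable and handles
more edge cases):

    import numpy as np

    def decode(data, fill_value=None, scale_factor=None, add_offset=None):
        # decoding: masked entries become NaN, then values are unpacked
        values = np.array(data, dtype=float, copy=True)
        if fill_value is not None:
            values[values == fill_value] = np.nan
        if scale_factor is not None:
            values *= scale_factor
        if add_offset is not None:
            values += add_offset
        return values

    def encode(values, fill_value, scale_factor, add_offset, dtype):
        # encoding: pack the values (rounding to avoid float truncation),
        # then replace NaN entries with the fill value
        data = np.round((np.asarray(values, dtype=float) - add_offset)
                        / scale_factor)
        data[np.isnan(data)] = fill_value
        return data.astype(dtype)

    # round-trip of the values used by create_masked_and_scaled_data()
    encoded = np.array([-1, -1, 0, 1, 2], dtype=np.int16)
    decoded = decode(encoded, fill_value=-1, scale_factor=0.1, add_offset=10)
    # decoded is now [nan, nan, 10.0, 10.1, 10.2]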
--- src/xray/backends.py | 2 +- src/xray/conventions.py | 30 ++++++++---- src/xray/dataset.py | 10 ++-- src/xray/dataset_array.py | 4 ++ test/test_dataset.py | 100 ++++++++++++++++++++++---------------- 5 files changed, 87 insertions(+), 59 deletions(-) diff --git a/src/xray/backends.py b/src/xray/backends.py index 9bbfba74d4e..c60a071947c 100644 --- a/src/xray/backends.py +++ b/src/xray/backends.py @@ -174,7 +174,7 @@ def set_variable(self, name, variable): # depending on the datatype of the variable. Here # we let the package handle the _FillValue attribute # instead of setting it ourselves. - fill_value = variable.encoding.get('_FillValue') + fill_value = variable.attributes.pop('_FillValue', None) self.ds.createVariable(varname=name, datatype=variable.dtype, dimensions=variable.dimensions, diff --git a/src/xray/conventions.py b/src/xray/conventions.py index 56f830ddd05..ca29a922f44 100644 --- a/src/xray/conventions.py +++ b/src/xray/conventions.py @@ -247,6 +247,7 @@ def encode_cf_variable(array): dimensions = array.dimensions data = array.data attributes = array.attributes.copy() + encoding = array.encoding if isinstance(data, pd.DatetimeIndex): # DatetimeIndex objects need to be encoded into numeric arrays @@ -277,7 +278,6 @@ def get_to(source, dest, k): dimensions = dimensions + ('string%s' % data.shape[-1],) # unscale/mask - encoding = array.encoding if any(k in encoding for k in ['add_offset', 'scale_factor']): data = np.array(data, dtype=float, copy=True) if 'add_offset' in encoding: @@ -285,27 +285,37 @@ def get_to(source, dest, k): if 'scale_factor' in encoding: data /= get_to(encoding, attributes, 'scale_factor') + # replace NaN with the fill value + if '_FillValue' in encoding: + if encoding['_FillValue'] is np.nan: + attributes['_FillValue'] = np.nan + else: + nans = np.isnan(data) + if nans.any(): + data[nans] = get_to(encoding, attributes, '_FillValue') + # restore original dtype if 'dtype' in encoding: data = data.astype(encoding['dtype']) - return xarray.XArray(array.dimensions, data, attributes, encoding) + return xarray.XArray(dimensions, data, attributes, encoding=encoding) def decode_cf_variable(dimensions, data, attributes, indexing_mode='numpy', mask_and_scale=True): - def pop_to(source, dest, k): - v = source.pop(k, None) - if v is not None: - dest[k] = v - return v - encoding = {'dtype': data.dtype} - if np.issubdtype(data.dtype, (str, unicode)): + # TODO: add some sort of check instead of just assuming that the last + # dimension on a character array is always the string dimension dimensions = dimensions[:-1] data = CharToStringArray(data) elif mask_and_scale: + def pop_to(source, dest, k): + v = source.pop(k, None) + if v is not None: + dest[k] = v + return v + fill_value = pop_to(attributes, encoding, '_FillValue') scale_factor = pop_to(attributes, encoding, 'scale_factor') add_offset = pop_to(attributes, encoding, 'add_offset') @@ -314,4 +324,6 @@ def pop_to(source, dest, k): data = MaskedAndScaledArray(data, fill_value, scale_factor, add_offset) + # TODO: decode arrays with time units into np.datetime64 objects here + return xarray.XArray(dimensions, data, attributes, indexing_mode, encoding) diff --git a/src/xray/dataset.py b/src/xray/dataset.py index d4ed0c7f4a3..0291174553b 100644 --- a/src/xray/dataset.py +++ b/src/xray/dataset.py @@ -316,25 +316,23 @@ def noncoordinates(self): def dump_to_store(self, store): """Store dataset contents to a backends.*DataStore object.""" - store.set_dimensions(self.dimensions) store.set_variables(self.variables) 
store.set_attributes(self.attributes) store.sync() - def dump(self, filepath, *args, **kwdargs): + def dump(self, filepath, **kwdargs): """Dump dataset contents to a location on disk using the netCDF4 package. """ - nc4_store = backends.NetCDF4DataStore(filepath, mode='w', - *args, **kwdargs) + nc4_store = backends.NetCDF4DataStore(filepath, mode='w', **kwdargs) self.dump_to_store(nc4_store) - def dumps(self): + def dumps(self, **kwargs): """Serialize dataset contents to a string. The serialization creates an in memory netcdf version 3 string using the scipy.io.netcdf package. """ fobj = StringIO() - scipy_store = backends.ScipyDataStore(fobj, mode='w') + scipy_store = backends.ScipyDataStore(fobj, mode='w', **kwargs) self.dump_to_store(scipy_store) return fobj.getvalue() diff --git a/src/xray/dataset_array.py b/src/xray/dataset_array.py index a12b624b203..748718b0d2c 100644 --- a/src/xray/dataset_array.py +++ b/src/xray/dataset_array.py @@ -129,6 +129,10 @@ def __iter__(self): def attributes(self): return self.array.attributes + @property + def encoding(self): + return self.array.encoding + @property def variables(self): return self.dataset.variables diff --git a/test/test_dataset.py b/test/test_dataset.py index 7e476c6e73e..92ab782b8df 100644 --- a/test/test_dataset.py +++ b/test/test_dataset.py @@ -32,13 +32,7 @@ def create_test_data(): return obj -class DataTest(TestCase): - def get_store(self): - return backends.InMemoryDataStore() - - def roundtrip(self, data): - return data.copy() - +class TestDataset(TestCase): def test_repr(self): data = create_test_data() self.assertEqual(' Date: Thu, 27 Feb 2014 17:34:04 -0800 Subject: [PATCH 6/8] CF Time coordinates are handled using encodings. Datasets now have an optional constructor argument which determines whether CF-variables are converted or stored raw. --- src/xray/backends.py | 31 ++++++++--------- src/xray/conventions.py | 49 ++++++++++++++++++++------- src/xray/dataset.py | 23 +++++++------ src/xray/xarray.py | 16 ++++----- test/test_dataset.py | 73 +++++++++++++++++++++++++++++++++++------ 5 files changed, 133 insertions(+), 59 deletions(-) diff --git a/src/xray/backends.py b/src/xray/backends.py index c60a071947c..31dfa64d3b7 100644 --- a/src/xray/backends.py +++ b/src/xray/backends.py @@ -5,17 +5,16 @@ """ # TODO: implement backend logic directly in OrderedDict subclasses, to allow # for directly manipulating Dataset.variables and the like? -import netCDF4 as nc4 import numpy as np -import pandas as pd +import netCDF4 as nc4 from scipy.io import netcdf from collections import OrderedDict import xarray -from conventions import (decode_cf_variable, encode_cf_variable, - is_valid_nc3_name, coerce_nc3_dtype) -from utils import FrozenOrderedDict, Frozen, datetimeindex2num + +from utils import FrozenOrderedDict, Frozen +from conventions import is_valid_nc3_name, coerce_nc3_dtype, encode_cf_variable class AbstractDataStore(object): @@ -72,16 +71,14 @@ class ScipyDataStore(AbstractDataStore): be initialized with a StringIO object, allow for serialization. 
""" - def __init__(self, filename_or_obj, mode='r', mmap=None, version=1, - mask_and_scale=True): - self.ds = netcdf.netcdf_file(filename_or_obj, mode=mode, mmap=None, + def __init__(self, filename_or_obj, mode='r', mmap=None, version=1): + self.ds = netcdf.netcdf_file(filename_or_obj, mode=mode, mmap=mmap, version=version) - self.mask_and_scale = mask_and_scale @property def variables(self): - return FrozenOrderedDict((k, decode_cf_variable(v.dimensions, v.data, - v._attributes)) + return FrozenOrderedDict((k, xarray.XArray(v.dimensions, v.data, + v._attributes)) for k, v in self.ds.variables.iteritems()) @property @@ -134,21 +131,20 @@ def sync(self): class NetCDF4DataStore(AbstractDataStore): + def __init__(self, filename, mode='r', clobber=True, diskless=False, - persist=False, format='NETCDF4', mask_and_scale=True): + persist=False, format='NETCDF4'): self.ds = nc4.Dataset(filename, mode=mode, clobber=clobber, diskless=diskless, persist=persist, format=format) - self.mask_and_scale = mask_and_scale @property def variables(self): def convert_variable(var): attr = OrderedDict((k, var.getncattr(k)) for k in var.ncattrs()) var.set_auto_maskandscale(False) - return decode_cf_variable( - var.dimensions, var, attr, indexing_mode='orthogonal', - mask_and_scale=self.mask_and_scale) + return xarray.XArray(var.dimensions, var, + attr, indexing_mode='orthogonal') return FrozenOrderedDict((k, convert_variable(v)) for k, v in self.ds.variables.iteritems()) @@ -159,7 +155,8 @@ def attributes(self): @property def dimensions(self): - return FrozenOrderedDict((k, len(v)) for k, v in self.ds.dimensions.iteritems()) + return FrozenOrderedDict((k, len(v)) + for k, v in self.ds.dimensions.iteritems()) def set_dimension(self, name, length): self.ds.createDimension(name, size=length) diff --git a/src/xray/conventions.py b/src/xray/conventions.py index ca29a922f44..cd18b988934 100644 --- a/src/xray/conventions.py +++ b/src/xray/conventions.py @@ -251,7 +251,9 @@ def encode_cf_variable(array): if isinstance(data, pd.DatetimeIndex): # DatetimeIndex objects need to be encoded into numeric arrays - (data, units, calendar) = utils.datetimeindex2num(data) + (data, units, calendar) = utils.datetimeindex2num(data, + units=encoding.get('units', None), + calendar=encoding.get('calendar', None)) attributes['units'] = units attributes['calendar'] = calendar elif data.dtype == np.dtype('O'): @@ -301,21 +303,36 @@ def get_to(source, dest, k): return xarray.XArray(dimensions, data, attributes, encoding=encoding) -def decode_cf_variable(dimensions, data, attributes, indexing_mode='numpy', - mask_and_scale=True): - encoding = {'dtype': data.dtype} +def decode_cf_variable(var, mask_and_scale=True): + data = var.data + dimensions = var.dimensions + attributes = var.attributes.copy() + encoding = var.encoding.copy() + indexing_mode = var._indexing_mode + + def pop_to(source, dest, k): + """ + A convenience function which pops a key k from source to dest. + None values are not passed on. If k already exists in dest an + error is raised. 
+ """ + v = source.pop(k, None) + if v is not None: + if k in dest: + raise ValueError("Failed hard to prevent overwriting key %s" % k) + dest[k] = v + return v + + if 'dtype' in encoding: + if not var.data.dtype == encoding.dtype: + raise ValueError("Refused to overwrite dtype") + encoding['dtype'] = data.dtype if np.issubdtype(data.dtype, (str, unicode)): # TODO: add some sort of check instead of just assuming that the last # dimension on a character array is always the string dimension dimensions = dimensions[:-1] data = CharToStringArray(data) elif mask_and_scale: - def pop_to(source, dest, k): - v = source.pop(k, None) - if v is not None: - dest[k] = v - return v - fill_value = pop_to(attributes, encoding, '_FillValue') scale_factor = pop_to(attributes, encoding, 'scale_factor') add_offset = pop_to(attributes, encoding, 'add_offset') @@ -323,7 +340,15 @@ def pop_to(source, dest, k): or add_offset is not None): data = MaskedAndScaledArray(data, fill_value, scale_factor, add_offset) - - # TODO: decode arrays with time units into np.datetime64 objects here + # TODO: How should multidimensional time variables be handled? + if (data.ndim == 1 and + 'units' in attributes and + 'since' in attributes['units']): + # convert times to datetime indices. We only do this if the dimension + # is one, since otherwise it can't be a coordinate. + units = pop_to(attributes, encoding, 'units') + calendar = pop_to(attributes, encoding, 'calendar') + data = utils.num2datetimeindex(data, units=units, + calendar=calendar) return xarray.XArray(dimensions, data, attributes, indexing_mode, encoding) diff --git a/src/xray/dataset.py b/src/xray/dataset.py index 0291174553b..926839c0cd2 100644 --- a/src/xray/dataset.py +++ b/src/xray/dataset.py @@ -17,7 +17,7 @@ num2date = nc4.num2date -def open_dataset(nc, *args, **kwargs): +def open_dataset(nc, decode_cf=True, *args, **kwargs): """Open the dataset given the object or path `nc`. *args and **kwargs provide format specific options @@ -32,7 +32,7 @@ def open_dataset(nc, *args, **kwargs): # If nc is a file-like object we read it using # the scipy.io.netcdf package store = backends.ScipyDataStore(nc, *args, **kwargs) - return Dataset.load_store(store) + return Dataset.load_store(store, decode_cf=decode_cf) # list of attributes of pd.DatetimeIndex that are ndarrays of time info @@ -101,7 +101,7 @@ class Dataset(Mapping): Note: the size of dimensions in a dataset cannot be changed. """ - def __init__(self, variables=None, attributes=None): + def __init__(self, variables=None, attributes=None, decode_cf=False): """To load data from a file or file-like object, use the `open_dataset` function. @@ -118,6 +118,7 @@ def __init__(self, variables=None, attributes=None): attributes : dict-like, optional Global attributes to save on this dataset. """ + self._decode_cf = decode_cf self._variables = _VariablesDict() self._dimensions = OrderedDict() if variables is not None: @@ -134,18 +135,16 @@ def _as_variable(self, name, var): raise TypeError('Dataset variables must be of type ' 'DatasetArray or XArray, or a sequence of the ' 'form (dimensions, data[, attributes])') - + # this will unmask and rescale the data as well as convert + # time variables to datetime indices. 
+ if self._decode_cf: + var = conventions.decode_cf_variable(var) if name in var.dimensions: # convert the coordinate into a pandas.Index if var.ndim != 1: raise ValueError('a coordinate variable must be defined with ' '1-dimensional data') - attr = var.attributes - if 'units' in attr and 'since' in attr['units']: - var.data = utils.num2datetimeindex(var.data, attr.pop('units'), - attr.pop('calendar', None)) - else: - var.data = pd.Index(var.data) + var.data = pd.Index(var.data) return var def _set_variables(self, variables): @@ -169,8 +168,8 @@ def _set_variables(self, variables): self._variables.update(new_variables) @classmethod - def load_store(cls, store): - return cls(store.variables, store.attributes) + def load_store(cls, store, decode_cf=True): + return cls(store.variables, store.attributes, decode_cf) @property def variables(self): diff --git a/src/xray/xarray.py b/src/xray/xarray.py index 693d0ae2420..e0e1bd40a5a 100644 --- a/src/xray/xarray.py +++ b/src/xray/xarray.py @@ -1,16 +1,16 @@ import functools -import warnings -from collections import OrderedDict -from itertools import izip - import numpy as np -import conventions -import dataset -import dataset_array -import groupby +from itertools import izip +from collections import OrderedDict + import ops import utils +import dataset +import groupby +import conventions +import dataset_array + from common import AbstractArray diff --git a/test/test_dataset.py b/test/test_dataset.py index 92ab782b8df..679cc4c04ab 100644 --- a/test/test_dataset.py +++ b/test/test_dataset.py @@ -13,14 +13,15 @@ _test_data_path = os.path.join(os.path.dirname(__file__), 'data') -_dims = {'dim1':100, 'dim2':50, 'dim3':10} -_vars = {'var1':['dim1', 'dim2'], - 'var2':['dim1', 'dim2'], - 'var3':['dim3', 'dim1'], +_dims = {'dim1': 100, 'dim2': 50, 'dim3': 10} +_vars = {'var1': ['dim1', 'dim2'], + 'var2': ['dim1', 'dim2'], + 'var3': ['dim3', 'dim1'], } _testvar = sorted(_vars.keys())[0] _testdim = sorted(_dims.keys())[0] + def create_test_data(): obj = Dataset() obj['time'] = ('time', pd.date_range('2000-01-01', periods=1000)) @@ -183,8 +184,7 @@ def test_labeled_by(self): loc_slicers = {'dim1': slice(None, None, 2), 'dim2': slice(0, 1)} self.assertEqual(data.indexed_by(**int_slicers), data.labeled_by(**loc_slicers)) - data['time'] = ('time', np.arange(1000, dtype=np.int32), - {'units': 'days since 2000-01-01'}) + data['time'] = ('time', pd.date_range('2000-01-01', periods=1000)) self.assertEqual(data.indexed_by(time=0), data.labeled_by(time='2000-01-01')) self.assertEqual(data.indexed_by(time=slice(10)), @@ -274,8 +274,7 @@ def test_merge(self): def test_getitem(self): data = create_test_data() - data['time'] = ('time', np.arange(1000, dtype=np.int32), - {'units': 'days since 2000-01-01'}) + data['time'] = ('time', pd.date_range('2000-01-01', periods=1000)) self.assertIsInstance(data['var1'], DatasetArray) self.assertXArrayEqual(data['var1'], data.variables['var1']) self.assertItemsEqual(data['var1'].dataset.variables, @@ -370,7 +369,7 @@ def test_roundtrip_mask_and_scale(self): def test_roundtrip_mask_and_scale_False(self): expected = create_masked_and_scaled_data() - actual = self.roundtrip(expected, mask_and_scale=False) + actual = self.roundtrip(expected, decode_cf=False) self.assertDatasetEqual(expected, actual) def test_roundtrip_example_1_netcdf(self): @@ -380,7 +379,7 @@ def test_roundtrip_example_1_netcdf(self): def test_encoded_masked_and_scaled_data(self): original = create_encoded_masked_and_scaled_data() - actual = self.roundtrip(original, 
mask_and_scale=True) + actual = self.roundtrip(original, decode_cf=True) expected = create_masked_and_scaled_data() self.assertDatasetEqual(expected, actual) @@ -403,6 +402,60 @@ def roundtrip(self, data, **kwargs): os.remove(tmp_file) return roundtrip_data + def test_open_encodings(self): + # Create a netCDF file with explicit time units + # and make sure it makes it into the encodings + # and survives a round trip + f, tmp_file = tempfile.mkstemp(suffix='.nc') + os.close(f) + + ds = nc4.Dataset(tmp_file, 'w') + ds.createDimension('time', size=10) + ds.createVariable('time', np.int32, dimensions=('time',)) + units = 'days since 1999-01-01' + ds.variables['time'].setncattr('units', units) + ds.variables['time'][:] = np.arange(10) + 4 + ds.close() + + expected = Dataset() + expected['time'] = ('time', pd.date_range('1999-01-05', periods=10)) + expected['time'].encoding['units'] = units + expected['time'].encoding['dtype'] = np.dtype('int32') + + actual = open_dataset(tmp_file) + + self.assertXArrayEqual(actual['time'], expected['time']) + self.assertDictEqual(actual['time'].encoding, expected['time'].encoding) + + os.remove(tmp_file) + + def test_dump_and_open_encodings(self): + # Create a netCDF file with explicit time units + # and make sure it makes it into the encodings + # and survives a round trip + f, tmp_file = tempfile.mkstemp(suffix='.nc') + os.close(f) + + ds = nc4.Dataset(tmp_file, 'w') + ds.createDimension('time', size=10) + ds.createVariable('time', np.int32, dimensions=('time',)) + units = 'days since 1999-01-01' + ds.variables['time'].setncattr('units', units) + ds.variables['time'][:] = np.arange(10) + 4 + ds.close() + + xray_dataset = open_dataset(tmp_file) + os.remove(tmp_file) + xray_dataset.dump(tmp_file) + + ds = nc4.Dataset(tmp_file, 'r') + + self.assertEqual(ds.variables['time'].getncattr('units'), units) + self.assertArrayEqual(ds.variables['time'], np.arange(10) + 4) + + ds.close() + os.remove(tmp_file) + def test_mask_and_scale(self): f, tmp_file = tempfile.mkstemp(suffix='.nc') os.close(f) From b9b4b0f184b3293e062df6da8cc87d208582e3a8 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Fri, 28 Feb 2014 10:13:16 -0800 Subject: [PATCH 7/8] Fixed my issues with Alex's commit --- src/xray/backends.py | 2 +- src/xray/conventions.py | 11 ++++----- src/xray/dataset.py | 49 +++++++++++++++++++++++++++++------------ src/xray/utils.py | 13 +++++++++-- src/xray/xarray.py | 28 +++++++++++------------ test/test_dataset.py | 36 ++++++++++++++---------------- test/test_utils.py | 18 +++++++-------- 7 files changed, 91 insertions(+), 66 deletions(-) diff --git a/src/xray/backends.py b/src/xray/backends.py index 31dfa64d3b7..61d0dbded88 100644 --- a/src/xray/backends.py +++ b/src/xray/backends.py @@ -78,7 +78,7 @@ def __init__(self, filename_or_obj, mode='r', mmap=None, version=1): @property def variables(self): return FrozenOrderedDict((k, xarray.XArray(v.dimensions, v.data, - v._attributes)) + v._attributes)) for k, v in self.ds.variables.iteritems()) @property diff --git a/src/xray/conventions.py b/src/xray/conventions.py index cd18b988934..773bd5bf188 100644 --- a/src/xray/conventions.py +++ b/src/xray/conventions.py @@ -298,6 +298,8 @@ def get_to(source, dest, k): # restore original dtype if 'dtype' in encoding: + if np.issubdtype(encoding['dtype'], int): + data = data.round() data = data.astype(encoding['dtype']) return xarray.XArray(dimensions, data, attributes, encoding=encoding) @@ -308,7 +310,6 @@ def decode_cf_variable(var, mask_and_scale=True): dimensions = 
var.dimensions attributes = var.attributes.copy() encoding = var.encoding.copy() - indexing_mode = var._indexing_mode def pop_to(source, dest, k): """ @@ -324,7 +325,7 @@ def pop_to(source, dest, k): return v if 'dtype' in encoding: - if not var.data.dtype == encoding.dtype: + if var.data.dtype != encoding['dtype']: raise ValueError("Refused to overwrite dtype") encoding['dtype'] = data.dtype if np.issubdtype(data.dtype, (str, unicode)): @@ -342,8 +343,8 @@ def pop_to(source, dest, k): add_offset) # TODO: How should multidimensional time variables be handled? if (data.ndim == 1 and - 'units' in attributes and - 'since' in attributes['units']): + 'units' in attributes and + 'since' in attributes['units']): # convert times to datetime indices. We only do this if the dimension # is one, since otherwise it can't be a coordinate. units = pop_to(attributes, encoding, 'units') @@ -351,4 +352,4 @@ def pop_to(source, dest, k): data = utils.num2datetimeindex(data, units=units, calendar=calendar) - return xarray.XArray(dimensions, data, attributes, indexing_mode, encoding) + return xarray.XArray(dimensions, data, attributes, encoding=encoding) diff --git a/src/xray/dataset.py b/src/xray/dataset.py index 926839c0cd2..1544d87c6bf 100644 --- a/src/xray/dataset.py +++ b/src/xray/dataset.py @@ -117,17 +117,18 @@ def __init__(self, variables=None, attributes=None, decode_cf=False): `pandas.Index` objects. attributes : dict-like, optional Global attributes to save on this dataset. + decode_cf : bool, optional + Whether to decode these variables according to CF conventions. """ - self._decode_cf = decode_cf self._variables = _VariablesDict() self._dimensions = OrderedDict() if variables is not None: - self._set_variables(variables) + self.set_variables(variables, decode_cf=decode_cf) if attributes is None: attributes = {} self._attributes = OrderedDict(attributes) - def _as_variable(self, name, var): + def _as_variable(self, name, var, decode_cf=False): if not isinstance(var, xarray.XArray): try: var = xarray.XArray(*var) @@ -137,23 +138,43 @@ def _as_variable(self, name, var): 'form (dimensions, data[, attributes])') # this will unmask and rescale the data as well as convert # time variables to datetime indices. - if self._decode_cf: + if decode_cf: var = conventions.decode_cf_variable(var) if name in var.dimensions: # convert the coordinate into a pandas.Index if var.ndim != 1: raise ValueError('a coordinate variable must be defined with ' '1-dimensional data') - var.data = pd.Index(var.data) + # create a new XArray object on which to modify the data + var = xarray.XArray(var.dimensions, pd.Index(var.data), + var.attributes, encoding=var.encoding) return var - def _set_variables(self, variables): - """Set a mapping of variables and update dimensions""" + def set_variables(self, variables, decode_cf=False): + """Set a mapping of variables and update dimensions. + + Parameters + ---------- + variables : dict-like, optional + A mapping from variable names to `XArray` objects or sequences of + the form `(dimensions, data[, attributes])` which can be used as + arguments to create a new `XArray`. Each dimension must have the + same length in all variables in which it appears. One dimensional + variables with name equal to their dimension are coordinate + variables, which means they are saved in the dataset as + `pandas.Index` objects. + decode_cf : bool, optional + Whether to decode these variables according to CF conventions. 
+ + Returns + ------- + None + """ # save new variables into a temporary list so all the error checking # can be done before updating _variables new_variables = [] for k, var in variables.iteritems(): - var = self._as_variable(k, var) + var = self._as_variable(k, var, decode_cf=decode_cf) for dim, size in zip(var.dimensions, var.shape): if dim not in self._dimensions: self._dimensions[dim] = size @@ -169,7 +190,7 @@ def _set_variables(self, variables): @classmethod def load_store(cls, store, decode_cf=True): - return cls(store.variables, store.attributes, decode_cf) + return cls(store.variables, store.attributes, decode_cf=decode_cf) @property def variables(self): @@ -264,7 +285,7 @@ def __setitem__(self, key, value): # TODO: should remove key from this dataset if it already exists self.merge(value.renamed(key).dataset, inplace=True) else: - self._set_variables({key: value}) + self.set_variables({key: value}) def __delitem__(self, key): """Remove a variable from this dataset. @@ -480,7 +501,7 @@ def renamed(self, name_dict): #TODO: public interface for renaming a variable without loading # data? variables[name] = xarray.XArray(dims, v._data, v.attributes, - v._indexing_mode) + v.encoding, v._indexing_mode) return type(self)(variables, self.attributes) @@ -514,9 +535,9 @@ def merge(self, other, inplace=False): compat=utils.xarray_equal) # update contents obj = self if inplace else self.copy() - obj._set_variables(OrderedDict((k, v) for k, v - in other.variables.iteritems() - if k not in obj.variables)) + obj.set_variables(OrderedDict((k, v) for k, v + in other.variables.iteritems() + if k not in obj.variables)) # remove conflicting attributes for k, v in other.attributes.iteritems(): if k in self.attributes and v != self.attributes[k]: diff --git a/src/xray/utils.py b/src/xray/utils.py index c8cf26d26e1..a13812e242d 100644 --- a/src/xray/utils.py +++ b/src/xray/utils.py @@ -113,6 +113,7 @@ def num2datetimeindex(num_dates, units, calendar=None): For standard (Gregorian) calendars, this function uses vectorized operations, which makes it much faster than netCDF4.num2date. """ + # TODO: fix this function so it works on arbitrary n-dimensional arrays num_dates = np.asarray(num_dates) if calendar is None: calendar = 'standard' @@ -257,14 +258,15 @@ def remove_incompatible_items(first_dict, second_dict, compat=operator.eq): if k in first_dict and not compat(v, first_dict[k]): del first_dict[k] + def dict_equal(first, second): - """ Test equality of two dict-like objects. If any of the values + """Test equality of two dict-like objects. If any of the values are numpy arrays, compare them for equality correctly. Parameters ---------- first, second : dict-like - dictionaries to compare for equality + Dictionaries to compare for equality Returns ------- @@ -285,6 +287,13 @@ def dict_equal(first, second): return False elif v1 != v2: return False + if isinstance(v1, np.ndarray) != isinstance(v2, np.ndarray): + return False # one is an ndarray, other is not + elif (isinstance(v1, np.ndarray) and isinstance(v2, np.ndarray)): + if not np.array_equal(v1, v2): + return False + elif v1 != v2: + return False return True def ordered_dict_intersection(first_dict, second_dict, compat=operator.eq): diff --git a/src/xray/xarray.py b/src/xray/xarray.py index e0e1bd40a5a..41ae20867a6 100644 --- a/src/xray/xarray.py +++ b/src/xray/xarray.py @@ -38,8 +38,8 @@ class XArray(AbstractArray): outside the context of its parent Dataset (if you want such a fully described object, use a DatasetArray instead). 
""" - def __init__(self, dims, data, attributes=None, indexing_mode='numpy', - encoding=None): + def __init__(self, dims, data, attributes=None, encoding=None, + indexing_mode='numpy'): """ Parameters ---------- @@ -52,6 +52,11 @@ def __init__(self, dims, data, attributes=None, indexing_mode='numpy', attributes : dict_like or None, optional Attributes to assign to the new variable. If None (default), an empty attribute dictionary is initialized. + encoding : dict_like or None, optional + Dictionary specifying how to encode this array's data into a + serialized format like netCDF4. Currently used keys (for netCDF) + include '_FillValue', 'scale_factor', 'add_offset' and 'dtype'. + Well behaviored code to serialize an XArray should ignore indexing_mode : {'numpy', 'orthogonal'} String indicating how the data parameter handles fancy indexing (with arrays). Two modes are supported: 'numpy' (fancy indexing @@ -60,11 +65,6 @@ def __init__(self, dims, data, attributes=None, indexing_mode='numpy', variables). Accessing data from an XArray always uses orthogonal indexing, so `indexing_mode` tells the variable whether index lookups need to be internally converted to numpy-style indexing. - encoding : dict_like or None, optional - Dictionary specifying how to encode this array's data into a - serialized format like netCDF4. Currently used keys (for netCDF) - include '_FillValue', 'scale_factor', 'add_offset' and 'dtype'. - Well behaviored code to serialize an XArray should ignore unrecognized keys in this dictionary. """ if isinstance(dims, basestring): @@ -77,8 +77,8 @@ def __init__(self, dims, data, attributes=None, indexing_mode='numpy', if attributes is None: attributes = {} self._attributes = OrderedDict(attributes) - self._indexing_mode = indexing_mode self.encoding = dict({} if encoding is None else encoding) + self._indexing_mode = indexing_mode @property def data(self): @@ -157,8 +157,7 @@ def __getitem__(self, key): # return a variable with the same indexing_mode, because data should # still be the same type as _data return type(self)(dimensions, new_data, self.attributes, - indexing_mode=self._indexing_mode, - encoding=self.encoding) + self.encoding, self._indexing_mode) def __setitem__(self, key, value): """__setitem__ is overloaded to access the underlying numpy data with @@ -191,7 +190,7 @@ def _copy(self, deepcopy=False): # dimensions is already an immutable tuple # attributes will be copied when the new Array is created return type(self)(self.dimensions, data, self.attributes, - encoding=self.encoding) + self.encoding) def __copy__(self): return self._copy(deepcopy=False) @@ -284,8 +283,7 @@ def transpose(self, *dimensions): dimensions = self.dimensions[::-1] axes = [self.dimensions.index(dim) for dim in dimensions] data = self.data.transpose(*axes) - return type(self)(dimensions, data, self.attributes, - encoding=self.encoding) + return type(self)(dimensions, data, self.attributes, self.encoding) def reduce(self, func, dimension=None, axis=None, **kwargs): """Reduce this array by applying `func` along some dimension(s). 
@@ -572,13 +570,13 @@ def broadcast_xarrays(first, second): # adding second's dimensions at the end first_data = first.data[(Ellipsis,) + (None,) * len(second_only_dims)] new_first = XArray(dimensions, first_data, first.attributes, - encoding=first.encoding) + first.encoding) # expand and reorder second_data so the dimensions line up first_only_dims = [d for d in dimensions if d not in second.dimensions] second_dims = list(second.dimensions) + first_only_dims second_data = second.data[(Ellipsis,) + (None,) * len(first_only_dims)] new_second = XArray(second_dims, second_data, first.attributes, - encoding=second.encoding).transpose(*dimensions) + second.encoding).transpose(*dimensions) return new_first, new_second diff --git a/test/test_dataset.py b/test/test_dataset.py index 679cc4c04ab..dd7a46f0bf4 100644 --- a/test/test_dataset.py +++ b/test/test_dataset.py @@ -328,13 +328,14 @@ def test_to_dataframe(self): def create_masked_and_scaled_data(): x = np.array([np.nan, np.nan, 10, 10.1, 10.2]) - encoding = {'_FillValue': -1, 'add_offset': 10, 'scale_factor': 0.1, - 'dtype': np.int16} + encoding = {'_FillValue': -1, 'add_offset': 10, + 'scale_factor': np.float32(0.1), 'dtype': np.int16} return Dataset({'x': ('t', x, {}, encoding)}) def create_encoded_masked_and_scaled_data(): - attributes = {'_FillValue': -1, 'add_offset': 10, 'scale_factor': 0.1} + attributes = {'_FillValue': -1, 'add_offset': 10, + 'scale_factor': np.float32(0.1)} return Dataset({'x': XArray('t', [-1, -1, 0, 1, 2], attributes)}) @@ -363,26 +364,20 @@ def test_roundtrip_string_data(self): self.assertDatasetEqual(expected, actual) def test_roundtrip_mask_and_scale(self): - expected = create_masked_and_scaled_data() - actual = self.roundtrip(expected) - self.assertDatasetEqual(expected, actual) - - def test_roundtrip_mask_and_scale_False(self): - expected = create_masked_and_scaled_data() - actual = self.roundtrip(expected, decode_cf=False) - self.assertDatasetEqual(expected, actual) + decoded = create_masked_and_scaled_data() + encoded = create_encoded_masked_and_scaled_data() + self.assertDatasetEqual(decoded, self.roundtrip(decoded)) + self.assertDatasetEqual(encoded, + self.roundtrip(decoded, decode_cf=False)) + self.assertDatasetEqual(decoded, self.roundtrip(encoded)) + self.assertDatasetEqual(encoded, + self.roundtrip(encoded, decode_cf=False)) def test_roundtrip_example_1_netcdf(self): expected = open_dataset(os.path.join(_test_data_path, 'example_1.nc')) actual = self.roundtrip(expected) self.assertDatasetEqual(expected, actual) - def test_encoded_masked_and_scaled_data(self): - original = create_encoded_masked_and_scaled_data() - actual = self.roundtrip(original, decode_cf=True) - expected = create_masked_and_scaled_data() - self.assertDatasetEqual(expected, actual) - class NetCDF4DataTest(DatasetIOCases, TestCase): def get_store(self): @@ -418,9 +413,10 @@ def test_open_encodings(self): ds.close() expected = Dataset() - expected['time'] = ('time', pd.date_range('1999-01-05', periods=10)) - expected['time'].encoding['units'] = units - expected['time'].encoding['dtype'] = np.dtype('int32') + + time = pd.date_range('1999-01-05', periods=10) + encoding = {'units': units, 'dtype': np.dtype('int32')} + expected['time'] = ('time', time, {}, encoding) actual = open_dataset(tmp_file) diff --git a/test/test_utils.py b/test/test_utils.py index 646484bdc6a..d1bf68bd5e7 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -107,25 +107,25 @@ def test_ordered_dict_intersection(self): def test_dict_equal(self): x = 
OrderedDict() x['a'] = 3 - x['b'] = np.array([1,2,3]) + x['b'] = np.array([1, 2, 3]) y = OrderedDict() - y['b'] = np.array([1.0,2.0,3.0]) + y['b'] = np.array([1.0, 2.0, 3.0]) y['a'] = 3 self.assertTrue(utils.dict_equal(x, y)) # two nparrays are equal - y['b'] = [1,2,3] # np.array not the same as a list + y['b'] = [1, 2, 3] # np.array not the same as a list self.assertFalse(utils.dict_equal(x, y)) # nparray != list - x['b'] = [1.0,2.0,3.0] - self.assertTrue(utils.dict_equal(x,y)) # list vs. list + x['b'] = [1.0, 2.0, 3.0] + self.assertTrue(utils.dict_equal(x, y)) # list vs. list x['c'] = None - self.assertFalse(utils.dict_equal(x,y)) # new key in x + self.assertFalse(utils.dict_equal(x, y)) # new key in x x['c'] = np.nan y['c'] = np.nan - self.assertFalse(utils.dict_equal(x,y)) # as intended, nan != nan + self.assertFalse(utils.dict_equal(x, y)) # as intended, nan != nan x['c'] = np.inf y['c'] = np.inf - self.assertTrue(utils.dict_equal(x,y)) # inf == inf + self.assertTrue(utils.dict_equal(x, y)) # inf == inf y = dict(y) - self.assertTrue(utils.dict_equal(x,y)) # different dictionary types are fine + self.assertTrue(utils.dict_equal(x, y)) # different dictionary types are fine def test_frozen(self): x = utils.Frozen(self.x) From c647ed6e0ab1eea408e0264074d2a3efc091ee2f Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Fri, 28 Feb 2014 13:41:21 -0800 Subject: [PATCH 8/8] Reverted an accidental change to utils.dict_equal --- src/xray/utils.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/xray/utils.py b/src/xray/utils.py index a13812e242d..3579827ff39 100644 --- a/src/xray/utils.py +++ b/src/xray/utils.py @@ -287,13 +287,6 @@ def dict_equal(first, second): return False elif v1 != v2: return False - if isinstance(v1, np.ndarray) != isinstance(v2, np.ndarray): - return False # one is an ndarray, other is not - elif (isinstance(v1, np.ndarray) and isinstance(v2, np.ndarray)): - if not np.array_equal(v1, v2): - return False - elif v1 != v2: - return False return True def ordered_dict_intersection(first_dict, second_dict, compat=operator.eq):
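With the accidental duplication backed out, dict_equal behaves exactly as
test_dict_equal above describes. A short usage sketch (the import path is
assumed for illustration):

    import numpy as np
    from collections import OrderedDict

    from xray import utils  # assumed import path

    x = OrderedDict([('a', 3), ('b', np.array([1, 2, 3]))])
    y = OrderedDict([('b', np.array([1.0, 2.0, 3.0])), ('a', 3)])
    assert utils.dict_equal(x, y)       # ndarray values compare elementwise
    y['b'] = [1, 2, 3]
    assert not utils.dict_equal(x, y)   # ndarray vs. list is a mismatch
    x['b'] = [1.0, 2.0, 3.0]
    assert utils.dict_equal(x, y)       # list vs. list compares fine
    x['c'] = y['c'] = np.nan
    assert not utils.dict_equal(x, y)   # nan != nan, by design

Keeping NaN-valued attributes unequal is deliberate: it matches Python's own
nan != nan semantics.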