From d9a4c9cf3f5b0490b95b47254fa45d70d58d765a Mon Sep 17 00:00:00 2001
From: Stephan Hoyer
Date: Wed, 26 Feb 2014 22:48:00 -0800
Subject: [PATCH 1/8] Handle mask_and_scale ourselves instead of using netCDF4

This lets us use NaNs instead of masked arrays to indicate missing values.
---
 src/xray/backends.py    |  91 +++------------
 src/xray/common.py      |  10 +-
 src/xray/conventions.py | 253 +++++++++++++++++++++++++++-------------
 src/xray/utils.py       |   2 +-
 src/xray/xarray.py      |  26 +++--
 test/__init__.py        |  11 ++
 test/test_dataset.py    | 105 ++++++++++++-----
 test/test_xarray.py     |   7 +-
 8 files changed, 298 insertions(+), 207 deletions(-)

diff --git a/src/xray/backends.py b/src/xray/backends.py
index 3a67f1affea..2b75db133a0 100644
--- a/src/xray/backends.py
+++ b/src/xray/backends.py
@@ -13,7 +13,8 @@
 from collections import OrderedDict
 
 import xarray
-import conventions
+from conventions import (decode_cf_variable, encode_cf_variable,
+                         is_valid_nc3_name, coerce_nc3_dtype)
 from utils import FrozenOrderedDict, Frozen, datetimeindex2num
 
 
@@ -59,36 +60,6 @@ def sync(self):
         pass
 
 
-def convert_to_cf_variable(array):
-    """Converts an XArray into an XArray suitable for saving as a netCDF
-    variable
-    """
-    data = array.data
-    attributes = array.attributes.copy()
-    if isinstance(data, pd.DatetimeIndex):
-        # DatetimeIndex objects need to be encoded into numeric arrays
-        (data, units, calendar) = datetimeindex2num(data)
-        attributes['units'] = units
-        attributes['calendar'] = calendar
-    elif data.dtype == np.dtype('O'):
-        # Unfortunately, pandas.Index arrays often have dtype=object even if
-        # they were created from an array with a sensible datatype (e.g.,
-        # pandas.Float64Index always has dtype=object for some reason). Because
-        # we allow for doing math with coordinates, these object arrays can
-        # propagate onward to other variables, which is why we don't only apply
-        # this check to XArrays with data that is a pandas.Index.
-        dtype = np.array(data.reshape(-1)[0]).dtype
-        # N.B. the "astype" call will fail if data cannot be cast to the type
-        # of its first element (which is probably the only sensible thing to
-        # do).
-        data = np.asarray(data).astype(dtype)
-    return xarray.XArray(array.dimensions, data, attributes)
-
-
-def convert_scipy_variable(var):
-    return xarray.XArray(var.dimensions, var.data, var._attributes)
-
-
 class ScipyDataStore(AbstractDataStore):
     """
     Stores data using the scipy.io.netcdf package.
@@ -101,7 +72,8 @@ def __init__(self, fobj, *args, **kwdargs):
 
     @property
     def variables(self):
-        return FrozenOrderedDict((k, convert_scipy_variable(v))
+        return FrozenOrderedDict((k, decode_cf_variable(v.dimensions, v.data,
+                                                        v._attributes))
                                  for k, v in self.ds.variables.iteritems())
 
     @property
@@ -119,30 +91,16 @@ def set_dimension(self, name, length):
         self.ds.createDimension(name, length)
 
     def _validate_attr_key(self, key):
-        if not conventions.is_valid_name(key):
+        if not is_valid_nc3_name(key):
            raise ValueError("Not a valid attribute name")
 
    def _cast_attr_value(self, value):
-        # Strings get special handling because netCDF treats them as
-        # character arrays. Everything else gets coerced to a numpy
-        # vector. netCDF treats scalars as 1-element vectors. Arrays of
-        # non-numeric type are not allowed.
        if isinstance(value, basestring):
-            # netcdf attributes should be unicode
            value = unicode(value)
        else:
-            try:
-                value = conventions.coerce_type(np.atleast_1d(np.asarray(value)))
-            except:
-                raise ValueError("Not a valid value for a netCDF attribute")
+            value = coerce_nc3_dtype(np.atleast_1d(value))
            if value.ndim > 1:
-                raise ValueError("netCDF attributes must be vectors " +
-                                 "(1-dimensional)")
-            value = conventions.coerce_type(value)
-            if str(value.dtype) not in conventions.TYPEMAP:
-                # A plain string attribute is okay, but an array of
-                # string objects is not okay!
-                raise ValueError("Can not convert to a valid netCDF type")
+                raise ValueError("netCDF attributes must be 1-dimensional")
        return value

    def set_attribute(self, key, value):
@@ -150,11 +108,8 @@ def set_attribute(self, key, value):
         setattr(self.ds, key, self._cast_attr_value(value))
 
     def set_variable(self, name, variable):
-        variable = convert_to_cf_variable(variable)
-        data = variable.data
-        dtype_convert = {'int64': 'int32', 'float64': 'float32'}
-        if str(data.dtype) in dtype_convert:
-            data = np.asarray(data, dtype=dtype_convert[str(data.dtype)])
+        variable = encode_cf_variable(variable)
+        data = coerce_nc3_dtype(variable.data)
         self.ds.createVariable(name, data.dtype, variable.dimensions)
         scipy_var = self.ds.variables[name]
         scipy_var[:] = data[:]
@@ -169,31 +124,18 @@ def sync(self):
         self.ds.flush()
 
 
-def convert_nc4_variable(var):
-    # we don't want to see scale_factor and add_offset in the attributes
-    # since the netCDF4 package automatically scales the data on read.
-    # If we kept scale_factor and add_offset around and did this:
-    #
-    # foo = ncdf4.Dataset('foo.nc')
-    # ncdf4.dump(foo, 'bar.nc')
-    # bar = ncdf4.Dataset('bar.nc')
-    #
-    # you would find that any packed variables in the original
-    # netcdf file would now have been scaled twice!
-    attr = OrderedDict((k, var.getncattr(k)) for k in var.ncattrs()
-                       if k not in ['scale_factor', 'add_offset'])
-    return xarray.XArray(var.dimensions, var, attr, indexing_mode='orthogonal')
-
-
 class NetCDF4DataStore(AbstractDataStore):
     def __init__(self, filename, *args, **kwdargs):
-        # TODO: set auto_maskandscale=True so we can handle the array
-        # packing/unpacking ourselves (using NaN instead of masked arrays)
         self.ds = nc4.Dataset(filename, *args, **kwdargs)
 
     @property
     def variables(self):
-        return FrozenOrderedDict((k, convert_nc4_variable(v))
+        def convert_variable(var):
+            attr = OrderedDict((k, var.getncattr(k)) for k in var.ncattrs())
+            var.set_auto_maskandscale(False)
+            return decode_cf_variable(var.dimensions, var, attr,
+                                      indexing_mode='orthogonal')
+        return FrozenOrderedDict((k, convert_variable(v))
                                  for k, v in self.ds.variables.iteritems())
 
     @property
@@ -217,7 +159,7 @@ def _cast_data(self, data):
         return data
 
     def set_variable(self, name, variable):
-        variable = convert_to_cf_variable(variable)
+        variable = encode_cf_variable(variable)
         # netCDF4 will automatically assign a fill value
         # depending on the datatype of the variable.  Here
         # we let the package handle the _FillValue attribute
@@ -228,6 +170,7 @@ def set_variable(self, name, variable):
             dimensions=variable.dimensions,
             fill_value=fill_value)
         nc4_var = self.ds.variables[name]
+        nc4_var.set_auto_maskandscale(False)
         nc4_var[:] = variable.data[:]
         nc4_var.setncatts(variable.attributes)
 
diff --git a/src/xray/common.py b/src/xray/common.py
index de4035fef7e..36d7032a122 100644
--- a/src/xray/common.py
+++ b/src/xray/common.py
@@ -35,19 +35,19 @@ def __len__(self):
         return len(self._data)
 
     def __nonzero__(self):
-        return bool(self._data)
+        return bool(self.data)
 
     def __float__(self):
-        return float(self._data)
+        return float(self.data)
 
     def __int__(self):
-        return int(self._data)
+        return int(self.data)
 
     def __complex__(self):
-        return complex(self._data)
+        return complex(self.data)
 
     def __long__(self):
-        return long(self._data)
+        return long(self.data)
 
     # adapted from pandas.NDFrame
     # https://github.com/pydata/pandas/blob/master/pandas/core/generic.py#L699
diff --git a/src/xray/conventions.py b/src/xray/conventions.py
index 637aa5943b4..1d0b022ba44 100644
--- a/src/xray/conventions.py
+++ b/src/xray/conventions.py
@@ -1,41 +1,11 @@
-import numpy as np
 import unicodedata
 
-NULL = '\x00'
-NC_BYTE = '\x00\x00\x00\x01'
-NC_CHAR = '\x00\x00\x00\x02'
-NC_SHORT = '\x00\x00\x00\x03'
-# netCDF-3 only supports 32-bit integers
-NC_INT = '\x00\x00\x00\x04'
-NC_FLOAT = '\x00\x00\x00\x05'
-NC_DOUBLE = '\x00\x00\x00\x06'
-
-# Map between netCDF type and numpy dtype and vice versa. Due to a bug
-# in the __hash__() method of numpy dtype objects (fixed in development
-# release of numpy), we need to explicitly match byteorder for dict
-# lookups to succeed. Here we normalize to native byte order.
-#
-# NC_CHAR is a special case because netCDF represents strings as
-# character arrays. When NC_CHAR is encountered as the type of an
-# attribute value, this TYPEMAP is not consulted and the data is read
-# as a string. However, when NC_CHAR is encountered as the type of a
-# variable, then the data is read is a numpy array of 1-char elements
-# (equivalently, length-1 raw "strings"). There is no support for numpy
-# arrays of multi-character strings.
-TYPEMAP = {
-    # we could use np.dtype's as key/values except __hash__ comparison of
-    # numpy.dtype is broken in older versions of numpy. If you must compare
-    # and cannot upgrade, use __eq__.This bug is
-    # known to be fixed in numpy version 1.3
-    NC_BYTE: 'int8',
-    NC_CHAR: '|S1',
-    NC_SHORT: 'int16',
-    NC_INT: 'int32',
-    NC_FLOAT: 'float32',
-    NC_DOUBLE: 'float64',
-    }
-for k in TYPEMAP.keys():
-    TYPEMAP[TYPEMAP[k]] = k
+import numpy as np
+import pandas as pd
+
+import xarray
+import utils
+
 
 # Special characters that are permitted in netCDF names except in the
 # 0th position of the string
@@ -43,61 +13,45 @@
 
 # The following are reserved names in CDL and may not be used as names of
 # variables, dimension, attributes
-_reserved_names = set([
-    'byte',
-    'char',
-    'short',
-    'ushort',
-    'int',
-    'uint',
-    'int64',
-    'uint64',
-    'float'
-    'real',
-    'double',
-    'bool',
-    'string',
-    ])
+_reserved_names = set(['byte', 'char', 'short', 'ushort', 'int', 'uint',
+                       'int64', 'uint64', 'float', 'real', 'double', 'bool',
+                       'string'])
+
 
 def pretty_print(x, numchars):
     """Given an object x, call x.__str__() and format the returned
     string so that it is numchars long, padding with trailing spaces or
     truncating with ellipses as necessary"""
     s = str(x)
     if len(s) > numchars:
         return s[:(numchars - 3)] + '...'
     else:
         return s
 
 
-def coerce_type(arr):
-    """Coerce a numeric data type to a type that is compatible with
-    netCDF-3
 
-    netCDF-3 can not handle 64-bit integers, but on most platforms
-    Python integers are int64. To work around this discrepancy, this
-    helper function coerces int64 arrays to int32. An exception is
-    raised if this coercion is not safe.
+def coerce_nc3_dtype(arr):
+    """Coerce an array to a data type that can be stored in a netCDF-3 file
 
-    netCDF-3 can not handle booleans, but booleans can be trivially
-    (albeit wastefully) represented as bytes. To work around this
-    discrepancy, this helper function coerces bool arrays to int8.
+    This function performs the following dtype conversions:
+        int64 -> int32
+        float64 -> float32
+        bool -> int8
+
+    Data is checked for equality, or equivalence with the default values of
+    `np.allclose`.
     """
-    # Comparing the char attributes of numpy dtypes is inelegant, but this is
-    # the fastest test of equivalence that is invariant to endianness
-    if arr.dtype.char == 'l':  # np.dtype('int64')
-        cast_arr = arr.astype(
-            np.dtype('int32').newbyteorder(arr.dtype.byteorder))
-        if not (cast_arr == arr).all():
-            raise ValueError("array contains integer values that " +
-                             "are not representable as 32-bit signed integers")
-        return cast_arr
-    elif arr.dtype.char == '?':  # np.dtype('bool')
-        # bool
-        cast_arr = arr.astype(
-            np.dtype('int8').newbyteorder(arr.dtype.byteorder))
-        return cast_arr
-    else:
-        return arr
+    dtype = str(arr.dtype)
+    dtype_map = {'int64': 'int32', 'float64': 'float32', 'bool': 'int8'}
+    if dtype in dtype_map:
+        new_dtype = dtype_map[dtype]
+        cast_arr = arr.astype(new_dtype)
+        if (('int' in dtype and not (cast_arr == arr).all())
+            or ('float' in dtype and not np.allclose(cast_arr, arr))):
+            raise ValueError('could not safely cast array from dtype %s to %s'
+                             % (dtype, new_dtype))
+        arr = cast_arr
+    return arr
+
 
 def _isalnumMUTF8(c):
     """Return True if the given UTF-8 encoded character is alphanumeric
@@ -105,10 +59,11 @@ def _isalnumMUTF8(c):
 
     Input is not checked!
     """
-    return (c.isalnum() or (len(c.encode('utf-8')) > 1))
+    return c.isalnum() or (len(c.encode('utf-8')) > 1)
+
 
-def is_valid_name(s):
-    """Test whether an object can be validly converted to a netCDF
+def is_valid_nc3_name(s):
+    """Test whether an object can be validly converted to a netCDF-3
     dimension, variable or attribute name
 
     Earlier versions of the netCDF C-library reference implementation
@@ -135,5 +90,137 @@ def is_valid_name(s):
         ('/' not in s) and
         (s[-1] != ' ') and
         (_isalnumMUTF8(s[0]) or (s[0] == '_')) and
-        all((_isalnumMUTF8(c) or c in _specialchars for c in s))
-        )
\ No newline at end of file
+        all((_isalnumMUTF8(c) or c in _specialchars for c in s)))
+
+
+class MaskedAndScaledArray(object):
+    """Wrapper around array-like objects to create a new indexable object where
+    values, when accessed, are automatically scaled and masked according to
+    CF conventions for packed and missing data values
+
+    New values are given by the formula:
+        original_values * scale_factor + add_offset
+
+    Values can only be accessed via `__getitem__`:
+
+    >>> x = MaskedAndScaledArray(np.array([-99, -1, 0, 1, 2]), -99, 0.01, 1)
+    >>> x
+    MaskedAndScaledArray(array([-99, -1, 0, 1, 2]), fill_value=-99,
+    scale_factor=0.01, add_offset=1)
+    >>> x[:]
+    array([  nan,  0.99,  1.  ,  1.01,  1.02])
+
+    References
+    ----------
+    http://www.unidata.ucar.edu/software/netcdf/docs/BestPractices.html
+    """
+    def __init__(self, array, fill_value=None, scale_factor=None,
+                 add_offset=None):
+        """
+        Parameters
+        ----------
+        array : array-like
+            Original array of values to wrap
+        fill_value : number, optional
+            All values equal to fill_value in the original array are replaced
+            by NaN.
+        scale_factor : number, optional
+            Multiply entries in the original array by this number.
+        add_offset : number, optional
+            After applying scale_factor, add this number to entries in the
+            original array.
+        """
+        self.array = array
+        self.scale_factor = scale_factor
+        self.add_offset = add_offset
+        self.fill_value = fill_value
+
+    @property
+    def dtype(self):
+        return np.dtype('float')
+
+    @property
+    def shape(self):
+        return self.array.shape
+
+    @property
+    def size(self):
+        return self.array.size
+
+    @property
+    def ndim(self):
+        return self.array.ndim
+
+    def __len__(self):
+        return len(self.array)
+
+    def __array__(self):
+        return self[...]
+
+    def __getitem__(self, key):
+        # cast to float to ensure NaN is meaningful
+        values = np.array(self.array[key], dtype=float, copy=True)
+        if self.fill_value is not None:
+            values[values == self.fill_value] = np.nan
+        if self.scale_factor is not None:
+            values *= self.scale_factor
+        if self.add_offset is not None:
+            values += self.add_offset
+        return values
+
+    def __repr__(self):
+        return ("%s(%r, fill_value=%r, scale_factor=%r, add_offset=%r)" %
+                (type(self).__name__, self.array, self.fill_value,
+                 self.scale_factor, self.add_offset))
+
+
+def encode_cf_variable(array):
+    """Converts an XArray into an XArray suitable for saving as a netCDF
+    variable
+    """
+    data = array.data
+    attributes = array.attributes.copy()
+    if isinstance(data, pd.DatetimeIndex):
+        # DatetimeIndex objects need to be encoded into numeric arrays
+        (data, units, calendar) = utils.datetimeindex2num(data)
+        attributes['units'] = units
+        attributes['calendar'] = calendar
+    elif data.dtype == np.dtype('O'):
+        # Unfortunately, pandas.Index arrays often have dtype=object even if
+        # they were created from an array with a sensible datatype (e.g.,
+        # pandas.Float64Index always has dtype=object for some reason). Because
+        # we allow for doing math with coordinates, these object arrays can
+        # propagate onward to other variables, which is why we don't only apply
+        # this check to XArrays with data that is a pandas.Index.
+        dtype = np.array(data.reshape(-1)[0]).dtype
+        # N.B. the "astype" call will fail if data cannot be cast to the type
+        # of its first element (which is probably the only sensible thing to
+        # do).
+        data = np.asarray(data).astype(dtype)
+
+    # unscale/mask
+    if any(k in attributes for k in ['add_offset', 'scale_factor']):
+        data = np.array(data, dtype=float, copy=True)
+        if 'add_offset' in attributes:
+            data -= attributes['add_offset']
+        if 'scale_factor' in attributes:
+            data /= attributes['scale_factor']
+
+    # restore original dtype
+    if 'encoded_dtype' in attributes:
+        data = data.astype(attributes.pop('encoded_dtype'))
+
+    return xarray.XArray(array.dimensions, data, attributes)
+
+
+def decode_cf_variable(dimensions, data, attributes, indexing_mode='numpy'):
+    attributes = attributes.copy()
+    attributes['encoded_dtype'] = data.dtype
+
+    mask_and_scale_attrs = ['_FillValue', 'scale_factor', 'add_offset']
+    if any(k in attributes for k in mask_and_scale_attrs):
+        data = MaskedAndScaledArray(data, attributes.get('_FillValue'),
+                                    attributes.get('scale_factor'),
+                                    attributes.get('add_offset'))
+
+    return xarray.XArray(dimensions, data, attributes, indexing_mode)
diff --git a/src/xray/utils.py b/src/xray/utils.py
index d2a3f7c15b8..b2b0f290257 100644
--- a/src/xray/utils.py
+++ b/src/xray/utils.py
@@ -169,7 +169,7 @@ def xarray_equal(v1, v2, rtol=1e-05, atol=1e-08):
     """True if two objects have the same dimensions, attributes and data;
     otherwise False.
 
-    This function is necessary because `v1 == v2` for variables and dataviews
+    This function is necessary because `v1 == v2` for XArrays and DatasetArrays
     does element-wise comparisons (like numpy.ndarrays).
     """
     if (v1.dimensions == v2.dimensions
diff --git a/src/xray/xarray.py b/src/xray/xarray.py
index f4c3fa718c0..e06a5a8e1c4 100644
--- a/src/xray/xarray.py
+++ b/src/xray/xarray.py
@@ -324,7 +324,8 @@ def reduce(self, func, dimension=None, axis=None, **kwargs):
             for dim in dimension:
                 var = var._reduce(func, dim, **kwargs)
         else:
-            var = type(self)([], func(self.data, **kwargs), self.attributes)
+            var = type(self)([], func(self.data, **kwargs),
+                             self._math_safe_attributes())
             var._append_to_cell_methods(': '.join(self.dimensions)
                                         + ': ' + func.__name__)
         return var
@@ -342,7 +343,7 @@ def _reduce(self, f, dim, **kwargs):
         dims = tuple(dim for i, dim in enumerate(self.dimensions)
                      if axis not in [i, i - self.ndim])
         data = f(self.data, axis=axis, **kwargs)
-        new_var = type(self)(dims, data, self.attributes)
+        new_var = type(self)(dims, data, self._math_safe_attributes())
         new_var._append_to_cell_methods(self.dimensions[axis]
                                         + ': ' + f.__name__)
         return new_var
@@ -468,12 +469,16 @@ def from_stack(cls, variables, dimension='stacked_dimension',
     def __array_wrap__(self, obj, context=None):
         return type(self)(self.dimensions, obj, self.attributes)
 
+    def _math_safe_attributes(self):
+        return OrderedDict((k, v) for k, v in self.attributes.iteritems()
+                           if k not in ['units', 'encoded_dtype'])
+
     @staticmethod
     def _unary_op(f):
         @functools.wraps(f)
         def func(self, *args, **kwargs):
             return type(self)(self.dimensions, f(self.data, *args, **kwargs),
-                              self.attributes)
+                              self._math_safe_attributes())
         return func
 
     @staticmethod
@@ -486,11 +491,11 @@ def func(self, other):
             new_data = (f(self_data, other_data)
                         if not reflexive
                         else f(other_data, self_data))
-            if hasattr(other, 'attributes'):
-                new_attr = utils.ordered_dict_intersection(self.attributes,
-                                                           other.attributes)
-            else:
-                new_attr = self.attributes
+            new_attr = self._math_safe_attributes()
+            # TODO: reconsider handling of conflicting attributes
+            if hasattr(other, '_math_safe_attributes'):
+                new_attr = utils.ordered_dict_intersection(
+                    new_attr, other._math_safe_attributes())
             return type(self)(dims, new_data, new_attr)
         return func
 
@@ -503,8 +508,9 @@ def func(self, other):
                 raise ValueError('dimensions cannot change for in-place '
                                  'operations')
             self.data = f(self_data, other_data)
-            if hasattr(other, 'attributes'):
-                utils.remove_incompatible_items(self.attributes, other)
+            if hasattr(other, '_math_safe_attributes'):
+                utils.remove_incompatible_items(
+                    self.attributes, other._math_safe_attributes())
             return self
         return func
 
diff --git a/test/__init__.py b/test/__init__.py
index b4133911543..0195431894f 100644
--- a/test/__init__.py
+++ b/test/__init__.py
@@ -15,6 +15,17 @@ def assertXArrayNotEqual(self, v1, v2):
     def assertArrayEqual(self, a1, a2):
         assert_array_equal(a1, a2)
 
+    def assertDatasetEqual(self, d1, d2):
+        # this method is functionally equivalent to `assert d1 == d2`, but it
+        # checks each aspect of equality separately for easier debugging
+        self.assertEqual(sorted(d1.attributes.items()),
+                         sorted(d2.attributes.items()))
+        self.assertEqual(sorted(d1.variables), sorted(d2.variables))
+        for k in d1:
+            v1 = d1.variables[k]
+            v2 = d2.variables[k]
+            self.assertXArrayEqual(v1, v2)
+
 
 class ReturnItem(object):
     def __getitem__(self, key):
diff --git a/test/test_dataset.py b/test/test_dataset.py
index b1f63ca462b..fff724df923 100644
--- a/test/test_dataset.py
+++ b/test/test_dataset.py
@@ -1,10 +1,10 @@
 from collections import OrderedDict
-from copy import deepcopy
 from cStringIO import StringIO
 import os.path
 import unittest
 import tempfile
 
+import netCDF4 as nc4
 import numpy as np
 import pandas as pd
 
@@ -21,8 +21,8 @@
 _testvar = sorted(_vars.keys())[0]
 _testdim = sorted(_dims.keys())[0]
 
-def create_test_data(store=None):
-    obj = Dataset() if store is None else Dataset.load_store(store)
+def create_test_data():
+    obj = Dataset()
     obj['time'] = ('time', pd.date_range('2000-01-01', periods=1000))
     for k, d in sorted(_dims.items()):
         obj[k] = (k, np.arange(d))
@@ -37,7 +37,7 @@ def get_store(self):
         return backends.InMemoryDataStore()
 
     def test_repr(self):
-        data = create_test_data(self.get_store())
+        data = create_test_data()
         self.assertEqual('', repr(data))
@@ -51,7 +51,7 @@ def test_init(self):
             Dataset({'a': var1, 'x': var3})
 
     def test_groupby(self):
-        data = create_test_data(self.get_store())
+        data = create_test_data()
         for n, (t, sub) in enumerate(list(data.groupby('dim1'))[:3]):
             self.assertEqual(data['dim1'][n], t)
             self.assertXArrayEqual(data['var1'][n], sub['var1'])
@@ -142,7 +142,7 @@ def test_attributes(self):
         self.assertRaises(ValueError, b.attributes.__setitem__, 'foo', dict())
 
     def test_indexed_by(self):
-        data = create_test_data(self.get_store())
+        data = create_test_data()
         slicers = {'dim1': slice(None, None, 2), 'dim2': slice(0, 2)}
         ret = data.indexed_by(**slicers)
 
@@ -181,7 +181,7 @@ def test_indexed_by(self):
         self.assertItemsEqual({'dim2': 5, 'dim3': 10}, ret.dimensions)
 
     def test_labeled_by(self):
-        data = create_test_data(self.get_store())
+        data = create_test_data()
         int_slicers = {'dim1': slice(None, None, 2), 'dim2': slice(0, 2)}
         loc_slicers = {'dim1': slice(None, None, 2), 'dim2': slice(0, 1)}
         self.assertEqual(data.indexed_by(**int_slicers),
@@ -199,7 +199,7 @@ def test_labeled_by(self):
             time=pd.date_range('2000-01-01', periods=3)))
 
     def test_variable_indexing(self):
-        data = create_test_data(self.get_store())
+        data = create_test_data()
         v = data['var1']
         d1 = data['dim1']
         d2 = data['dim2']
@@ -212,7 +212,7 @@ def test_variable_indexing(self):
         self.assertXArrayEqual(v[:3, :2], v[range(3), range(2)])
 
     def test_select(self):
-        data = create_test_data(self.get_store())
+        data = create_test_data()
         ret = data.select(_testvar)
         self.assertXArrayEqual(data[_testvar], ret[_testvar])
         self.assertTrue(_vars.keys()[1] not in ret.variables)
@@ -223,7 +223,7 @@ def test_unselect(self):
         pass
 
     def test_copy(self):
-        data = create_test_data(self.get_store())
+        data = create_test_data()
         var = data.variables[_testvar]
         var.attributes['foo'] = 'hello world'
         var_copy = var.__deepcopy__()
@@ -239,7 +239,7 @@ def test_copy(self):
         self.assertNotEqual(id(var.attributes), id(var_copy.attributes))
 
     def test_rename(self):
-        data = create_test_data(self.get_store())
+        data = create_test_data()
         newnames = {'var1': 'renamed_var1', 'dim2': 'renamed_dim2'}
         renamed = data.renamed(newnames)
 
@@ -264,7 +264,7 @@ def test_rename(self):
         self.assertTrue('dim2' not in renamed.dimensions)
 
     def test_merge(self):
-        data = create_test_data(self.get_store())
+        data = create_test_data()
         ds1 = data.select('var1')
         ds2 = data.select('var3')
         expected = data.select('var1', 'var3')
@@ -276,7 +276,7 @@ def test_merge(self):
             ds1.merge(ds2.renamed({'var3': 'var1'}))
 
     def test_getitem(self):
-        data = create_test_data(self.get_store())
+        data = create_test_data()
         data['time'] = ('time', np.arange(1000, dtype=np.int32),
                         {'units': 'days since 2000-01-01'})
         self.assertIsInstance(data['var1'], DatasetArray)
@@ -292,7 +292,7 @@ def test_getitem(self):
     def test_setitem(self):
         # assign a variable
         var = XArray(['dim1'], np.random.randn(100))
-        data1 = create_test_data(self.get_store())
+        data1 = create_test_data()
         data1['A'] = var
         data2 = data1.copy()
         data2['A'] = var
@@ -311,7 +311,7 @@ def test_write_store(self):
         store = self.get_store()
         expected.dump_to_store(store)
         actual = Dataset.load_store(store)
-        self.assertEquals(expected, actual)
+        self.assertDatasetEqual(expected, actual)
 
     def test_to_dataframe(self):
         x = np.random.randn(10)
@@ -337,27 +337,68 @@ def test_to_dataframe(self):
         self.assertTrue(expected.equals(actual))
 
 
+def create_masked_and_scaled_data():
+    x = np.array([np.nan, np.nan, 10, 10.1, 10.2])
+    attr = {'_FillValue': -1, 'add_offset': 10, 'scale_factor': 0.1,
+            'encoded_dtype': np.int16}
+    return Dataset({'x': ('t', x, attr)})
+
+
 class NetCDF4DataTest(DataTest):
     def get_store(self):
         f, self.tmp_file = tempfile.mkstemp(suffix='.nc')
         os.close(f)
         return backends.NetCDF4DataStore(self.tmp_file, mode='w')
 
+    def tearDown(self):
+        if hasattr(self, 'tmp_file') and os.path.exists(self.tmp_file):
+            os.remove(self.tmp_file)
+
     def test_dump_and_open_dataset(self):
-        data = create_test_data(self.get_store())
+        for data in [create_test_data(), create_masked_and_scaled_data()]:
+            f, tmp_file = tempfile.mkstemp(suffix='.nc')
+            os.close(f)
+            data.dump(tmp_file)
+
+            expected = data.copy()
+            actual = open_dataset(tmp_file)
+            self.assertDatasetEqual(expected, actual)
+            os.remove(tmp_file)
+
+    def test_mask_and_scale(self):
         f, tmp_file = tempfile.mkstemp(suffix='.nc')
         os.close(f)
-        data.dump(tmp_file)
 
-        expected = data.copy()
-        actual = open_dataset(tmp_file)
-        self.assertEquals(expected, actual)
+        nc = nc4.Dataset(tmp_file, mode='w')
+        nc.createDimension('t', 5)
+        nc.createVariable('x', 'int16', ('t',), fill_value=-1)
+        v = nc.variables['x']
+        v.set_auto_maskandscale(False)
+        v.add_offset = 10
+        v.scale_factor = 0.1
+        v[:] = np.array([-1, -1, 0, 1, 2])
+        nc.close()
+
+        # first make sure netCDF4 reads the masked and scaled data correctly
+        nc = nc4.Dataset(tmp_file, mode='r')
+        expected = np.ma.array([-1, -1, 10, 10.1, 10.2],
+                               mask=[True, True, False, False, False])
+        actual = nc.variables['x'][:]
+        self.assertArrayEqual(expected, actual)
+
+        def replace_nan(x):
+            # replace NaN with a sentinel value because xarray_equal is not
+            # NaN safe
+            x[np.isnan(x)] = -9999
+            return x
+
+        # now check xray
+        ds = open_dataset(tmp_file)
+        expected = create_masked_and_scaled_data()
+        self.assertDatasetEqual(replace_nan(expected['x']).dataset,
+                                replace_nan(ds['x']).dataset)
         os.remove(tmp_file)
 
-    def tearDown(self):
-        if hasattr(self, 'tmp_file') and os.path.exists(self.tmp_file):
-            os.remove(self.tmp_file)
-
 
 class ScipyDataTest(DataTest):
     def get_store(self):
@@ -365,12 +406,11 @@ def get_store(self):
         return backends.ScipyDataStore(fobj, 'w')
 
     def test_dump_and_open_dataset(self):
-        data = create_test_data(self.get_store())
-        serialized = data.dumps()
-
-        expected = data.copy()
-        actual = open_dataset(StringIO(serialized))
-        self.assertEquals(expected, actual)
+        for data in [create_test_data(), create_masked_and_scaled_data()]:
+            serialized = data.dumps()
+            expected = data.copy()
+            actual = open_dataset(StringIO(serialized))
+            self.assertDatasetEqual(expected, actual)
 
     def test_open_and_reopen_existing(self):
         data = open_dataset(os.path.join(_test_data_path, 'example_1.nc'))
@@ -378,7 +418,10 @@ def test_open_and_reopen_existing(self):
 
         expected = data.copy()
         actual = open_dataset(StringIO(serialized))
-        self.assertEquals(expected, actual)
+        # TODO: remove these print statements:
+        print data
+        print actual
+        self.assertDatasetEqual(expected, actual)
 
     def test_repr(self):
         # scipy.io.netcdf does not keep track of dimension order :(
diff --git a/test/test_xarray.py b/test/test_xarray.py
index f91642e6404..f5323db3a74 100644
--- a/test/test_xarray.py
+++ b/test/test_xarray.py
@@ -123,10 +123,11 @@ def test_1d_math(self):
         self.assertArrayEqual((x * v).data, x ** 2)
         self.assertArrayEqual(v - y, v - 1)
         self.assertArrayEqual(y - v, 1 - v)
-        # verify attributes
+        # verify math-safe attributes
         v2 = XArray(['x'], x, {'units': 'meters'})
-        self.assertXArrayEqual(v2, +v2)
-        self.assertXArrayEqual(v2, 0 + v2)
+        self.assertXArrayEqual(v, +v2)
+        v3 = XArray(['x'], x, {'something': 'else'})
+        self.assertXArrayEqual(v3, +v3)
         # binary ops with all variables
         self.assertArrayEqual(v + v, 2 * v)
         w = XArray(['x'], y, {'foo': 'bar'})

From 75bce5bb9c31e38e3599f3200e839832a90df521 Mon Sep 17 00:00:00 2001
From: Stephan Hoyer
Date: Thu, 27 Feb 2014 12:11:58 -0800
Subject: [PATCH 2/8] Save serialization info to XArray.encoding

---
 src/xray/backends.py     | 27 +++++++++--------
 src/xray/conventions.py  | 54 ++++++++++++++++++++-------------
 src/xray/utils.py        | 30 +++++++++++++++----
 src/xray/xarray.py       | 65 ++++++++++++++++++++++++----------------
 test/test_conventions.py | 25 ++++++++++++++++
 test/test_dataset.py     |  9 ++----
 6 files changed, 142 insertions(+), 68 deletions(-)
 create mode 100644 test/test_conventions.py

diff --git a/src/xray/backends.py b/src/xray/backends.py
index 2b75db133a0..fa2bb1ac897 100644
--- a/src/xray/backends.py
+++ b/src/xray/backends.py
@@ -67,8 +67,11 @@ class ScipyDataStore(AbstractDataStore):
     be initialized with a StringIO object, allow for
     serialization.
     """
-    def __init__(self, fobj, *args, **kwdargs):
-        self.ds = netcdf.netcdf_file(fobj, *args, **kwdargs)
+    def __init__(self, filename_or_obj, mode='r', mmap=None, version=1,
+                 decode_mask_and_scale=True):
+        self.ds = netcdf.netcdf_file(filename_or_obj, mode=mode, mmap=mmap,
+                                     version=version)
+        self.decode_mask_and_scale = decode_mask_and_scale
 
     @property
     def variables(self):
@@ -125,16 +128,21 @@ def sync(self):
 
 
 class NetCDF4DataStore(AbstractDataStore):
-    def __init__(self, filename, *args, **kwdargs):
-        self.ds = nc4.Dataset(filename, *args, **kwdargs)
+    def __init__(self, filename, mode='r', clobber=True, diskless=False,
+                 persist=False, format='NETCDF4', decode_mask_and_scale=True):
+        self.ds = nc4.Dataset(filename, mode=mode, clobber=clobber,
+                              diskless=diskless, persist=persist,
+                              format=format)
+        self.decode_mask_and_scale = decode_mask_and_scale
 
     @property
     def variables(self):
         def convert_variable(var):
             attr = OrderedDict((k, var.getncattr(k)) for k in var.ncattrs())
             var.set_auto_maskandscale(False)
-            return decode_cf_variable(var.dimensions, var, attr,
-                                      indexing_mode='orthogonal')
+            return decode_cf_variable(
+                var.dimensions, var, attr, indexing_mode='orthogonal',
+                decode_mask_and_scale=self.decode_mask_and_scale)
         return FrozenOrderedDict((k, convert_variable(v))
                                  for k, v in self.ds.variables.iteritems())
 
@@ -153,18 +161,13 @@ def set_dimension(self, name, length):
     def set_attribute(self, key, value):
         self.ds.setncatts({key: value})
 
-    def _cast_data(self, data):
-        if isinstance(data, pd.DatetimeIndex):
-            data = datetimeindex2num(data)
-        return data
-
     def set_variable(self, name, variable):
         variable = encode_cf_variable(variable)
         # netCDF4 will automatically assign a fill value
         # depending on the datatype of the variable.  Here
         # we let the package handle the _FillValue attribute
         # instead of setting it ourselves.
-        fill_value = variable.attributes.pop('_FillValue', None)
+        fill_value = variable.encoding.get('_FillValue')
         self.ds.createVariable(varname=name,
                                datatype=variable.dtype,
                                dimensions=variable.dimensions,
diff --git a/src/xray/conventions.py b/src/xray/conventions.py
index 1d0b022ba44..d14ca39fa93 100644
--- a/src/xray/conventions.py
+++ b/src/xray/conventions.py
@@ -37,16 +37,18 @@ def coerce_nc3_dtype(arr):
         float64 -> float32
         bool -> int8
 
-    Data is checked for equality, or equivalence with the default values of
-    `np.allclose`.
+    Data is checked for equality, or equivalence (non-NaN values) with
+    `np.allclose` with the default keyword arguments.
     """
     dtype = str(arr.dtype)
     dtype_map = {'int64': 'int32', 'float64': 'float32', 'bool': 'int8'}
     if dtype in dtype_map:
         new_dtype = dtype_map[dtype]
+        # TODO: raise a warning whenever casting the data-type instead?
         cast_arr = arr.astype(new_dtype)
         if (('int' in dtype and not (cast_arr == arr).all())
-            or ('float' in dtype and not np.allclose(cast_arr, arr))):
+            or ('float' in dtype and
+                not utils.allclose_or_equiv(cast_arr, arr))):
             raise ValueError('could not safely cast array from dtype %s to %s'
                              % (dtype, new_dtype))
         arr = cast_arr
@@ -199,28 +201,40 @@ def encode_cf_variable(array):
         data = np.asarray(data).astype(dtype)
 
     # unscale/mask
-    if any(k in attributes for k in ['add_offset', 'scale_factor']):
+    encoding = array.encoding
+    if any(k in encoding for k in ['add_offset', 'scale_factor']):
         data = np.array(data, dtype=float, copy=True)
-        if 'add_offset' in attributes:
-            data -= attributes['add_offset']
-        if 'scale_factor' in attributes:
-            data /= attributes['scale_factor']
+        if 'add_offset' in encoding:
+            data -= encoding['add_offset']
+            attributes['add_offset'] = encoding['add_offset']
+        if 'scale_factor' in encoding:
+            data /= encoding['scale_factor']
+            attributes['add_offset'] = encoding['add_offset']
 
     # restore original dtype
-    if 'encoded_dtype' in attributes:
-        data = data.astype(attributes.pop('encoded_dtype'))
+    if 'dtype' in encoding:
+        data = data.astype(encoding['dtype'])
 
-    return xarray.XArray(array.dimensions, data, attributes)
+    return xarray.XArray(array.dimensions, data, attributes, encoding)
 
 
-def decode_cf_variable(dimensions, data, attributes, indexing_mode='numpy'):
-    attributes = attributes.copy()
-    attributes['encoded_dtype'] = data.dtype
+def decode_cf_variable(dimensions, data, attributes, indexing_mode='numpy',
+                       decode_mask_and_scale=True):
+    def pop_to(k, source, dest):
+        v = source.pop(k, None)
+        if v is not None:
+            dest[k] = v
+        return v
 
-    mask_and_scale_attrs = ['_FillValue', 'scale_factor', 'add_offset']
-    if any(k in attributes for k in mask_and_scale_attrs):
-        data = MaskedAndScaledArray(data, attributes.get('_FillValue'),
-                                    attributes.get('scale_factor'),
-                                    attributes.get('add_offset'))
+    encoding = {'dtype': data.dtype}
 
-    return xarray.XArray(dimensions, data, attributes, indexing_mode)
+    if decode_mask_and_scale:
+        fill_value = pop_to('_FillValue', attributes, encoding)
+        scale_factor = pop_to('scale_factor', attributes, encoding)
+        add_offset = pop_to('add_offset', attributes, encoding)
+        if (fill_value is not None or scale_factor is not None
+            or add_offset is not None):
+            data = MaskedAndScaledArray(data, fill_value, scale_factor,
+                                        add_offset)
+
+    return xarray.XArray(dimensions, data, attributes, indexing_mode, encoding)
diff --git a/src/xray/utils.py b/src/xray/utils.py
index b2b0f290257..0e46c5a5830 100644
--- a/src/xray/utils.py
+++ b/src/xray/utils.py
@@ -165,6 +165,24 @@ def datetimeindex2num(dates, units=None, calendar=None):
     return (num, units, calendar)
 
 
+def allclose_or_equiv(arr1, arr2, rtol=1e-5, atol=1e-8):
+    """Like np.allclose, but also allows values to NaN in both arrays
+    """
+    if arr1.shape != arr2.shape:
+        return False
+    nan_indices = np.isnan(arr1)
+    if not (nan_indices == np.isnan(arr2)).all():
+        return False
+    else:
+        if arr1.ndim > 0:
+            arr1 = arr1[~nan_indices]
+            arr2 = arr2[~nan_indices]
+        elif nan_indices:
+            # 0-d arrays can't be indexed, so just check if the value is NaN
+            return True
+        return np.allclose(arr1, arr2, rtol=rtol, atol=atol)
+
+
 def xarray_equal(v1, v2, rtol=1e-05, atol=1e-08):
     """True if two objects have the same dimensions, attributes and data;
     otherwise False.
@@ -182,17 +200,19 @@ def xarray_equal(v1, v2, rtol=1e-05, atol=1e-08):
         # _data is not part of the public interface, so it's okay if its
         # missing
         pass
-    # TODO: replace this with a NaN safe version.
-    # see: pandas.core.common.array_equivalent
+
+    def is_floating(arr):
+        return np.issubdtype(arr.dtype, float)
+
     data1 = v1.data
     data2 = v2.data
     if hasattr(data1, 'equals'):
         # handle pandas.Index objects
         return data1.equals(data2)
-    elif np.issubdtype(data1.dtype, (str, object)):
-        return np.array_equal(data1, data2)
+    elif is_floating(data1) or is_floating(data2):
+        return allclose_or_equiv(data1, data2)
     else:
-        return np.allclose(data1, data2, rtol=rtol, atol=atol)
+        return np.array_equal(data1, data2)
     else:
         return False
diff --git a/src/xray/xarray.py b/src/xray/xarray.py
index e06a5a8e1c4..693d0ae2420 100644
--- a/src/xray/xarray.py
+++ b/src/xray/xarray.py
@@ -38,7 +38,8 @@ class XArray(AbstractArray):
     outside the context of its parent Dataset (if you want such a fully
     described object, use a DatasetArray instead).
     """
-    def __init__(self, dims, data, attributes=None, indexing_mode='numpy'):
+    def __init__(self, dims, data, attributes=None, indexing_mode='numpy',
+                 encoding=None):
         """
         Parameters
         ----------
@@ -56,9 +57,15 @@ def __init__(self, dims, data, attributes=None, indexing_mode='numpy'):
             (with arrays). Two modes are supported: 'numpy' (fancy indexing
             like numpy.ndarray objects) and 'orthogonal' (array indexing
             accesses different dimensions independently, like netCDF4
-            variables). Accessing data from a Array always uses orthogonal
+            variables). Accessing data from an XArray always uses orthogonal
             indexing, so `indexing_mode` tells the variable whether index
             lookups need to be internally converted to numpy-style indexing.
+        encoding : dict_like or None, optional
+            Dictionary specifying how to encode this array's data into a
+            serialized format like netCDF4. Currently used keys (for netCDF)
+            include '_FillValue', 'scale_factor', 'add_offset' and 'dtype'.
+            Well-behaved code to serialize an XArray should ignore
+            unrecognized keys in this dictionary.
         """
         if isinstance(dims, basestring):
             dims = (dims,)
@@ -71,6 +78,7 @@ def __init__(self, dims, data, attributes=None, indexing_mode='numpy'):
             attributes = {}
         self._attributes = OrderedDict(attributes)
         self._indexing_mode = indexing_mode
+        self.encoding = dict({} if encoding is None else encoding)
 
     @property
     def data(self):
@@ -149,7 +157,8 @@ def __getitem__(self, key):
         # return a variable with the same indexing_mode, because data should
         # still be the same type as _data
         return type(self)(dimensions, new_data, self.attributes,
-                          indexing_mode=self._indexing_mode)
+                          indexing_mode=self._indexing_mode,
+                          encoding=self.encoding)
 
     def __setitem__(self, key, value):
         """__setitem__ is overloaded to access the underlying numpy data with
@@ -181,7 +190,8 @@ def _copy(self, deepcopy=False):
         # note:
         # dimensions is already an immutable tuple
         # attributes will be copied when the new Array is created
-        return type(self)(self.dimensions, data, self.attributes)
+        return type(self)(self.dimensions, data, self.attributes,
+                          encoding=self.encoding)
 
     def __copy__(self):
         return self._copy(deepcopy=False)
@@ -274,7 +284,8 @@ def transpose(self, *dimensions):
             dimensions = self.dimensions[::-1]
         axes = [self.dimensions.index(dim) for dim in dimensions]
         data = self.data.transpose(*axes)
-        return type(self)(dimensions, data, self.attributes)
+        return type(self)(dimensions, data, self.attributes,
+                          encoding=self.encoding)
 
     def reduce(self, func, dimension=None, axis=None, **kwargs):
         """Reduce this array by applying `func` along some dimension(s).
@@ -325,7 +336,7 @@ def reduce(self, func, dimension=None, axis=None, **kwargs):
                 var = var._reduce(func, dim, **kwargs)
         else:
             var = type(self)([], func(self.data, **kwargs),
-                             self._math_safe_attributes())
+                             _math_safe_attributes(self.attributes))
             var._append_to_cell_methods(': '.join(self.dimensions)
                                         + ': ' + func.__name__)
         return var
@@ -343,7 +354,8 @@ def _reduce(self, f, dim, **kwargs):
         dims = tuple(dim for i, dim in enumerate(self.dimensions)
                      if axis not in [i, i - self.ndim])
         data = f(self.data, axis=axis, **kwargs)
-        new_var = type(self)(dims, data, self._math_safe_attributes())
+        new_var = type(self)(dims, data,
+                             _math_safe_attributes(self.attributes))
         new_var._append_to_cell_methods(self.dimensions[axis]
                                         + ': ' + f.__name__)
         return new_var
@@ -469,16 +481,12 @@ def from_stack(cls, variables, dimension='stacked_dimension',
     def __array_wrap__(self, obj, context=None):
         return type(self)(self.dimensions, obj, self.attributes)
 
-    def _math_safe_attributes(self):
-        return OrderedDict((k, v) for k, v in self.attributes.iteritems()
-                           if k not in ['units', 'encoded_dtype'])
-
     @staticmethod
     def _unary_op(f):
         @functools.wraps(f)
         def func(self, *args, **kwargs):
             return type(self)(self.dimensions, f(self.data, *args, **kwargs),
-                              self._math_safe_attributes())
+                              _math_safe_attributes(self.attributes))
         return func
 
     @staticmethod
@@ -491,11 +499,11 @@ def func(self, other):
             new_data = (f(self_data, other_data)
                         if not reflexive
                         else f(other_data, self_data))
-            new_attr = self._math_safe_attributes()
+            new_attr = _math_safe_attributes(self.attributes)
             # TODO: reconsider handling of conflicting attributes
-            if hasattr(other, '_math_safe_attributes'):
+            if hasattr(other, 'attributes'):
                 new_attr = utils.ordered_dict_intersection(
-                    new_attr, other._math_safe_attributes())
+                    new_attr, _math_safe_attributes(other.attributes))
             return type(self)(dims, new_data, new_attr)
         return func
 
@@ -508,27 +516,32 @@ def func(self, other):
                 raise ValueError('dimensions cannot change for in-place '
                                  'operations')
             self.data = f(self_data, other_data)
-            if hasattr(other, '_math_safe_attributes'):
+            if hasattr(other, 'attributes'):
                 utils.remove_incompatible_items(
-                    self.attributes, other._math_safe_attributes())
+                    self.attributes, _math_safe_attributes(other.attributes))
             return self
         return func
 
 
 ops.inject_special_operations(XArray)
 
 
+def _math_safe_attributes(attributes):
+    return OrderedDict((k, v) for k, v in attributes.iteritems()
+                       if k not in ['units'])
+
+
 def broadcast_xarrays(first, second):
     """Given two XArrays, return two XArrays with matching dimensions and numpy
     broadcast compatible data.
 
     Parameters
     ----------
-    first, second : Array
-        Array objects to broadcast.
+    first, second : XArray
+        XArray objects to broadcast.
 
     Returns
     -------
-    first_broadcast, second_broadcast : Array
+    first_broadcast, second_broadcast : XArray
         Broadcast arrays. The data on each variable will be a view of the data
         on the corresponding original arrays, but dimensions will be reordered
         and inserted so that both broadcast arrays have the same
@@ -558,21 +571,23 @@ def broadcast_xarrays(first, second):
     # expand first_data's dimensions so it's broadcast compatible after
     # adding second's dimensions at the end
     first_data = first.data[(Ellipsis,) + (None,) * len(second_only_dims)]
-    new_first = XArray(dimensions, first_data, first.attributes)
+    new_first = XArray(dimensions, first_data, first.attributes,
+                       encoding=first.encoding)
     # expand and reorder second_data so the dimensions line up
     first_only_dims = [d for d in dimensions if d not in second.dimensions]
     second_dims = list(second.dimensions) + first_only_dims
     second_data = second.data[(Ellipsis,) + (None,) * len(first_only_dims)]
-    new_second = XArray(second_dims, second_data, first.attributes
-                        ).transpose(*dimensions)
+    new_second = XArray(second_dims, second_data, first.attributes,
+                        encoding=second.encoding).transpose(*dimensions)
     return new_first, new_second
 
 
 def _broadcast_xarray_data(self, other):
     if isinstance(other, dataset.Dataset):
         raise TypeError('datasets do not support mathematical operations')
-    elif all(hasattr(other, attr) for attr in ['dimensions', 'data', 'shape']):
-        # `other` satisfies the xray.Array API
+    elif all(hasattr(other, attr) for attr
+             in ['dimensions', 'data', 'shape', 'encoding']):
+        # `other` satisfies the necessary xray.Array API for broadcast_xarrays
         new_self, new_other = broadcast_xarrays(self, other)
         self_data = new_self.data
         other_data = new_other.data
diff --git a/test/test_conventions.py b/test/test_conventions.py
new file mode 100644
index 00000000000..58daf09eae5
--- /dev/null
+++ b/test/test_conventions.py
@@ -0,0 +1,25 @@
+import numpy as np
+
+from xray.conventions import MaskedAndScaledArray
+from . import TestCase
+
+
+class TestMaskedAndScaledArray(TestCase):
+    def test(self):
+        x = MaskedAndScaledArray(np.arange(3), fill_value=0)
+        self.assertEqual(x.dtype, np.dtype('float'))
+        self.assertEqual(x.shape, (3,))
+        self.assertEqual(x.size, 3)
+        self.assertEqual(x.ndim, 1)
+        self.assertEqual(len(x), 3)
+        self.assertArrayEqual([np.nan, 1, 2], x)
+
+        x = MaskedAndScaledArray(np.arange(3), add_offset=1)
+        self.assertArrayEqual(np.arange(3) + 1, x)
+
+        x = MaskedAndScaledArray(np.arange(3), scale_factor=2)
+        self.assertArrayEqual(2 * np.arange(3), x)
+
+        x = MaskedAndScaledArray(np.array([-99, -1, 0, 1, 2]), -99, 0.01, 1)
+        expected = np.array([np.nan, 0.99, 1, 1.01, 1.02])
+        self.assertArrayEqual(expected, x)
diff --git a/test/test_dataset.py b/test/test_dataset.py
index fff724df923..28029a83590 100644
--- a/test/test_dataset.py
+++ b/test/test_dataset.py
@@ -339,9 +339,9 @@ def test_to_dataframe(self):
 
 def create_masked_and_scaled_data():
     x = np.array([np.nan, np.nan, 10, 10.1, 10.2])
-    attr = {'_FillValue': -1, 'add_offset': 10, 'scale_factor': 0.1,
-            'encoded_dtype': np.int16}
-    return Dataset({'x': ('t', x, attr)})
+    encoding = {'_FillValue': -1, 'add_offset': 10, 'scale_factor': 0.1,
+                'dtype': np.int16}
+    return Dataset({'x': ('t', x, {}, encoding)})
 
 
 class NetCDF4DataTest(DataTest):
@@ -418,9 +418,6 @@ def test_open_and_reopen_existing(self):
 
         expected = data.copy()
         actual = open_dataset(StringIO(serialized))
-        # TODO: remove these print statements:
-        print data
-        print actual
         self.assertDatasetEqual(expected, actual)
 
     def test_repr(self):

From 8cc82580e83762d42c48a32bd378daf472df528b Mon Sep 17 00:00:00 2001
From: Stephan Hoyer
Date: Thu, 27 Feb 2014 12:59:27 -0800
Subject: [PATCH 3/8] Changes in response to github comments

---
 src/xray/backends.py    | 10 +++++-----
 src/xray/conventions.py | 32 +++++++++++++++++++-------------
 src/xray/utils.py       | 17 ++++++++---------
 3 files changed, 32 insertions(+), 27 deletions(-)

diff --git a/src/xray/backends.py b/src/xray/backends.py
index fa2bb1ac897..3d60a30e400 100644
--- a/src/xray/backends.py
+++ b/src/xray/backends.py
@@ -68,10 +68,10 @@ class ScipyDataStore(AbstractDataStore):
     serialization.
     """
     def __init__(self, filename_or_obj, mode='r', mmap=None, version=1,
-                 decode_mask_and_scale=True):
+                 mask_and_scale=True):
         self.ds = netcdf.netcdf_file(filename_or_obj, mode=mode, mmap=mmap,
                                      version=version)
-        self.decode_mask_and_scale = decode_mask_and_scale
+        self.mask_and_scale = mask_and_scale
 
     @property
     def variables(self):
@@ -129,11 +129,11 @@ def sync(self):
 
 class NetCDF4DataStore(AbstractDataStore):
     def __init__(self, filename, mode='r', clobber=True, diskless=False,
-                 persist=False, format='NETCDF4', decode_mask_and_scale=True):
+                 persist=False, format='NETCDF4', mask_and_scale=True):
         self.ds = nc4.Dataset(filename, mode=mode, clobber=clobber,
                               diskless=diskless, persist=persist,
                               format=format)
-        self.decode_mask_and_scale = decode_mask_and_scale
+        self.mask_and_scale = mask_and_scale
 
     @property
     def variables(self):
@@ -142,7 +142,7 @@ def convert_variable(var):
             var.set_auto_maskandscale(False)
             return decode_cf_variable(
                 var.dimensions, var, attr, indexing_mode='orthogonal',
-                decode_mask_and_scale=self.decode_mask_and_scale)
+                mask_and_scale=self.mask_and_scale)
         return FrozenOrderedDict((k, convert_variable(v))
                                  for k, v in self.ds.variables.iteritems())
diff --git a/src/xray/conventions.py b/src/xray/conventions.py
index d14ca39fa93..b645fb1d418 100644
--- a/src/xray/conventions.py
+++ b/src/xray/conventions.py
@@ -17,6 +17,10 @@
                        'int64', 'uint64', 'float', 'real', 'double', 'bool',
                        'string'])
 
+# These data-types aren't supported by netCDF3, so they are automatically
+# coerced instead as indicated by the "coerce_nc3_dtype" function
+_nc3_dtype_coercions = {'int64': 'int32', 'float64': 'float32', 'bool': 'int8'}
+
 
 def pretty_print(x, numchars):
     """Given an object x, call x.__str__() and format the returned
@@ -41,9 +45,8 @@ def coerce_nc3_dtype(arr):
         `np.allclose` with the default keyword arguments.
     """
     dtype = str(arr.dtype)
-    dtype_map = {'int64': 'int32', 'float64': 'float32', 'bool': 'int8'}
-    if dtype in dtype_map:
-        new_dtype = dtype_map[dtype]
+    if dtype in _nc3_dtype_coercions:
+        new_dtype = _nc3_dtype_coercions[dtype]
         # TODO: raise a warning whenever casting the data-type instead?
         cast_arr = arr.astype(new_dtype)
         if (('int' in dtype and not (cast_arr == arr).all())
@@ -199,16 +203,19 @@ def encode_cf_variable(array):
         # do).
         data = np.asarray(data).astype(dtype)
 
+    def get_to(source, dest, k):
+        v = source.get(k)
+        dest[k] = v
+        return v
+
     # unscale/mask
     encoding = array.encoding
     if any(k in encoding for k in ['add_offset', 'scale_factor']):
         data = np.array(data, dtype=float, copy=True)
         if 'add_offset' in encoding:
-            data -= encoding['add_offset']
-            attributes['add_offset'] = encoding['add_offset']
+            data -= get_to(encoding, attributes, 'add_offset')
         if 'scale_factor' in encoding:
-            data /= encoding['scale_factor']
-            attributes['add_offset'] = encoding['add_offset']
+            data /= get_to(encoding, attributes, 'scale_factor')
 
     # restore original dtype
     if 'dtype' in encoding:
@@ -219,8 +225,8 @@ def encode_cf_variable(array):
 
 
 def decode_cf_variable(dimensions, data, attributes, indexing_mode='numpy',
-                       decode_mask_and_scale=True):
-    def pop_to(k, source, dest):
+                       mask_and_scale=True):
+    def pop_to(source, dest, k):
         v = source.pop(k, None)
         if v is not None:
             dest[k] = v
@@ -228,10 +234,10 @@ def pop_to(source, dest, k):
 
     encoding = {'dtype': data.dtype}
 
-    if decode_mask_and_scale:
-        fill_value = pop_to('_FillValue', attributes, encoding)
-        scale_factor = pop_to('scale_factor', attributes, encoding)
-        add_offset = pop_to('add_offset', attributes, encoding)
+    if mask_and_scale:
+        fill_value = pop_to(attributes, encoding, '_FillValue')
+        scale_factor = pop_to(attributes, encoding, 'scale_factor')
+        add_offset = pop_to(attributes, encoding, 'add_offset')
         if (fill_value is not None or scale_factor is not None
             or add_offset is not None):
             data = MaskedAndScaledArray(data, fill_value, scale_factor,
diff --git a/src/xray/utils.py b/src/xray/utils.py
index 0e46c5a5830..c8cf26d26e1 100644
--- a/src/xray/utils.py
+++ b/src/xray/utils.py
@@ -166,21 +166,20 @@ def datetimeindex2num(dates, units=None, calendar=None):
 
 def allclose_or_equiv(arr1, arr2, rtol=1e-5, atol=1e-8):
-    """Like np.allclose, but also allows values to NaN in both arrays
+    """Like np.allclose, but also allows values to be NaN in both arrays
     """
     if arr1.shape != arr2.shape:
         return False
     nan_indices = np.isnan(arr1)
     if not (nan_indices == np.isnan(arr2)).all():
         return False
-    else:
-        if arr1.ndim > 0:
-            arr1 = arr1[~nan_indices]
-            arr2 = arr2[~nan_indices]
-        elif nan_indices:
-            # 0-d arrays can't be indexed, so just check if the value is NaN
-            return True
-        return np.allclose(arr1, arr2, rtol=rtol, atol=atol)
+    if arr1.ndim > 0:
+        arr1 = arr1[~nan_indices]
+        arr2 = arr2[~nan_indices]
+    elif nan_indices:
+        # 0-d arrays can't be indexed, so just check if the value is NaN
+        return True
+    return np.allclose(arr1, arr2, rtol=rtol, atol=atol)

From 4956ffcb8fec8581d6ed2bbd9065353aba5a20ce Mon Sep 17 00:00:00 2001
From: Stephan Hoyer
Date: Thu, 27 Feb 2014 00:25:18 -0800
Subject: [PATCH 4/8] Serialize/de-serialize strings

---
 src/xray/backends.py     |  7 ++++
 src/xray/conventions.py  | 73 +++++++++++++++++++++++++++++++++++++++-
 test/test_conventions.py | 32 +++++++++++++++++-
 test/test_dataset.py     | 48 +++++++++++++------------
 4 files changed, 135 insertions(+), 25 deletions(-)

diff --git a/src/xray/backends.py b/src/xray/backends.py
index 3d60a30e400..9bbfba74d4e 100644
--- a/src/xray/backends.py
+++ b/src/xray/backends.py
@@ -31,6 +31,11 @@ def set_variables(self, variables):
         for vn, v in variables.iteritems():
             self.set_variable(vn, v)
 
+    def set_necessary_dimensions(self, variable):
+        for d, l in zip(variable.dimensions, variable.shape):
+            if d not in self.ds.dimensions:
+                self.set_dimension(d, l)
+
 
 class InMemoryDataStore(AbstractDataStore):
     """
@@ -113,6 +118,7 @@ def set_attribute(self, key, value):
     def set_variable(self, name, variable):
         variable = encode_cf_variable(variable)
         data = coerce_nc3_dtype(variable.data)
+        self.set_necessary_dimensions(variable)
         self.ds.createVariable(name, data.dtype, variable.dimensions)
         scipy_var = self.ds.variables[name]
         scipy_var[:] = data[:]
@@ -163,6 +169,7 @@ def set_attribute(self, key, value):
 
     def set_variable(self, name, variable):
         variable = encode_cf_variable(variable)
+        self.set_necessary_dimensions(variable)
         # netCDF4 will automatically assign a fill value
         # depending on the datatype of the variable.  Here
         # we let the package handle the _FillValue attribute
diff --git a/src/xray/conventions.py b/src/xray/conventions.py
index b645fb1d418..56f830ddd05 100644
--- a/src/xray/conventions.py
+++ b/src/xray/conventions.py
@@ -1,5 +1,6 @@
 import unicodedata
 
+import netCDF4 as nc4
 import numpy as np
 import pandas as pd
 
@@ -179,12 +180,74 @@ def __repr__(self):
                  self.scale_factor, self.add_offset))
 
 
+class CharToStringArray(object):
+    """Wrapper around array-like objects to create a new indexable object where
+    values, when accessed, are automatically concatenated along the last
+    dimension
+
+    >>> CharToStringArray(np.array(['a', 'b', 'c']))[:]
+    array('abc',
+          dtype='|S3')
+    """
+    def __init__(self, array):
+        """
+        Parameters
+        ----------
+        array : array-like
+            Original array of values to wrap.
+        """
+        self.array = array
+
+    @property
+    def dtype(self):
+        return np.dtype(str(self.array.dtype)[:2] + str(self.array.shape[-1]))
+
+    @property
+    def shape(self):
+        return self.array.shape[:-1]
+
+    @property
+    def size(self):
+        return np.prod(self.shape)
+
+    @property
+    def ndim(self):
+        return self.array.ndim - 1
+
+    def __len__(self):
+        if self.ndim > 0:
+            return len(self.array)
+        else:
+            raise TypeError('len() of unsized object')
+
+    def __str__(self):
+        if self.ndim == 0:
+            return str(self[...])
+        else:
+            return repr(self)
+
+    def __repr__(self):
+        return '%s(%r)' % (type(self).__name__, self.array)
+
+    def __array__(self):
+        return self[...]
+
+    def __getitem__(self, key):
+        # require slicing the last dimension completely
+        key = utils.expanded_indexer(key, self.array.ndim)
+        if key[-1] != slice(None):
+            raise IndexError('too many indices')
+        return nc4.chartostring(self.array[key])
+
+
 def encode_cf_variable(array):
     """Converts an XArray into an XArray suitable for saving as a netCDF
     variable
     """
+    dimensions = array.dimensions
     data = array.data
     attributes = array.attributes.copy()
+
     if isinstance(data, pd.DatetimeIndex):
         # DatetimeIndex objects need to be encoded into numeric arrays
         (data, units, calendar) = utils.datetimeindex2num(data)
@@ -208,6 +271,11 @@ def get_to(source, dest, k):
         dest[k] = v
         return v
 
+    # encode strings as character arrays
+    if np.issubdtype(data.dtype, (str, unicode)):
+        data = nc4.stringtochar(data)
+        dimensions = dimensions + ('string%s' % data.shape[-1],)
+
     # unscale/mask
     if any(k in encoding for k in ['add_offset', 'scale_factor']):
         data = np.array(data, dtype=float, copy=True)
@@ -234,7 +302,10 @@ def pop_to(source, dest, k):
 
     encoding = {'dtype': data.dtype}
 
-    if mask_and_scale:
+    if np.issubdtype(data.dtype, (str, unicode)):
+        dimensions = dimensions[:-1]
+        data = CharToStringArray(data)
+    elif mask_and_scale:
         fill_value = pop_to(attributes, encoding, '_FillValue')
         scale_factor = pop_to(attributes, encoding, 'scale_factor')
         add_offset = pop_to(attributes, encoding, 'add_offset')
diff --git a/test/test_conventions.py b/test/test_conventions.py
index 58daf09eae5..6a748bc6406 100644
--- a/test/test_conventions.py
+++ b/test/test_conventions.py
@@ -1,6 +1,6 @@
 import numpy as np
 
-from xray.conventions import MaskedAndScaledArray
+from xray.conventions import MaskedAndScaledArray, CharToStringArray
 from . import TestCase
 
 
@@ -23,3 +23,33 @@ def test(self):
         x = MaskedAndScaledArray(np.array([-99, -1, 0, 1, 2]), -99, 0.01, 1)
         expected = np.array([np.nan, 0.99, 1, 1.01, 1.02])
         self.assertArrayEqual(expected, x)
+
+
+class TestCharToStringArray(TestCase):
+    def test(self):
+        array = np.array(list('abc'))
+        actual = CharToStringArray(array)
+        expected = np.array('abc')
+        self.assertEqual(actual.dtype, expected.dtype)
+        self.assertEqual(actual.shape, expected.shape)
+        self.assertEqual(actual.size, expected.size)
+        self.assertEqual(actual.ndim, expected.ndim)
+        with self.assertRaises(TypeError):
+            len(actual)
+        self.assertArrayEqual(expected, actual)
+        with self.assertRaises(IndexError):
+            actual[:2]
+        self.assertEqual(str(actual), 'abc')
+
+        array = np.array([list('abc'), list('cdf')])
+        actual = CharToStringArray(array)
+        expected = np.array(['abc', 'cdf'])
+        self.assertEqual(actual.dtype, expected.dtype)
+        self.assertEqual(actual.shape, expected.shape)
+        self.assertEqual(actual.size, expected.size)
+        self.assertEqual(actual.ndim, expected.ndim)
+        self.assertEqual(len(actual), len(expected))
+        self.assertArrayEqual(expected, actual)
+        self.assertArrayEqual(expected[:1], actual[:1])
+        with self.assertRaises(IndexError):
+            actual[:, :2]
diff --git a/test/test_dataset.py b/test/test_dataset.py
index 28029a83590..7e476c6e73e 100644
--- a/test/test_dataset.py
+++ b/test/test_dataset.py
@@ -36,6 +36,9 @@ class DataTest(TestCase):
     def get_store(self):
         return backends.InMemoryDataStore()
 
+    def roundtrip(self, data):
+        return data.copy()
+
     def test_repr(self):
         data = create_test_data()
         self.assertEqual('
Date: Thu, 27 Feb 2014 14:23:18 -0800
Subject: [PATCH 5/8] More comprehensive Dataset read/write tests

Miscellaneous bugs in mask/scale serialization also fixed.
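
For reference, the mask/scale round-trip that these tests exercise
amounts to roughly the following sketch (illustrative only; the real
logic lives in MaskedAndScaledArray and encode_cf_variable and handles
more edge cases):

    import numpy as np

    def decode(data, fill_value=None, scale_factor=None, add_offset=None):
        # decoding: masked entries become NaN, then values are unpacked
        values = np.array(data, dtype=float, copy=True)
        if fill_value is not None:
            values[values == fill_value] = np.nan
        if scale_factor is not None:
            values *= scale_factor
        if add_offset is not None:
            values += add_offset
        return values

    def encode(values, fill_value, scale_factor, add_offset, dtype):
        # encoding: pack the values (rounding to avoid float truncation),
        # then replace NaN entries with the fill value
        data = np.round((np.asarray(values, dtype=float) - add_offset)
                        / scale_factor)
        data[np.isnan(data)] = fill_value
        return data.astype(dtype)

    # round-trip of the values used by create_masked_and_scaled_data()
    encoded = np.array([-1, -1, 0, 1, 2], dtype=np.int16)
    decoded = decode(encoded, fill_value=-1, scale_factor=0.1, add_offset=10)
    # decoded is now [nan, nan, 10.0, 10.1, 10.2]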
--- src/xray/backends.py | 2 +- src/xray/conventions.py | 30 ++++++++---- src/xray/dataset.py | 10 ++-- src/xray/dataset_array.py | 4 ++ test/test_dataset.py | 100 ++++++++++++++++++++++---------------- 5 files changed, 87 insertions(+), 59 deletions(-) diff --git a/src/xray/backends.py b/src/xray/backends.py index 9bbfba74d4e..c60a071947c 100644 --- a/src/xray/backends.py +++ b/src/xray/backends.py @@ -174,7 +174,7 @@ def set_variable(self, name, variable): # depending on the datatype of the variable. Here # we let the package handle the _FillValue attribute # instead of setting it ourselves. - fill_value = variable.encoding.get('_FillValue') + fill_value = variable.attributes.pop('_FillValue', None) self.ds.createVariable(varname=name, datatype=variable.dtype, dimensions=variable.dimensions, diff --git a/src/xray/conventions.py b/src/xray/conventions.py index 56f830ddd05..ca29a922f44 100644 --- a/src/xray/conventions.py +++ b/src/xray/conventions.py @@ -247,6 +247,7 @@ def encode_cf_variable(array): dimensions = array.dimensions data = array.data attributes = array.attributes.copy() + encoding = array.encoding if isinstance(data, pd.DatetimeIndex): # DatetimeIndex objects need to be encoded into numeric arrays @@ -277,7 +278,6 @@ def get_to(source, dest, k): dimensions = dimensions + ('string%s' % data.shape[-1],) # unscale/mask - encoding = array.encoding if any(k in encoding for k in ['add_offset', 'scale_factor']): data = np.array(data, dtype=float, copy=True) if 'add_offset' in encoding: @@ -285,27 +285,37 @@ def get_to(source, dest, k): if 'scale_factor' in encoding: data /= get_to(encoding, attributes, 'scale_factor') + # replace NaN with the fill value + if '_FillValue' in encoding: + if encoding['_FillValue'] is np.nan: + attributes['_FillValue'] = np.nan + else: + nans = np.isnan(data) + if nans.any(): + data[nans] = get_to(encoding, attributes, '_FillValue') + # restore original dtype if 'dtype' in encoding: data = data.astype(encoding['dtype']) - return xarray.XArray(array.dimensions, data, attributes, encoding) + return xarray.XArray(dimensions, data, attributes, encoding=encoding) def decode_cf_variable(dimensions, data, attributes, indexing_mode='numpy', mask_and_scale=True): - def pop_to(source, dest, k): - v = source.pop(k, None) - if v is not None: - dest[k] = v - return v - encoding = {'dtype': data.dtype} - if np.issubdtype(data.dtype, (str, unicode)): + # TODO: add some sort of check instead of just assuming that the last + # dimension on a character array is always the string dimension dimensions = dimensions[:-1] data = CharToStringArray(data) elif mask_and_scale: + def pop_to(source, dest, k): + v = source.pop(k, None) + if v is not None: + dest[k] = v + return v + fill_value = pop_to(attributes, encoding, '_FillValue') scale_factor = pop_to(attributes, encoding, 'scale_factor') add_offset = pop_to(attributes, encoding, 'add_offset') @@ -314,4 +324,6 @@ def pop_to(source, dest, k): data = MaskedAndScaledArray(data, fill_value, scale_factor, add_offset) + # TODO: decode arrays with time units into np.datetime64 objects here + return xarray.XArray(dimensions, data, attributes, indexing_mode, encoding) diff --git a/src/xray/dataset.py b/src/xray/dataset.py index d4ed0c7f4a3..0291174553b 100644 --- a/src/xray/dataset.py +++ b/src/xray/dataset.py @@ -316,25 +316,23 @@ def noncoordinates(self): def dump_to_store(self, store): """Store dataset contents to a backends.*DataStore object.""" - store.set_dimensions(self.dimensions) store.set_variables(self.variables) 
store.set_attributes(self.attributes) store.sync() - def dump(self, filepath, *args, **kwdargs): + def dump(self, filepath, **kwdargs): """Dump dataset contents to a location on disk using the netCDF4 package. """ - nc4_store = backends.NetCDF4DataStore(filepath, mode='w', - *args, **kwdargs) + nc4_store = backends.NetCDF4DataStore(filepath, mode='w', **kwdargs) self.dump_to_store(nc4_store) - def dumps(self): + def dumps(self, **kwargs): """Serialize dataset contents to a string. The serialization creates an in memory netcdf version 3 string using the scipy.io.netcdf package. """ fobj = StringIO() - scipy_store = backends.ScipyDataStore(fobj, mode='w') + scipy_store = backends.ScipyDataStore(fobj, mode='w', **kwargs) self.dump_to_store(scipy_store) return fobj.getvalue() diff --git a/src/xray/dataset_array.py b/src/xray/dataset_array.py index a12b624b203..748718b0d2c 100644 --- a/src/xray/dataset_array.py +++ b/src/xray/dataset_array.py @@ -129,6 +129,10 @@ def __iter__(self): def attributes(self): return self.array.attributes + @property + def encoding(self): + return self.array.encoding + @property def variables(self): return self.dataset.variables diff --git a/test/test_dataset.py b/test/test_dataset.py index 7e476c6e73e..92ab782b8df 100644 --- a/test/test_dataset.py +++ b/test/test_dataset.py @@ -32,13 +32,7 @@ def create_test_data(): return obj -class DataTest(TestCase): - def get_store(self): - return backends.InMemoryDataStore() - - def roundtrip(self, data): - return data.copy() - +class TestDataset(TestCase): def test_repr(self): data = create_test_data() self.assertEqual(' Date: Thu, 27 Feb 2014 17:34:04 -0800 Subject: [PATCH 6/8] CF Time coordinates are handled using encodings. Datasets now have an optional constructor argument which determines whether CF-variables are converted or stored raw. --- src/xray/backends.py | 31 ++++++++--------- src/xray/conventions.py | 49 ++++++++++++++++++++------- src/xray/dataset.py | 23 +++++++------ src/xray/xarray.py | 16 ++++----- test/test_dataset.py | 73 +++++++++++++++++++++++++++++++++++------ 5 files changed, 133 insertions(+), 59 deletions(-) diff --git a/src/xray/backends.py b/src/xray/backends.py index c60a071947c..31dfa64d3b7 100644 --- a/src/xray/backends.py +++ b/src/xray/backends.py @@ -5,17 +5,16 @@ """ # TODO: implement backend logic directly in OrderedDict subclasses, to allow # for directly manipulating Dataset.variables and the like? -import netCDF4 as nc4 import numpy as np -import pandas as pd +import netCDF4 as nc4 from scipy.io import netcdf from collections import OrderedDict import xarray -from conventions import (decode_cf_variable, encode_cf_variable, - is_valid_nc3_name, coerce_nc3_dtype) -from utils import FrozenOrderedDict, Frozen, datetimeindex2num + +from utils import FrozenOrderedDict, Frozen +from conventions import is_valid_nc3_name, coerce_nc3_dtype, encode_cf_variable class AbstractDataStore(object): @@ -72,16 +71,14 @@ class ScipyDataStore(AbstractDataStore): be initialized with a StringIO object, allow for serialization. 
""" - def __init__(self, filename_or_obj, mode='r', mmap=None, version=1, - mask_and_scale=True): - self.ds = netcdf.netcdf_file(filename_or_obj, mode=mode, mmap=None, + def __init__(self, filename_or_obj, mode='r', mmap=None, version=1): + self.ds = netcdf.netcdf_file(filename_or_obj, mode=mode, mmap=mmap, version=version) - self.mask_and_scale = mask_and_scale @property def variables(self): - return FrozenOrderedDict((k, decode_cf_variable(v.dimensions, v.data, - v._attributes)) + return FrozenOrderedDict((k, xarray.XArray(v.dimensions, v.data, + v._attributes)) for k, v in self.ds.variables.iteritems()) @property @@ -134,21 +131,20 @@ def sync(self): class NetCDF4DataStore(AbstractDataStore): + def __init__(self, filename, mode='r', clobber=True, diskless=False, - persist=False, format='NETCDF4', mask_and_scale=True): + persist=False, format='NETCDF4'): self.ds = nc4.Dataset(filename, mode=mode, clobber=clobber, diskless=diskless, persist=persist, format=format) - self.mask_and_scale = mask_and_scale @property def variables(self): def convert_variable(var): attr = OrderedDict((k, var.getncattr(k)) for k in var.ncattrs()) var.set_auto_maskandscale(False) - return decode_cf_variable( - var.dimensions, var, attr, indexing_mode='orthogonal', - mask_and_scale=self.mask_and_scale) + return xarray.XArray(var.dimensions, var, + attr, indexing_mode='orthogonal') return FrozenOrderedDict((k, convert_variable(v)) for k, v in self.ds.variables.iteritems()) @@ -159,7 +155,8 @@ def attributes(self): @property def dimensions(self): - return FrozenOrderedDict((k, len(v)) for k, v in self.ds.dimensions.iteritems()) + return FrozenOrderedDict((k, len(v)) + for k, v in self.ds.dimensions.iteritems()) def set_dimension(self, name, length): self.ds.createDimension(name, size=length) diff --git a/src/xray/conventions.py b/src/xray/conventions.py index ca29a922f44..cd18b988934 100644 --- a/src/xray/conventions.py +++ b/src/xray/conventions.py @@ -251,7 +251,9 @@ def encode_cf_variable(array): if isinstance(data, pd.DatetimeIndex): # DatetimeIndex objects need to be encoded into numeric arrays - (data, units, calendar) = utils.datetimeindex2num(data) + (data, units, calendar) = utils.datetimeindex2num(data, + units=encoding.get('units', None), + calendar=encoding.get('calendar', None)) attributes['units'] = units attributes['calendar'] = calendar elif data.dtype == np.dtype('O'): @@ -301,21 +303,36 @@ def get_to(source, dest, k): return xarray.XArray(dimensions, data, attributes, encoding=encoding) -def decode_cf_variable(dimensions, data, attributes, indexing_mode='numpy', - mask_and_scale=True): - encoding = {'dtype': data.dtype} +def decode_cf_variable(var, mask_and_scale=True): + data = var.data + dimensions = var.dimensions + attributes = var.attributes.copy() + encoding = var.encoding.copy() + indexing_mode = var._indexing_mode + + def pop_to(source, dest, k): + """ + A convenience function which pops a key k from source to dest. + None values are not passed on. If k already exists in dest an + error is raised. 
+ """ + v = source.pop(k, None) + if v is not None: + if k in dest: + raise ValueError("Failed hard to prevent overwriting key %s" % k) + dest[k] = v + return v + + if 'dtype' in encoding: + if not var.data.dtype == encoding.dtype: + raise ValueError("Refused to overwrite dtype") + encoding['dtype'] = data.dtype if np.issubdtype(data.dtype, (str, unicode)): # TODO: add some sort of check instead of just assuming that the last # dimension on a character array is always the string dimension dimensions = dimensions[:-1] data = CharToStringArray(data) elif mask_and_scale: - def pop_to(source, dest, k): - v = source.pop(k, None) - if v is not None: - dest[k] = v - return v - fill_value = pop_to(attributes, encoding, '_FillValue') scale_factor = pop_to(attributes, encoding, 'scale_factor') add_offset = pop_to(attributes, encoding, 'add_offset') @@ -323,7 +340,15 @@ def pop_to(source, dest, k): or add_offset is not None): data = MaskedAndScaledArray(data, fill_value, scale_factor, add_offset) - - # TODO: decode arrays with time units into np.datetime64 objects here + # TODO: How should multidimensional time variables be handled? + if (data.ndim == 1 and + 'units' in attributes and + 'since' in attributes['units']): + # convert times to datetime indices. We only do this if the dimension + # is one, since otherwise it can't be a coordinate. + units = pop_to(attributes, encoding, 'units') + calendar = pop_to(attributes, encoding, 'calendar') + data = utils.num2datetimeindex(data, units=units, + calendar=calendar) return xarray.XArray(dimensions, data, attributes, indexing_mode, encoding) diff --git a/src/xray/dataset.py b/src/xray/dataset.py index 0291174553b..926839c0cd2 100644 --- a/src/xray/dataset.py +++ b/src/xray/dataset.py @@ -17,7 +17,7 @@ num2date = nc4.num2date -def open_dataset(nc, *args, **kwargs): +def open_dataset(nc, decode_cf=True, *args, **kwargs): """Open the dataset given the object or path `nc`. *args and **kwargs provide format specific options @@ -32,7 +32,7 @@ def open_dataset(nc, *args, **kwargs): # If nc is a file-like object we read it using # the scipy.io.netcdf package store = backends.ScipyDataStore(nc, *args, **kwargs) - return Dataset.load_store(store) + return Dataset.load_store(store, decode_cf=decode_cf) # list of attributes of pd.DatetimeIndex that are ndarrays of time info @@ -101,7 +101,7 @@ class Dataset(Mapping): Note: the size of dimensions in a dataset cannot be changed. """ - def __init__(self, variables=None, attributes=None): + def __init__(self, variables=None, attributes=None, decode_cf=False): """To load data from a file or file-like object, use the `open_dataset` function. @@ -118,6 +118,7 @@ def __init__(self, variables=None, attributes=None): attributes : dict-like, optional Global attributes to save on this dataset. """ + self._decode_cf = decode_cf self._variables = _VariablesDict() self._dimensions = OrderedDict() if variables is not None: @@ -134,18 +135,16 @@ def _as_variable(self, name, var): raise TypeError('Dataset variables must be of type ' 'DatasetArray or XArray, or a sequence of the ' 'form (dimensions, data[, attributes])') - + # this will unmask and rescale the data as well as convert + # time variables to datetime indices. 
+ if self._decode_cf: + var = conventions.decode_cf_variable(var) if name in var.dimensions: # convert the coordinate into a pandas.Index if var.ndim != 1: raise ValueError('a coordinate variable must be defined with ' '1-dimensional data') - attr = var.attributes - if 'units' in attr and 'since' in attr['units']: - var.data = utils.num2datetimeindex(var.data, attr.pop('units'), - attr.pop('calendar', None)) - else: - var.data = pd.Index(var.data) + var.data = pd.Index(var.data) return var def _set_variables(self, variables): @@ -169,8 +168,8 @@ def _set_variables(self, variables): self._variables.update(new_variables) @classmethod - def load_store(cls, store): - return cls(store.variables, store.attributes) + def load_store(cls, store, decode_cf=True): + return cls(store.variables, store.attributes, decode_cf) @property def variables(self): diff --git a/src/xray/xarray.py b/src/xray/xarray.py index 693d0ae2420..e0e1bd40a5a 100644 --- a/src/xray/xarray.py +++ b/src/xray/xarray.py @@ -1,16 +1,16 @@ import functools -import warnings -from collections import OrderedDict -from itertools import izip - import numpy as np -import conventions -import dataset -import dataset_array -import groupby +from itertools import izip +from collections import OrderedDict + import ops import utils +import dataset +import groupby +import conventions +import dataset_array + from common import AbstractArray diff --git a/test/test_dataset.py b/test/test_dataset.py index 92ab782b8df..679cc4c04ab 100644 --- a/test/test_dataset.py +++ b/test/test_dataset.py @@ -13,14 +13,15 @@ _test_data_path = os.path.join(os.path.dirname(__file__), 'data') -_dims = {'dim1':100, 'dim2':50, 'dim3':10} -_vars = {'var1':['dim1', 'dim2'], - 'var2':['dim1', 'dim2'], - 'var3':['dim3', 'dim1'], +_dims = {'dim1': 100, 'dim2': 50, 'dim3': 10} +_vars = {'var1': ['dim1', 'dim2'], + 'var2': ['dim1', 'dim2'], + 'var3': ['dim3', 'dim1'], } _testvar = sorted(_vars.keys())[0] _testdim = sorted(_dims.keys())[0] + def create_test_data(): obj = Dataset() obj['time'] = ('time', pd.date_range('2000-01-01', periods=1000)) @@ -183,8 +184,7 @@ def test_labeled_by(self): loc_slicers = {'dim1': slice(None, None, 2), 'dim2': slice(0, 1)} self.assertEqual(data.indexed_by(**int_slicers), data.labeled_by(**loc_slicers)) - data['time'] = ('time', np.arange(1000, dtype=np.int32), - {'units': 'days since 2000-01-01'}) + data['time'] = ('time', pd.date_range('2000-01-01', periods=1000)) self.assertEqual(data.indexed_by(time=0), data.labeled_by(time='2000-01-01')) self.assertEqual(data.indexed_by(time=slice(10)), @@ -274,8 +274,7 @@ def test_merge(self): def test_getitem(self): data = create_test_data() - data['time'] = ('time', np.arange(1000, dtype=np.int32), - {'units': 'days since 2000-01-01'}) + data['time'] = ('time', pd.date_range('2000-01-01', periods=1000)) self.assertIsInstance(data['var1'], DatasetArray) self.assertXArrayEqual(data['var1'], data.variables['var1']) self.assertItemsEqual(data['var1'].dataset.variables, @@ -370,7 +369,7 @@ def test_roundtrip_mask_and_scale(self): def test_roundtrip_mask_and_scale_False(self): expected = create_masked_and_scaled_data() - actual = self.roundtrip(expected, mask_and_scale=False) + actual = self.roundtrip(expected, decode_cf=False) self.assertDatasetEqual(expected, actual) def test_roundtrip_example_1_netcdf(self): @@ -380,7 +379,7 @@ def test_roundtrip_example_1_netcdf(self): def test_encoded_masked_and_scaled_data(self): original = create_encoded_masked_and_scaled_data() - actual = self.roundtrip(original, 
mask_and_scale=True) + actual = self.roundtrip(original, decode_cf=True) expected = create_masked_and_scaled_data() self.assertDatasetEqual(expected, actual) @@ -403,6 +402,60 @@ def roundtrip(self, data, **kwargs): os.remove(tmp_file) return roundtrip_data + def test_open_encodings(self): + # Create a netCDF file with explicit time units + # and make sure it makes it into the encodings + # and survives a round trip + f, tmp_file = tempfile.mkstemp(suffix='.nc') + os.close(f) + + ds = nc4.Dataset(tmp_file, 'w') + ds.createDimension('time', size=10) + ds.createVariable('time', np.int32, dimensions=('time',)) + units = 'days since 1999-01-01' + ds.variables['time'].setncattr('units', units) + ds.variables['time'][:] = np.arange(10) + 4 + ds.close() + + expected = Dataset() + expected['time'] = ('time', pd.date_range('1999-01-05', periods=10)) + expected['time'].encoding['units'] = units + expected['time'].encoding['dtype'] = np.dtype('int32') + + actual = open_dataset(tmp_file) + + self.assertXArrayEqual(actual['time'], expected['time']) + self.assertDictEqual(actual['time'].encoding, expected['time'].encoding) + + os.remove(tmp_file) + + def test_dump_and_open_encodings(self): + # Create a netCDF file with explicit time units + # and make sure it makes it into the encodings + # and survives a round trip + f, tmp_file = tempfile.mkstemp(suffix='.nc') + os.close(f) + + ds = nc4.Dataset(tmp_file, 'w') + ds.createDimension('time', size=10) + ds.createVariable('time', np.int32, dimensions=('time',)) + units = 'days since 1999-01-01' + ds.variables['time'].setncattr('units', units) + ds.variables['time'][:] = np.arange(10) + 4 + ds.close() + + xray_dataset = open_dataset(tmp_file) + os.remove(tmp_file) + xray_dataset.dump(tmp_file) + + ds = nc4.Dataset(tmp_file, 'r') + + self.assertEqual(ds.variables['time'].getncattr('units'), units) + self.assertArrayEqual(ds.variables['time'], np.arange(10) + 4) + + ds.close() + os.remove(tmp_file) + def test_mask_and_scale(self): f, tmp_file = tempfile.mkstemp(suffix='.nc') os.close(f) From b9b4b0f184b3293e062df6da8cc87d208582e3a8 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Fri, 28 Feb 2014 10:13:16 -0800 Subject: [PATCH 7/8] Fixed my issues with Alex's commit --- src/xray/backends.py | 2 +- src/xray/conventions.py | 11 ++++----- src/xray/dataset.py | 49 +++++++++++++++++++++++++++++------------ src/xray/utils.py | 13 +++++++++-- src/xray/xarray.py | 28 +++++++++++------------ test/test_dataset.py | 36 ++++++++++++++---------------- test/test_utils.py | 18 +++++++-------- 7 files changed, 91 insertions(+), 66 deletions(-) diff --git a/src/xray/backends.py b/src/xray/backends.py index 31dfa64d3b7..61d0dbded88 100644 --- a/src/xray/backends.py +++ b/src/xray/backends.py @@ -78,7 +78,7 @@ def __init__(self, filename_or_obj, mode='r', mmap=None, version=1): @property def variables(self): return FrozenOrderedDict((k, xarray.XArray(v.dimensions, v.data, - v._attributes)) + v._attributes)) for k, v in self.ds.variables.iteritems()) @property diff --git a/src/xray/conventions.py b/src/xray/conventions.py index cd18b988934..773bd5bf188 100644 --- a/src/xray/conventions.py +++ b/src/xray/conventions.py @@ -298,6 +298,8 @@ def get_to(source, dest, k): # restore original dtype if 'dtype' in encoding: + if np.issubdtype(encoding['dtype'], int): + data = data.round() data = data.astype(encoding['dtype']) return xarray.XArray(dimensions, data, attributes, encoding=encoding) @@ -308,7 +310,6 @@ def decode_cf_variable(var, mask_and_scale=True): dimensions = 
var.dimensions attributes = var.attributes.copy() encoding = var.encoding.copy() - indexing_mode = var._indexing_mode def pop_to(source, dest, k): """ @@ -324,7 +325,7 @@ def pop_to(source, dest, k): return v if 'dtype' in encoding: - if not var.data.dtype == encoding.dtype: + if var.data.dtype != encoding['dtype']: raise ValueError("Refused to overwrite dtype") encoding['dtype'] = data.dtype if np.issubdtype(data.dtype, (str, unicode)): @@ -342,8 +343,8 @@ def pop_to(source, dest, k): add_offset) # TODO: How should multidimensional time variables be handled? if (data.ndim == 1 and - 'units' in attributes and - 'since' in attributes['units']): + 'units' in attributes and + 'since' in attributes['units']): # convert times to datetime indices. We only do this if the dimension # is one, since otherwise it can't be a coordinate. units = pop_to(attributes, encoding, 'units') @@ -351,4 +352,4 @@ def pop_to(source, dest, k): data = utils.num2datetimeindex(data, units=units, calendar=calendar) - return xarray.XArray(dimensions, data, attributes, indexing_mode, encoding) + return xarray.XArray(dimensions, data, attributes, encoding=encoding) diff --git a/src/xray/dataset.py b/src/xray/dataset.py index 926839c0cd2..1544d87c6bf 100644 --- a/src/xray/dataset.py +++ b/src/xray/dataset.py @@ -117,17 +117,18 @@ def __init__(self, variables=None, attributes=None, decode_cf=False): `pandas.Index` objects. attributes : dict-like, optional Global attributes to save on this dataset. + decode_cf : bool, optional + Whether to decode these variables according to CF conventions. """ - self._decode_cf = decode_cf self._variables = _VariablesDict() self._dimensions = OrderedDict() if variables is not None: - self._set_variables(variables) + self.set_variables(variables, decode_cf=decode_cf) if attributes is None: attributes = {} self._attributes = OrderedDict(attributes) - def _as_variable(self, name, var): + def _as_variable(self, name, var, decode_cf=False): if not isinstance(var, xarray.XArray): try: var = xarray.XArray(*var) @@ -137,23 +138,43 @@ def _as_variable(self, name, var): 'form (dimensions, data[, attributes])') # this will unmask and rescale the data as well as convert # time variables to datetime indices. - if self._decode_cf: + if decode_cf: var = conventions.decode_cf_variable(var) if name in var.dimensions: # convert the coordinate into a pandas.Index if var.ndim != 1: raise ValueError('a coordinate variable must be defined with ' '1-dimensional data') - var.data = pd.Index(var.data) + # create a new XArray object on which to modify the data + var = xarray.XArray(var.dimensions, pd.Index(var.data), + var.attributes, encoding=var.encoding) return var - def _set_variables(self, variables): - """Set a mapping of variables and update dimensions""" + def set_variables(self, variables, decode_cf=False): + """Set a mapping of variables and update dimensions. + + Parameters + ---------- + variables : dict-like, optional + A mapping from variable names to `XArray` objects or sequences of + the form `(dimensions, data[, attributes])` which can be used as + arguments to create a new `XArray`. Each dimension must have the + same length in all variables in which it appears. One dimensional + variables with name equal to their dimension are coordinate + variables, which means they are saved in the dataset as + `pandas.Index` objects. + decode_cf : bool, optional + Whether to decode these variables according to CF conventions. 
+ + Returns + ------- + None + """ # save new variables into a temporary list so all the error checking # can be done before updating _variables new_variables = [] for k, var in variables.iteritems(): - var = self._as_variable(k, var) + var = self._as_variable(k, var, decode_cf=decode_cf) for dim, size in zip(var.dimensions, var.shape): if dim not in self._dimensions: self._dimensions[dim] = size @@ -169,7 +190,7 @@ def _set_variables(self, variables): @classmethod def load_store(cls, store, decode_cf=True): - return cls(store.variables, store.attributes, decode_cf) + return cls(store.variables, store.attributes, decode_cf=decode_cf) @property def variables(self): @@ -264,7 +285,7 @@ def __setitem__(self, key, value): # TODO: should remove key from this dataset if it already exists self.merge(value.renamed(key).dataset, inplace=True) else: - self._set_variables({key: value}) + self.set_variables({key: value}) def __delitem__(self, key): """Remove a variable from this dataset. @@ -480,7 +501,7 @@ def renamed(self, name_dict): #TODO: public interface for renaming a variable without loading # data? variables[name] = xarray.XArray(dims, v._data, v.attributes, - v._indexing_mode) + v.encoding, v._indexing_mode) return type(self)(variables, self.attributes) @@ -514,9 +535,9 @@ def merge(self, other, inplace=False): compat=utils.xarray_equal) # update contents obj = self if inplace else self.copy() - obj._set_variables(OrderedDict((k, v) for k, v - in other.variables.iteritems() - if k not in obj.variables)) + obj.set_variables(OrderedDict((k, v) for k, v + in other.variables.iteritems() + if k not in obj.variables)) # remove conflicting attributes for k, v in other.attributes.iteritems(): if k in self.attributes and v != self.attributes[k]: diff --git a/src/xray/utils.py b/src/xray/utils.py index c8cf26d26e1..a13812e242d 100644 --- a/src/xray/utils.py +++ b/src/xray/utils.py @@ -113,6 +113,7 @@ def num2datetimeindex(num_dates, units, calendar=None): For standard (Gregorian) calendars, this function uses vectorized operations, which makes it much faster than netCDF4.num2date. """ + # TODO: fix this function so it works on arbitrary n-dimensional arrays num_dates = np.asarray(num_dates) if calendar is None: calendar = 'standard' @@ -257,14 +258,15 @@ def remove_incompatible_items(first_dict, second_dict, compat=operator.eq): if k in first_dict and not compat(v, first_dict[k]): del first_dict[k] + def dict_equal(first, second): - """ Test equality of two dict-like objects. If any of the values + """Test equality of two dict-like objects. If any of the values are numpy arrays, compare them for equality correctly. Parameters ---------- first, second : dict-like - dictionaries to compare for equality + Dictionaries to compare for equality Returns ------- @@ -285,6 +287,13 @@ def dict_equal(first, second): return False elif v1 != v2: return False + if isinstance(v1, np.ndarray) != isinstance(v2, np.ndarray): + return False # one is an ndarray, other is not + elif (isinstance(v1, np.ndarray) and isinstance(v2, np.ndarray)): + if not np.array_equal(v1, v2): + return False + elif v1 != v2: + return False return True def ordered_dict_intersection(first_dict, second_dict, compat=operator.eq): diff --git a/src/xray/xarray.py b/src/xray/xarray.py index e0e1bd40a5a..41ae20867a6 100644 --- a/src/xray/xarray.py +++ b/src/xray/xarray.py @@ -38,8 +38,8 @@ class XArray(AbstractArray): outside the context of its parent Dataset (if you want such a fully described object, use a DatasetArray instead). 
""" - def __init__(self, dims, data, attributes=None, indexing_mode='numpy', - encoding=None): + def __init__(self, dims, data, attributes=None, encoding=None, + indexing_mode='numpy'): """ Parameters ---------- @@ -52,6 +52,11 @@ def __init__(self, dims, data, attributes=None, indexing_mode='numpy', attributes : dict_like or None, optional Attributes to assign to the new variable. If None (default), an empty attribute dictionary is initialized. + encoding : dict_like or None, optional + Dictionary specifying how to encode this array's data into a + serialized format like netCDF4. Currently used keys (for netCDF) + include '_FillValue', 'scale_factor', 'add_offset' and 'dtype'. + Well behaviored code to serialize an XArray should ignore indexing_mode : {'numpy', 'orthogonal'} String indicating how the data parameter handles fancy indexing (with arrays). Two modes are supported: 'numpy' (fancy indexing @@ -60,11 +65,6 @@ def __init__(self, dims, data, attributes=None, indexing_mode='numpy', variables). Accessing data from an XArray always uses orthogonal indexing, so `indexing_mode` tells the variable whether index lookups need to be internally converted to numpy-style indexing. - encoding : dict_like or None, optional - Dictionary specifying how to encode this array's data into a - serialized format like netCDF4. Currently used keys (for netCDF) - include '_FillValue', 'scale_factor', 'add_offset' and 'dtype'. - Well behaviored code to serialize an XArray should ignore unrecognized keys in this dictionary. """ if isinstance(dims, basestring): @@ -77,8 +77,8 @@ def __init__(self, dims, data, attributes=None, indexing_mode='numpy', if attributes is None: attributes = {} self._attributes = OrderedDict(attributes) - self._indexing_mode = indexing_mode self.encoding = dict({} if encoding is None else encoding) + self._indexing_mode = indexing_mode @property def data(self): @@ -157,8 +157,7 @@ def __getitem__(self, key): # return a variable with the same indexing_mode, because data should # still be the same type as _data return type(self)(dimensions, new_data, self.attributes, - indexing_mode=self._indexing_mode, - encoding=self.encoding) + self.encoding, self._indexing_mode) def __setitem__(self, key, value): """__setitem__ is overloaded to access the underlying numpy data with @@ -191,7 +190,7 @@ def _copy(self, deepcopy=False): # dimensions is already an immutable tuple # attributes will be copied when the new Array is created return type(self)(self.dimensions, data, self.attributes, - encoding=self.encoding) + self.encoding) def __copy__(self): return self._copy(deepcopy=False) @@ -284,8 +283,7 @@ def transpose(self, *dimensions): dimensions = self.dimensions[::-1] axes = [self.dimensions.index(dim) for dim in dimensions] data = self.data.transpose(*axes) - return type(self)(dimensions, data, self.attributes, - encoding=self.encoding) + return type(self)(dimensions, data, self.attributes, self.encoding) def reduce(self, func, dimension=None, axis=None, **kwargs): """Reduce this array by applying `func` along some dimension(s). 
@@ -572,13 +570,13 @@ def broadcast_xarrays(first, second): # adding second's dimensions at the end first_data = first.data[(Ellipsis,) + (None,) * len(second_only_dims)] new_first = XArray(dimensions, first_data, first.attributes, - encoding=first.encoding) + first.encoding) # expand and reorder second_data so the dimensions line up first_only_dims = [d for d in dimensions if d not in second.dimensions] second_dims = list(second.dimensions) + first_only_dims second_data = second.data[(Ellipsis,) + (None,) * len(first_only_dims)] new_second = XArray(second_dims, second_data, first.attributes, - encoding=second.encoding).transpose(*dimensions) + second.encoding).transpose(*dimensions) return new_first, new_second diff --git a/test/test_dataset.py b/test/test_dataset.py index 679cc4c04ab..dd7a46f0bf4 100644 --- a/test/test_dataset.py +++ b/test/test_dataset.py @@ -328,13 +328,14 @@ def test_to_dataframe(self): def create_masked_and_scaled_data(): x = np.array([np.nan, np.nan, 10, 10.1, 10.2]) - encoding = {'_FillValue': -1, 'add_offset': 10, 'scale_factor': 0.1, - 'dtype': np.int16} + encoding = {'_FillValue': -1, 'add_offset': 10, + 'scale_factor': np.float32(0.1), 'dtype': np.int16} return Dataset({'x': ('t', x, {}, encoding)}) def create_encoded_masked_and_scaled_data(): - attributes = {'_FillValue': -1, 'add_offset': 10, 'scale_factor': 0.1} + attributes = {'_FillValue': -1, 'add_offset': 10, + 'scale_factor': np.float32(0.1)} return Dataset({'x': XArray('t', [-1, -1, 0, 1, 2], attributes)}) @@ -363,26 +364,20 @@ def test_roundtrip_string_data(self): self.assertDatasetEqual(expected, actual) def test_roundtrip_mask_and_scale(self): - expected = create_masked_and_scaled_data() - actual = self.roundtrip(expected) - self.assertDatasetEqual(expected, actual) - - def test_roundtrip_mask_and_scale_False(self): - expected = create_masked_and_scaled_data() - actual = self.roundtrip(expected, decode_cf=False) - self.assertDatasetEqual(expected, actual) + decoded = create_masked_and_scaled_data() + encoded = create_encoded_masked_and_scaled_data() + self.assertDatasetEqual(decoded, self.roundtrip(decoded)) + self.assertDatasetEqual(encoded, + self.roundtrip(decoded, decode_cf=False)) + self.assertDatasetEqual(decoded, self.roundtrip(encoded)) + self.assertDatasetEqual(encoded, + self.roundtrip(encoded, decode_cf=False)) def test_roundtrip_example_1_netcdf(self): expected = open_dataset(os.path.join(_test_data_path, 'example_1.nc')) actual = self.roundtrip(expected) self.assertDatasetEqual(expected, actual) - def test_encoded_masked_and_scaled_data(self): - original = create_encoded_masked_and_scaled_data() - actual = self.roundtrip(original, decode_cf=True) - expected = create_masked_and_scaled_data() - self.assertDatasetEqual(expected, actual) - class NetCDF4DataTest(DatasetIOCases, TestCase): def get_store(self): @@ -418,9 +413,10 @@ def test_open_encodings(self): ds.close() expected = Dataset() - expected['time'] = ('time', pd.date_range('1999-01-05', periods=10)) - expected['time'].encoding['units'] = units - expected['time'].encoding['dtype'] = np.dtype('int32') + + time = pd.date_range('1999-01-05', periods=10) + encoding = {'units': units, 'dtype': np.dtype('int32')} + expected['time'] = ('time', time, {}, encoding) actual = open_dataset(tmp_file) diff --git a/test/test_utils.py b/test/test_utils.py index 646484bdc6a..d1bf68bd5e7 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -107,25 +107,25 @@ def test_ordered_dict_intersection(self): def test_dict_equal(self): x = 
OrderedDict() x['a'] = 3 - x['b'] = np.array([1,2,3]) + x['b'] = np.array([1, 2, 3]) y = OrderedDict() - y['b'] = np.array([1.0,2.0,3.0]) + y['b'] = np.array([1.0, 2.0, 3.0]) y['a'] = 3 self.assertTrue(utils.dict_equal(x, y)) # two nparrays are equal - y['b'] = [1,2,3] # np.array not the same as a list + y['b'] = [1, 2, 3] # np.array not the same as a list self.assertFalse(utils.dict_equal(x, y)) # nparray != list - x['b'] = [1.0,2.0,3.0] - self.assertTrue(utils.dict_equal(x,y)) # list vs. list + x['b'] = [1.0, 2.0, 3.0] + self.assertTrue(utils.dict_equal(x, y)) # list vs. list x['c'] = None - self.assertFalse(utils.dict_equal(x,y)) # new key in x + self.assertFalse(utils.dict_equal(x, y)) # new key in x x['c'] = np.nan y['c'] = np.nan - self.assertFalse(utils.dict_equal(x,y)) # as intended, nan != nan + self.assertFalse(utils.dict_equal(x, y)) # as intended, nan != nan x['c'] = np.inf y['c'] = np.inf - self.assertTrue(utils.dict_equal(x,y)) # inf == inf + self.assertTrue(utils.dict_equal(x, y)) # inf == inf y = dict(y) - self.assertTrue(utils.dict_equal(x,y)) # different dictionary types are fine + self.assertTrue(utils.dict_equal(x, y)) # different dictionary types are fine def test_frozen(self): x = utils.Frozen(self.x) From c647ed6e0ab1eea408e0264074d2a3efc091ee2f Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Fri, 28 Feb 2014 13:41:21 -0800 Subject: [PATCH 8/8] Reverted an accidental change to utils.dict_equal --- src/xray/utils.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/xray/utils.py b/src/xray/utils.py index a13812e242d..3579827ff39 100644 --- a/src/xray/utils.py +++ b/src/xray/utils.py @@ -287,13 +287,6 @@ def dict_equal(first, second): return False elif v1 != v2: return False - if isinstance(v1, np.ndarray) != isinstance(v2, np.ndarray): - return False # one is an ndarray, other is not - elif (isinstance(v1, np.ndarray) and isinstance(v2, np.ndarray)): - if not np.array_equal(v1, v2): - return False - elif v1 != v2: - return False return True def ordered_dict_intersection(first_dict, second_dict, compat=operator.eq):
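With the accidental duplication backed out, dict_equal behaves exactly as
test_dict_equal above describes. A short usage sketch (the import path is
assumed for illustration):

    import numpy as np
    from collections import OrderedDict

    from xray import utils  # assumed import path

    x = OrderedDict([('a', 3), ('b', np.array([1, 2, 3]))])
    y = OrderedDict([('b', np.array([1.0, 2.0, 3.0])), ('a', 3)])
    assert utils.dict_equal(x, y)       # ndarray values compare elementwise
    y['b'] = [1, 2, 3]
    assert not utils.dict_equal(x, y)   # ndarray vs. list is a mismatch
    x['b'] = [1.0, 2.0, 3.0]
    assert utils.dict_equal(x, y)       # list vs. list compares fine
    x['c'] = y['c'] = np.nan
    assert not utils.dict_equal(x, y)   # nan != nan, by design

Keeping NaN-valued attributes unequal is deliberate: it matches Python's own
nan != nan semantics.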