diff --git a/src/xray/backends.py b/src/xray/backends.py index 3a67f1affea..61d0dbded88 100644 --- a/src/xray/backends.py +++ b/src/xray/backends.py @@ -5,16 +5,16 @@ """ # TODO: implement backend logic directly in OrderedDict subclasses, to allow # for directly manipulating Dataset.variables and the like? -import netCDF4 as nc4 import numpy as np -import pandas as pd +import netCDF4 as nc4 from scipy.io import netcdf from collections import OrderedDict import xarray -import conventions -from utils import FrozenOrderedDict, Frozen, datetimeindex2num + +from utils import FrozenOrderedDict, Frozen +from conventions import is_valid_nc3_name, coerce_nc3_dtype, encode_cf_variable class AbstractDataStore(object): @@ -30,6 +30,11 @@ def set_variables(self, variables): for vn, v in variables.iteritems(): self.set_variable(vn, v) + def set_necessary_dimensions(self, variable): + for d, l in zip(variable.dimensions, variable.shape): + if d not in self.ds.dimensions: + self.set_dimension(d, l) + class InMemoryDataStore(AbstractDataStore): """ @@ -59,36 +64,6 @@ def sync(self): pass -def convert_to_cf_variable(array): - """Converts an XArray into an XArray suitable for saving as a netCDF - variable - """ - data = array.data - attributes = array.attributes.copy() - if isinstance(data, pd.DatetimeIndex): - # DatetimeIndex objects need to be encoded into numeric arrays - (data, units, calendar) = datetimeindex2num(data) - attributes['units'] = units - attributes['calendar'] = calendar - elif data.dtype == np.dtype('O'): - # Unfortunately, pandas.Index arrays often have dtype=object even if - # they were created from an array with a sensible datatype (e.g., - # pandas.Float64Index always has dtype=object for some reason). Because - # we allow for doing math with coordinates, these object arrays can - # propagate onward to other variables, which is why we don't only apply - # this check to XArrays with data that is a pandas.Index. - dtype = np.array(data.reshape(-1)[0]).dtype - # N.B. the "astype" call will fail if data cannot be cast to the type - # of its first element (which is probably the only sensible thing to - # do). - data = np.asarray(data).astype(dtype) - return xarray.XArray(array.dimensions, data, attributes) - - -def convert_scipy_variable(var): - return xarray.XArray(var.dimensions, var.data, var._attributes) - - class ScipyDataStore(AbstractDataStore): """ Stores data using the scipy.io.netcdf package. @@ -96,12 +71,14 @@ class ScipyDataStore(AbstractDataStore): be initialized with a StringIO object, allow for serialization. """ - def __init__(self, fobj, *args, **kwdargs): - self.ds = netcdf.netcdf_file(fobj, *args, **kwdargs) + def __init__(self, filename_or_obj, mode='r', mmap=None, version=1): + self.ds = netcdf.netcdf_file(filename_or_obj, mode=mode, mmap=mmap, + version=version) @property def variables(self): - return FrozenOrderedDict((k, convert_scipy_variable(v)) + return FrozenOrderedDict((k, xarray.XArray(v.dimensions, v.data, + v._attributes)) for k, v in self.ds.variables.iteritems()) @property @@ -119,30 +96,16 @@ def set_dimension(self, name, length): self.ds.createDimension(name, length) def _validate_attr_key(self, key): - if not conventions.is_valid_name(key): + if not is_valid_nc3_name(key): raise ValueError("Not a valid attribute name") def _cast_attr_value(self, value): - # Strings get special handling because netCDF treats them as - # character arrays. Everything else gets coerced to a numpy - # vector. netCDF treats scalars as 1-element vectors. 
Arrays of - # non-numeric type are not allowed. if isinstance(value, basestring): - # netcdf attributes should be unicode value = unicode(value) else: - try: - value = conventions.coerce_type(np.atleast_1d(np.asarray(value))) - except: - raise ValueError("Not a valid value for a netCDF attribute") + value = coerce_nc3_dtype(np.atleast_1d(value)) if value.ndim > 1: - raise ValueError("netCDF attributes must be vectors " + - "(1-dimensional)") - value = conventions.coerce_type(value) - if str(value.dtype) not in conventions.TYPEMAP: - # A plain string attribute is okay, but an array of - # string objects is not okay! - raise ValueError("Can not convert to a valid netCDF type") + raise ValueError("netCDF attributes must be 1-dimensional") return value def set_attribute(self, key, value): @@ -150,11 +113,9 @@ def set_attribute(self, key, value): setattr(self.ds, key, self._cast_attr_value(value)) def set_variable(self, name, variable): - variable = convert_to_cf_variable(variable) - data = variable.data - dtype_convert = {'int64': 'int32', 'float64': 'float32'} - if str(data.dtype) in dtype_convert: - data = np.asarray(data, dtype=dtype_convert[str(data.dtype)]) + variable = encode_cf_variable(variable) + data = coerce_nc3_dtype(variable.data) + self.set_necessary_dimensions(variable) self.ds.createVariable(name, data.dtype, variable.dimensions) scipy_var = self.ds.variables[name] scipy_var[:] = data[:] @@ -169,31 +130,22 @@ def sync(self): self.ds.flush() -def convert_nc4_variable(var): - # we don't want to see scale_factor and add_offset in the attributes - # since the netCDF4 package automatically scales the data on read. - # If we kept scale_factor and add_offset around and did this: - # - # foo = ncdf4.Dataset('foo.nc') - # ncdf4.dump(foo, 'bar.nc') - # bar = ncdf4.Dataset('bar.nc') - # - # you would find that any packed variables in the original - # netcdf file would now have been scaled twice! 
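# A minimal sketch of the pitfall described above, using the public netCDF4
# API; the file and variable names are hypothetical. The new NetCDF4DataStore
# further below sidesteps the problem by calling set_auto_maskandscale(False)
# and handling packed values explicitly:

import netCDF4 as nc4

ds = nc4.Dataset('foo.nc')
var = ds.variables['packed']     # a variable with scale_factor/add_offset
var.set_auto_maskandscale(False)
raw = var[:]                     # raw packed values; no implicit scaling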
- attr = OrderedDict((k, var.getncattr(k)) for k in var.ncattrs() - if k not in ['scale_factor', 'add_offset']) - return xarray.XArray(var.dimensions, var, attr, indexing_mode='orthogonal') - - class NetCDF4DataStore(AbstractDataStore): - def __init__(self, filename, *args, **kwdargs): - # TODO: set auto_maskandscale=True so we can handle the array - # packing/unpacking ourselves (using NaN instead of masked arrays) - self.ds = nc4.Dataset(filename, *args, **kwdargs) + + def __init__(self, filename, mode='r', clobber=True, diskless=False, + persist=False, format='NETCDF4'): + self.ds = nc4.Dataset(filename, mode=mode, clobber=clobber, + diskless=diskless, persist=persist, + format=format) @property def variables(self): - return FrozenOrderedDict((k, convert_nc4_variable(v)) + def convert_variable(var): + attr = OrderedDict((k, var.getncattr(k)) for k in var.ncattrs()) + var.set_auto_maskandscale(False) + return xarray.XArray(var.dimensions, var, + attr, indexing_mode='orthogonal') + return FrozenOrderedDict((k, convert_variable(v)) for k, v in self.ds.variables.iteritems()) @property @@ -203,7 +155,8 @@ def attributes(self): @property def dimensions(self): - return FrozenOrderedDict((k, len(v)) for k, v in self.ds.dimensions.iteritems()) + return FrozenOrderedDict((k, len(v)) + for k, v in self.ds.dimensions.iteritems()) def set_dimension(self, name, length): self.ds.createDimension(name, size=length) @@ -211,13 +164,9 @@ def set_dimension(self, name, length): def set_attribute(self, key, value): self.ds.setncatts({key: value}) - def _cast_data(self, data): - if isinstance(data, pd.DatetimeIndex): - data = datetimeindex2num(data) - return data - def set_variable(self, name, variable): - variable = convert_to_cf_variable(variable) + variable = encode_cf_variable(variable) + self.set_necessary_dimensions(variable) # netCDF4 will automatically assign a fill value # depending on the datatype of the variable. Here # we let the package handle the _FillValue attribute @@ -228,6 +177,7 @@ def set_variable(self, name, variable): dimensions=variable.dimensions, fill_value=fill_value) nc4_var = self.ds.variables[name] + nc4_var.set_auto_maskandscale(False) nc4_var[:] = variable.data[:] nc4_var.setncatts(variable.attributes) diff --git a/src/xray/common.py b/src/xray/common.py index de4035fef7e..36d7032a122 100644 --- a/src/xray/common.py +++ b/src/xray/common.py @@ -35,19 +35,19 @@ def __len__(self): return len(self._data) def __nonzero__(self): - return bool(self._data) + return bool(self.data) def __float__(self): - return float(self._data) + return float(self.data) def __int__(self): - return int(self._data) + return int(self.data) def __complex__(self): - return complex(self._data) + return complex(self.data) def __long__(self): - return long(self._data) + return long(self.data) # adapted from pandas.NDFrame # https://github.com/pydata/pandas/blob/master/pandas/core/generic.py#L699 diff --git a/src/xray/conventions.py b/src/xray/conventions.py index 637aa5943b4..773bd5bf188 100644 --- a/src/xray/conventions.py +++ b/src/xray/conventions.py @@ -1,41 +1,12 @@ -import numpy as np import unicodedata -NULL = '\x00' -NC_BYTE = '\x00\x00\x00\x01' -NC_CHAR = '\x00\x00\x00\x02' -NC_SHORT = '\x00\x00\x00\x03' -# netCDF-3 only supports 32-bit integers -NC_INT = '\x00\x00\x00\x04' -NC_FLOAT = '\x00\x00\x00\x05' -NC_DOUBLE = '\x00\x00\x00\x06' - -# Map between netCDF type and numpy dtype and vice versa. 
Due to a bug
-# in the __hash__() method of numpy dtype objects (fixed in development
-# release of numpy), we need to explicitly match byteorder for dict
-# lookups to succeed. Here we normalize to native byte order.
-#
-# NC_CHAR is a special case because netCDF represents strings as
-# character arrays. When NC_CHAR is encountered as the type of an
-# attribute value, this TYPEMAP is not consulted and the data is read
-# as a string. However, when NC_CHAR is encountered as the type of a
-# variable, then the data is read is a numpy array of 1-char elements
-# (equivalently, length-1 raw "strings"). There is no support for numpy
-# arrays of multi-character strings.
-TYPEMAP = {
-    # we could use np.dtype's as key/values except __hash__ comparison of
-    # numpy.dtype is broken in older versions of numpy. If you must compare
-    # and cannot upgrade, use __eq__.This bug is
-    # known to be fixed in numpy version 1.3
-    NC_BYTE: 'int8',
-    NC_CHAR: '|S1',
-    NC_SHORT: 'int16',
-    NC_INT: 'int32',
-    NC_FLOAT: 'float32',
-    NC_DOUBLE: 'float64',
-    }
-for k in TYPEMAP.keys():
-    TYPEMAP[TYPEMAP[k]] = k
+import netCDF4 as nc4
+import numpy as np
+import pandas as pd
+
+import xarray
+import utils
+
 
 # Special characters that are permitted in netCDF names except in the
 # 0th position of the string
@@ -43,61 +14,50 @@
 
 # The following are reserved names in CDL and may not be used as names of
 # variables, dimension, attributes
-_reserved_names = set([
-    'byte',
-    'char',
-    'short',
-    'ushort',
-    'int',
-    'uint',
-    'int64',
-    'uint64',
-    'float'
-    'real',
-    'double',
-    'bool',
-    'string',
-    ])
+_reserved_names = set(['byte', 'char', 'short', 'ushort', 'int', 'uint',
+                       'int64', 'uint64', 'float', 'real', 'double', 'bool',
+                       'string'])
+
+# These data types aren't supported by netCDF-3, so arrays of these types
+# are automatically coerced as indicated by the "coerce_nc3_dtype" function
+_nc3_dtype_coercions = {'int64': 'int32', 'float64': 'float32', 'bool': 'int8'}
+
 
 def pretty_print(x, numchars):
     """Given an object x, call x.__str__() and format the returned
     string so that it is numchars long, padding with trailing spaces or
     truncating with ellipses as necessary"""
-    s = str(x).rstrip(NULL)
+    s = str(x)
     if len(s) > numchars:
         return s[:(numchars - 3)] + '...'
     else:
         return s
 
 
-def coerce_type(arr):
-    """Coerce a numeric data type to a type that is compatible with
-    netCDF-3
-
-    netCDF-3 can not handle 64-bit integers, but on most platforms
-    Python integers are int64. To work around this discrepancy, this
-    helper function coerces int64 arrays to int32. An exception is
-    raised if this coercion is not safe.
+def coerce_nc3_dtype(arr):
+    """Coerce an array to a data type that can be stored in a netCDF-3 file
+
+    This function performs the following dtype conversions:
+        int64 -> int32
+        float64 -> float32
+        bool -> int8
 
-    netCDF-3 can not handle booleans, but booleans can be trivially
-    (albeit wastefully) represented as bytes. To work around this
-    discrepancy, this helper function coerces bool arrays to int8.
+    Casts are checked for safety: integer data must survive the cast
+    unchanged, and float data must match the original to within the default
+    tolerances of `np.allclose` (with NaNs in matching positions); a lossy
+    cast raises a ValueError.
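+
+    For example (illustrative values, not from the original source):
+
+    >>> coerce_nc3_dtype(np.array([0, 1], dtype='int64')).dtype
+    dtype('int32')
+    >>> coerce_nc3_dtype(np.array([2 ** 40], dtype='int64'))
+    Traceback (most recent call last):
+        ...
+    ValueError: could not safely cast array from dtype int64 to int32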
""" - # Comparing the char attributes of numpy dtypes is inelegant, but this is - # the fastest test of equivalence that is invariant to endianness - if arr.dtype.char == 'l': # np.dtype('int64') - cast_arr = arr.astype( - np.dtype('int32').newbyteorder(arr.dtype.byteorder)) - if not (cast_arr == arr).all(): - raise ValueError("array contains integer values that " + - "are not representable as 32-bit signed integers") - return cast_arr - elif arr.dtype.char == '?': # np.dtype('bool') - # bool - cast_arr = arr.astype( - np.dtype('int8').newbyteorder(arr.dtype.byteorder)) - return cast_arr - else: - return arr + dtype = str(arr.dtype) + if dtype in _nc3_dtype_coercions: + new_dtype = _nc3_dtype_coercions[dtype] + # TODO: raise a warning whenever casting the data-type instead? + cast_arr = arr.astype(new_dtype) + if (('int' in dtype and not (cast_arr == arr).all()) + or ('float' in dtype and + not utils.allclose_or_equiv(cast_arr, arr))): + raise ValueError('could not safely cast array from dtype %s to %s' + % (dtype, new_dtype)) + arr = cast_arr + return arr + def _isalnumMUTF8(c): """Return True if the given UTF-8 encoded character is alphanumeric @@ -105,10 +65,11 @@ def _isalnumMUTF8(c): Input is not checked! """ - return (c.isalnum() or (len(c.encode('utf-8')) > 1)) + return c.isalnum() or (len(c.encode('utf-8')) > 1) -def is_valid_name(s): - """Test whether an object can be validly converted to a netCDF + +def is_valid_nc3_name(s): + """Test whether an object can be validly converted to a netCDF-3 dimension, variable or attribute name Earlier versions of the netCDF C-library reference implementation @@ -135,5 +96,260 @@ def is_valid_name(s): ('/' not in s) and (s[-1] != ' ') and (_isalnumMUTF8(s[0]) or (s[0] == '_')) and - all((_isalnumMUTF8(c) or c in _specialchars for c in s)) - ) \ No newline at end of file + all((_isalnumMUTF8(c) or c in _specialchars for c in s))) + + +class MaskedAndScaledArray(object): + """Wrapper around array-like objects to create a new indexable object where + values, when accessesed, are automatically scaled and masked according to + CF conventions for packed and missing data values + + New values are given by the formula: + original_values * scale_factor + add_offset + + Values can only be accessed via `__getitem__`: + + >>> x = _MaskedAndScaledArray(np.array([-99, -1, 0, 1, 2]), -99, 0.01, 1) + >>> x + _MaskedAndScaledArray(array([-99, -1, 0, 1, 2]), fill_value=-99, + scale_factor=0.01, add_offset=1) + >>> x[:] + array([ nan, 0.99, 1. , 1.01, 1.02] + + References + ---------- + http://www.unidata.ucar.edu/software/netcdf/docs/BestPractices.html + """ + def __init__(self, array, fill_value=None, scale_factor=None, + add_offset=None): + """ + Parameters + ---------- + array : array-like + Original array of values to wrap + fill_value : number, optional + All values equal to fill_value in the original array are replaced + by NaN. + scale_factor : number, optional + Multiply entries in the original array by this number. + add_offset : number, optional + After applying scale_factor, add this number to entries in the + original array. 
+ """ + self.array = array + self.scale_factor = scale_factor + self.add_offset = add_offset + self.fill_value = fill_value + + @property + def dtype(self): + return np.dtype('float') + + @property + def shape(self): + return self.array.shape + + @property + def size(self): + return self.array.size + + @property + def ndim(self): + return self.array.ndim + + def __len__(self): + return len(self.array) + + def __array__(self): + return self[...] + + def __getitem__(self, key): + # cast to float to insure NaN is meaningful + values = np.array(self.array[key], dtype=float, copy=True) + if self.fill_value is not None: + values[values == self.fill_value] = np.nan + if self.scale_factor is not None: + values *= self.scale_factor + if self.add_offset is not None: + values += self.add_offset + return values + + def __repr__(self): + return ("%s(%r, fill_value=%r, scale_factor=%r, add_offset=%r)" % + (type(self).__name__, self.array, self.fill_value, + self.scale_factor, self.add_offset)) + + +class CharToStringArray(object): + """Wrapper around array-like objects to create a new indexable object where + values, when accessesed, are automatically concatenated along the last + dimension + + >>> CharToStringArray(np.array(['a', 'b', 'c']))[:] + array('abc', + dtype='|S3') + """ + def __init__(self, array): + """ + Parameters + ---------- + array : array-like + Original array of values to wrap. + """ + self.array = array + + @property + def dtype(self): + return np.dtype(str(self.array.dtype)[:2] + str(self.array.shape[-1])) + + @property + def shape(self): + return self.array.shape[:-1] + + @property + def size(self): + return np.prod(self.shape) + + @property + def ndim(self): + return self.array.ndim - 1 + + def __len__(self): + if self.ndim > 0: + return len(self.array) + else: + raise TypeError('len() of unsized object') + + def __str__(self): + if self.ndim == 0: + return str(self[...]) + else: + return repr(self) + + def __repr__(self): + return '%s(%r)' % (type(self).__name__, self.array) + + def __array__(self): + return self[...] + + def __getitem__(self, key): + # require slicing the last dimension completely + key = utils.expanded_indexer(key, self.array.ndim) + if key[-1] != slice(None): + raise IndexError('too many indices') + return nc4.chartostring(self.array[key]) + + +def encode_cf_variable(array): + """Converts an XArray into an XArray suitable for saving as a netCDF + variable + """ + dimensions = array.dimensions + data = array.data + attributes = array.attributes.copy() + encoding = array.encoding + + if isinstance(data, pd.DatetimeIndex): + # DatetimeIndex objects need to be encoded into numeric arrays + (data, units, calendar) = utils.datetimeindex2num(data, + units=encoding.get('units', None), + calendar=encoding.get('calendar', None)) + attributes['units'] = units + attributes['calendar'] = calendar + elif data.dtype == np.dtype('O'): + # Unfortunately, pandas.Index arrays often have dtype=object even if + # they were created from an array with a sensible datatype (e.g., + # pandas.Float64Index always has dtype=object for some reason). Because + # we allow for doing math with coordinates, these object arrays can + # propagate onward to other variables, which is why we don't only apply + # this check to XArrays with data that is a pandas.Index. + dtype = np.array(data.reshape(-1)[0]).dtype + # N.B. the "astype" call will fail if data cannot be cast to the type + # of its first element (which is probably the only sensible thing to + # do). 
+ data = np.asarray(data).astype(dtype) + + def get_to(source, dest, k): + v = source.get(k) + dest[k] = v + return v + + # encode strings as character arrays + if np.issubdtype(data.dtype, (str, unicode)): + data = nc4.stringtochar(data) + dimensions = dimensions + ('string%s' % data.shape[-1],) + + # unscale/mask + if any(k in encoding for k in ['add_offset', 'scale_factor']): + data = np.array(data, dtype=float, copy=True) + if 'add_offset' in encoding: + data -= get_to(encoding, attributes, 'add_offset') + if 'scale_factor' in encoding: + data /= get_to(encoding, attributes, 'scale_factor') + + # replace NaN with the fill value + if '_FillValue' in encoding: + if encoding['_FillValue'] is np.nan: + attributes['_FillValue'] = np.nan + else: + nans = np.isnan(data) + if nans.any(): + data[nans] = get_to(encoding, attributes, '_FillValue') + + # restore original dtype + if 'dtype' in encoding: + if np.issubdtype(encoding['dtype'], int): + data = data.round() + data = data.astype(encoding['dtype']) + + return xarray.XArray(dimensions, data, attributes, encoding=encoding) + + +def decode_cf_variable(var, mask_and_scale=True): + data = var.data + dimensions = var.dimensions + attributes = var.attributes.copy() + encoding = var.encoding.copy() + + def pop_to(source, dest, k): + """ + A convenience function which pops a key k from source to dest. + None values are not passed on. If k already exists in dest an + error is raised. + """ + v = source.pop(k, None) + if v is not None: + if k in dest: + raise ValueError("Failed hard to prevent overwriting key %s" % k) + dest[k] = v + return v + + if 'dtype' in encoding: + if var.data.dtype != encoding['dtype']: + raise ValueError("Refused to overwrite dtype") + encoding['dtype'] = data.dtype + if np.issubdtype(data.dtype, (str, unicode)): + # TODO: add some sort of check instead of just assuming that the last + # dimension on a character array is always the string dimension + dimensions = dimensions[:-1] + data = CharToStringArray(data) + elif mask_and_scale: + fill_value = pop_to(attributes, encoding, '_FillValue') + scale_factor = pop_to(attributes, encoding, 'scale_factor') + add_offset = pop_to(attributes, encoding, 'add_offset') + if (fill_value is not None or scale_factor is not None + or add_offset is not None): + data = MaskedAndScaledArray(data, fill_value, scale_factor, + add_offset) + # TODO: How should multidimensional time variables be handled? + if (data.ndim == 1 and + 'units' in attributes and + 'since' in attributes['units']): + # convert times to datetime indices. We only do this if the dimension + # is one, since otherwise it can't be a coordinate. + units = pop_to(attributes, encoding, 'units') + calendar = pop_to(attributes, encoding, 'calendar') + data = utils.num2datetimeindex(data, units=units, + calendar=calendar) + + return xarray.XArray(dimensions, data, attributes, encoding=encoding) diff --git a/src/xray/dataset.py b/src/xray/dataset.py index d4ed0c7f4a3..1544d87c6bf 100644 --- a/src/xray/dataset.py +++ b/src/xray/dataset.py @@ -17,7 +17,7 @@ num2date = nc4.num2date -def open_dataset(nc, *args, **kwargs): +def open_dataset(nc, decode_cf=True, *args, **kwargs): """Open the dataset given the object or path `nc`. 
*args and **kwargs provide format specific options @@ -32,7 +32,7 @@ def open_dataset(nc, *args, **kwargs): # If nc is a file-like object we read it using # the scipy.io.netcdf package store = backends.ScipyDataStore(nc, *args, **kwargs) - return Dataset.load_store(store) + return Dataset.load_store(store, decode_cf=decode_cf) # list of attributes of pd.DatetimeIndex that are ndarrays of time info @@ -101,7 +101,7 @@ class Dataset(Mapping): Note: the size of dimensions in a dataset cannot be changed. """ - def __init__(self, variables=None, attributes=None): + def __init__(self, variables=None, attributes=None, decode_cf=False): """To load data from a file or file-like object, use the `open_dataset` function. @@ -117,16 +117,18 @@ def __init__(self, variables=None, attributes=None): `pandas.Index` objects. attributes : dict-like, optional Global attributes to save on this dataset. + decode_cf : bool, optional + Whether to decode these variables according to CF conventions. """ self._variables = _VariablesDict() self._dimensions = OrderedDict() if variables is not None: - self._set_variables(variables) + self.set_variables(variables, decode_cf=decode_cf) if attributes is None: attributes = {} self._attributes = OrderedDict(attributes) - def _as_variable(self, name, var): + def _as_variable(self, name, var, decode_cf=False): if not isinstance(var, xarray.XArray): try: var = xarray.XArray(*var) @@ -134,27 +136,45 @@ def _as_variable(self, name, var): raise TypeError('Dataset variables must be of type ' 'DatasetArray or XArray, or a sequence of the ' 'form (dimensions, data[, attributes])') - + # this will unmask and rescale the data as well as convert + # time variables to datetime indices. + if decode_cf: + var = conventions.decode_cf_variable(var) if name in var.dimensions: # convert the coordinate into a pandas.Index if var.ndim != 1: raise ValueError('a coordinate variable must be defined with ' '1-dimensional data') - attr = var.attributes - if 'units' in attr and 'since' in attr['units']: - var.data = utils.num2datetimeindex(var.data, attr.pop('units'), - attr.pop('calendar', None)) - else: - var.data = pd.Index(var.data) + # create a new XArray object on which to modify the data + var = xarray.XArray(var.dimensions, pd.Index(var.data), + var.attributes, encoding=var.encoding) return var - def _set_variables(self, variables): - """Set a mapping of variables and update dimensions""" + def set_variables(self, variables, decode_cf=False): + """Set a mapping of variables and update dimensions. + + Parameters + ---------- + variables : dict-like, optional + A mapping from variable names to `XArray` objects or sequences of + the form `(dimensions, data[, attributes])` which can be used as + arguments to create a new `XArray`. Each dimension must have the + same length in all variables in which it appears. One dimensional + variables with name equal to their dimension are coordinate + variables, which means they are saved in the dataset as + `pandas.Index` objects. + decode_cf : bool, optional + Whether to decode these variables according to CF conventions. 
+ + Returns + ------- + None + """ # save new variables into a temporary list so all the error checking # can be done before updating _variables new_variables = [] for k, var in variables.iteritems(): - var = self._as_variable(k, var) + var = self._as_variable(k, var, decode_cf=decode_cf) for dim, size in zip(var.dimensions, var.shape): if dim not in self._dimensions: self._dimensions[dim] = size @@ -169,8 +189,8 @@ def _set_variables(self, variables): self._variables.update(new_variables) @classmethod - def load_store(cls, store): - return cls(store.variables, store.attributes) + def load_store(cls, store, decode_cf=True): + return cls(store.variables, store.attributes, decode_cf=decode_cf) @property def variables(self): @@ -265,7 +285,7 @@ def __setitem__(self, key, value): # TODO: should remove key from this dataset if it already exists self.merge(value.renamed(key).dataset, inplace=True) else: - self._set_variables({key: value}) + self.set_variables({key: value}) def __delitem__(self, key): """Remove a variable from this dataset. @@ -316,25 +336,23 @@ def noncoordinates(self): def dump_to_store(self, store): """Store dataset contents to a backends.*DataStore object.""" - store.set_dimensions(self.dimensions) store.set_variables(self.variables) store.set_attributes(self.attributes) store.sync() - def dump(self, filepath, *args, **kwdargs): + def dump(self, filepath, **kwdargs): """Dump dataset contents to a location on disk using the netCDF4 package. """ - nc4_store = backends.NetCDF4DataStore(filepath, mode='w', - *args, **kwdargs) + nc4_store = backends.NetCDF4DataStore(filepath, mode='w', **kwdargs) self.dump_to_store(nc4_store) - def dumps(self): + def dumps(self, **kwargs): """Serialize dataset contents to a string. The serialization creates an in memory netcdf version 3 string using the scipy.io.netcdf package. """ fobj = StringIO() - scipy_store = backends.ScipyDataStore(fobj, mode='w') + scipy_store = backends.ScipyDataStore(fobj, mode='w', **kwargs) self.dump_to_store(scipy_store) return fobj.getvalue() @@ -483,7 +501,7 @@ def renamed(self, name_dict): #TODO: public interface for renaming a variable without loading # data? variables[name] = xarray.XArray(dims, v._data, v.attributes, - v._indexing_mode) + v.encoding, v._indexing_mode) return type(self)(variables, self.attributes) @@ -517,9 +535,9 @@ def merge(self, other, inplace=False): compat=utils.xarray_equal) # update contents obj = self if inplace else self.copy() - obj._set_variables(OrderedDict((k, v) for k, v - in other.variables.iteritems() - if k not in obj.variables)) + obj.set_variables(OrderedDict((k, v) for k, v + in other.variables.iteritems() + if k not in obj.variables)) # remove conflicting attributes for k, v in other.attributes.iteritems(): if k in self.attributes and v != self.attributes[k]: diff --git a/src/xray/dataset_array.py b/src/xray/dataset_array.py index a12b624b203..748718b0d2c 100644 --- a/src/xray/dataset_array.py +++ b/src/xray/dataset_array.py @@ -129,6 +129,10 @@ def __iter__(self): def attributes(self): return self.array.attributes + @property + def encoding(self): + return self.array.encoding + @property def variables(self): return self.dataset.variables diff --git a/src/xray/utils.py b/src/xray/utils.py index d2a3f7c15b8..3579827ff39 100644 --- a/src/xray/utils.py +++ b/src/xray/utils.py @@ -113,6 +113,7 @@ def num2datetimeindex(num_dates, units, calendar=None): For standard (Gregorian) calendars, this function uses vectorized operations, which makes it much faster than netCDF4.num2date. 
""" + # TODO: fix this function so it works on arbitrary n-dimensional arrays num_dates = np.asarray(num_dates) if calendar is None: calendar = 'standard' @@ -165,11 +166,28 @@ def datetimeindex2num(dates, units=None, calendar=None): return (num, units, calendar) +def allclose_or_equiv(arr1, arr2, rtol=1e-5, atol=1e-8): + """Like np.allclose, but also allows values to be NaN in both arrays + """ + if arr1.shape != arr2.shape: + return False + nan_indices = np.isnan(arr1) + if not (nan_indices == np.isnan(arr2)).all(): + return False + if arr1.ndim > 0: + arr1 = arr1[~nan_indices] + arr2 = arr2[~nan_indices] + elif nan_indices: + # 0-d arrays can't be indexed, so just check if the value is NaN + return True + return np.allclose(arr1, arr2, rtol=rtol, atol=atol) + + def xarray_equal(v1, v2, rtol=1e-05, atol=1e-08): """True if two objects have the same dimensions, attributes and data; otherwise False. - This function is necessary because `v1 == v2` for variables and dataviews + This function is necessary because `v1 == v2` for XArrays and DatasetArrays does element-wise comparisions (like numpy.ndarrays). """ if (v1.dimensions == v2.dimensions @@ -182,17 +200,19 @@ def xarray_equal(v1, v2, rtol=1e-05, atol=1e-08): # _data is not part of the public interface, so it's okay if its # missing pass - # TODO: replace this with a NaN safe version. - # see: pandas.core.common.array_equivalent + + def is_floating(arr): + return np.issubdtype(arr.dtype, float) + data1 = v1.data data2 = v2.data if hasattr(data1, 'equals'): # handle pandas.Index objects return data1.equals(data2) - elif np.issubdtype(data1.dtype, (str, object)): - return np.array_equal(data1, data2) + elif is_floating(data1) or is_floating(data2): + return allclose_or_equiv(data1, data2) else: - return np.allclose(data1, data2, rtol=rtol, atol=atol) + return np.array_equal(data1, data2) else: return False @@ -238,14 +258,15 @@ def remove_incompatible_items(first_dict, second_dict, compat=operator.eq): if k in first_dict and not compat(v, first_dict[k]): del first_dict[k] + def dict_equal(first, second): - """ Test equality of two dict-like objects. If any of the values + """Test equality of two dict-like objects. If any of the values are numpy arrays, compare them for equality correctly. Parameters ---------- first, second : dict-like - dictionaries to compare for equality + Dictionaries to compare for equality Returns ------- diff --git a/src/xray/xarray.py b/src/xray/xarray.py index f4c3fa718c0..41ae20867a6 100644 --- a/src/xray/xarray.py +++ b/src/xray/xarray.py @@ -1,16 +1,16 @@ import functools -import warnings -from collections import OrderedDict -from itertools import izip - import numpy as np -import conventions -import dataset -import dataset_array -import groupby +from itertools import izip +from collections import OrderedDict + import ops import utils +import dataset +import groupby +import conventions +import dataset_array + from common import AbstractArray @@ -38,7 +38,8 @@ class XArray(AbstractArray): outside the context of its parent Dataset (if you want such a fully described object, use a DatasetArray instead). """ - def __init__(self, dims, data, attributes=None, indexing_mode='numpy'): + def __init__(self, dims, data, attributes=None, encoding=None, + indexing_mode='numpy'): """ Parameters ---------- @@ -51,14 +52,20 @@ def __init__(self, dims, data, attributes=None, indexing_mode='numpy'): attributes : dict_like or None, optional Attributes to assign to the new variable. 
If None (default), an empty attribute dictionary is initialized.
+        encoding : dict_like or None, optional
+            Dictionary specifying how to encode this array's data into a
+            serialized format like netCDF4. Currently used keys (for netCDF)
+            include '_FillValue', 'scale_factor', 'add_offset' and 'dtype'.
+            Well-behaved code to serialize an XArray should ignore
+            unrecognized keys in this dictionary.
         indexing_mode : {'numpy', 'orthogonal'}
             String indicating how the data parameter handles fancy
             indexing (with arrays). Two modes are supported: 'numpy'
             (fancy indexing like numpy.ndarray objects) and 'orthogonal'
             (array indexing accesses different dimensions independently, like
-            netCDF4 variables). Accessing data from a Array always uses orthogonal
+            netCDF4 variables). Accessing data from an XArray always uses orthogonal
             indexing, so `indexing_mode` tells the variable whether index
             lookups need to be internally converted to numpy-style indexing.
         """
         if isinstance(dims, basestring):
             dims = (dims,)
@@ -70,6 +77,7 @@
         if attributes is None:
             attributes = {}
         self._attributes = OrderedDict(attributes)
+        self.encoding = dict({} if encoding is None else encoding)
         self._indexing_mode = indexing_mode
 
     @property
@@ -149,7 +157,7 @@ def __getitem__(self, key):
         # return a variable with the same indexing_mode, because data should
         # still be the same type as _data
         return type(self)(dimensions, new_data, self.attributes,
-                          indexing_mode=self._indexing_mode)
+                          self.encoding, self._indexing_mode)
 
     def __setitem__(self, key, value):
         """__setitem__ is overloaded to access the underlying numpy data with
@@ -181,7 +189,8 @@ def _copy(self, deepcopy=False):
         # note:
         # dimensions is already an immutable tuple
         # attributes will be copied when the new Array is created
-        return type(self)(self.dimensions, data, self.attributes)
+        return type(self)(self.dimensions, data, self.attributes,
+                          self.encoding)
 
     def __copy__(self):
         return self._copy(deepcopy=False)
@@ -274,7 +283,7 @@ def transpose(self, *dimensions):
         dimensions = self.dimensions[::-1]
         axes = [self.dimensions.index(dim) for dim in dimensions]
         data = self.data.transpose(*axes)
-        return type(self)(dimensions, data, self.attributes)
+        return type(self)(dimensions, data, self.attributes, self.encoding)
 
     def reduce(self, func, dimension=None, axis=None, **kwargs):
         """Reduce this array by applying `func` along some dimension(s).
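# A minimal sketch (illustrative values; the import path assumes the
# src/xray layout used in this diff) of how the new `encoding` argument
# is carried through XArray operations:

import numpy as np
from xray.xarray import XArray

v = XArray(('x', 'y'), np.arange(6).reshape(2, 3),
           attributes={'units': 'm'},
           encoding={'dtype': np.dtype('int16'), '_FillValue': -1})
w = v.transpose('y', 'x')[0]
assert w.encoding == v.encoding   # __getitem__ and transpose both carry it
assert w.attributes['units'] == 'm'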
@@ -324,7 +333,8 @@ def reduce(self, func, dimension=None, axis=None, **kwargs): for dim in dimension: var = var._reduce(func, dim, **kwargs) else: - var = type(self)([], func(self.data, **kwargs), self.attributes) + var = type(self)([], func(self.data, **kwargs), + _math_safe_attributes(self.attributes)) var._append_to_cell_methods(': '.join(self.dimensions) + ': ' + func.__name__) return var @@ -342,7 +352,8 @@ def _reduce(self, f, dim, **kwargs): dims = tuple(dim for i, dim in enumerate(self.dimensions) if axis not in [i, i - self.ndim]) data = f(self.data, axis=axis, **kwargs) - new_var = type(self)(dims, data, self.attributes) + new_var = type(self)(dims, data, + _math_safe_attributes(self.attributes)) new_var._append_to_cell_methods(self.dimensions[axis] + ': ' + f.__name__) return new_var @@ -473,7 +484,7 @@ def _unary_op(f): @functools.wraps(f) def func(self, *args, **kwargs): return type(self)(self.dimensions, f(self.data, *args, **kwargs), - self.attributes) + _math_safe_attributes(self.attributes)) return func @staticmethod @@ -486,11 +497,11 @@ def func(self, other): new_data = (f(self_data, other_data) if not reflexive else f(other_data, self_data)) + new_attr = _math_safe_attributes(self.attributes) + # TODO: reconsider handling of conflicting attributes if hasattr(other, 'attributes'): - new_attr = utils.ordered_dict_intersection(self.attributes, - other.attributes) - else: - new_attr = self.attributes + new_attr = utils.ordered_dict_intersection( + new_attr, _math_safe_attributes(other.attributes)) return type(self)(dims, new_data, new_attr) return func @@ -504,25 +515,31 @@ def func(self, other): 'operations') self.data = f(self_data, other_data) if hasattr(other, 'attributes'): - utils.remove_incompatible_items(self.attributes, other) + utils.remove_incompatible_items( + self.attributes, _math_safe_attributes(other.attributes)) return self return func ops.inject_special_operations(XArray) +def _math_safe_attributes(attributes): + return OrderedDict((k, v) for k, v in attributes.iteritems() + if k not in ['units']) + + def broadcast_xarrays(first, second): """Given two XArrays, return two AXrrays with matching dimensions and numpy broadcast compatible data. Parameters ---------- - first, second : Array - Array objects to broadcast. + first, second : XArray + XArray objects to broadcast. Returns ------- - first_broadcast, second_broadcast : Array + first_broadcast, second_broadcast : XArray Broadcast arrays. 
The data on each variable will be a view of the data on the corresponding original arrays, but dimensions will be reordered and inserted so that both broadcast arrays have the same @@ -552,21 +569,23 @@ def broadcast_xarrays(first, second): # expand first_data's dimensions so it's broadcast compatible after # adding second's dimensions at the end first_data = first.data[(Ellipsis,) + (None,) * len(second_only_dims)] - new_first = XArray(dimensions, first_data, first.attributes) + new_first = XArray(dimensions, first_data, first.attributes, + first.encoding) # expand and reorder second_data so the dimensions line up first_only_dims = [d for d in dimensions if d not in second.dimensions] second_dims = list(second.dimensions) + first_only_dims second_data = second.data[(Ellipsis,) + (None,) * len(first_only_dims)] - new_second = XArray(second_dims, second_data, first.attributes - ).transpose(*dimensions) + new_second = XArray(second_dims, second_data, first.attributes, + second.encoding).transpose(*dimensions) return new_first, new_second def _broadcast_xarray_data(self, other): if isinstance(other, dataset.Dataset): raise TypeError('datasets do not support mathematical operations') - elif all(hasattr(other, attr) for attr in ['dimensions', 'data', 'shape']): - # `other` satisfies the xray.Array API + elif all(hasattr(other, attr) for attr + in ['dimensions', 'data', 'shape', 'encoding']): + # `other` satisfies the necessary xray.Array API for broadcast_xarrays new_self, new_other = broadcast_xarrays(self, other) self_data = new_self.data other_data = new_other.data diff --git a/test/__init__.py b/test/__init__.py index b4133911543..0195431894f 100644 --- a/test/__init__.py +++ b/test/__init__.py @@ -15,6 +15,17 @@ def assertXArrayNotEqual(self, v1, v2): def assertArrayEqual(self, a1, a2): assert_array_equal(a1, a2) + def assertDatasetEqual(self, d1, d2): + # this method is functionally equivalent to `assert d1 == d2`, but it + # checks each aspect of equality separately for easier debugging + self.assertEqual(sorted(d1.attributes.items()), + sorted(d2.attributes.items())) + self.assertEqual(sorted(d1.variables), sorted(d2.variables)) + for k in d1: + v1 = d1.variables[k] + v2 = d2.variables[k] + self.assertXArrayEqual(v1, v2) + class ReturnItem(object): def __getitem__(self, key): diff --git a/test/test_conventions.py b/test/test_conventions.py new file mode 100644 index 00000000000..6a748bc6406 --- /dev/null +++ b/test/test_conventions.py @@ -0,0 +1,55 @@ +import numpy as np + +from xray.conventions import MaskedAndScaledArray, CharToStringArray +from . 
import TestCase + + +class TestMaskedAndScaledArray(TestCase): + def test(self): + x = MaskedAndScaledArray(np.arange(3), fill_value=0) + self.assertEqual(x.dtype, np.dtype('float')) + self.assertEqual(x.shape, (3,)) + self.assertEqual(x.size, 3) + self.assertEqual(x.ndim, 1) + self.assertEqual(len(x), 3) + self.assertArrayEqual([np.nan, 1, 2], x) + + x = MaskedAndScaledArray(np.arange(3), add_offset=1) + self.assertArrayEqual(np.arange(3) + 1, x) + + x = MaskedAndScaledArray(np.arange(3), scale_factor=2) + self.assertArrayEqual(2 * np.arange(3), x) + + x = MaskedAndScaledArray(np.array([-99, -1, 0, 1, 2]), -99, 0.01, 1) + expected = np.array([np.nan, 0.99, 1, 1.01, 1.02]) + self.assertArrayEqual(expected, x) + + +class TestCharToStringArray(TestCase): + def test(self): + array = np.array(list('abc')) + actual = CharToStringArray(array) + expected = np.array('abc') + self.assertEqual(actual.dtype, expected.dtype) + self.assertEqual(actual.shape, expected.shape) + self.assertEqual(actual.size, expected.size) + self.assertEqual(actual.ndim, expected.ndim) + with self.assertRaises(TypeError): + len(actual) + self.assertArrayEqual(expected, actual) + with self.assertRaises(IndexError): + actual[:2] + self.assertEqual(str(actual), 'abc') + + array = np.array([list('abc'), list('cdf')]) + actual = CharToStringArray(array) + expected = np.array(['abc', 'cdf']) + self.assertEqual(actual.dtype, expected.dtype) + self.assertEqual(actual.shape, expected.shape) + self.assertEqual(actual.size, expected.size) + self.assertEqual(actual.ndim, expected.ndim) + self.assertEqual(len(actual), len(expected)) + self.assertArrayEqual(expected, actual) + self.assertArrayEqual(expected[:1], actual[:1]) + with self.assertRaises(IndexError): + actual[:, :2] diff --git a/test/test_dataset.py b/test/test_dataset.py index b1f63ca462b..dd7a46f0bf4 100644 --- a/test/test_dataset.py +++ b/test/test_dataset.py @@ -1,10 +1,10 @@ from collections import OrderedDict -from copy import deepcopy from cStringIO import StringIO import os.path import unittest import tempfile +import netCDF4 as nc4 import numpy as np import pandas as pd @@ -13,16 +13,17 @@ _test_data_path = os.path.join(os.path.dirname(__file__), 'data') -_dims = {'dim1':100, 'dim2':50, 'dim3':10} -_vars = {'var1':['dim1', 'dim2'], - 'var2':['dim1', 'dim2'], - 'var3':['dim3', 'dim1'], +_dims = {'dim1': 100, 'dim2': 50, 'dim3': 10} +_vars = {'var1': ['dim1', 'dim2'], + 'var2': ['dim1', 'dim2'], + 'var3': ['dim3', 'dim1'], } _testvar = sorted(_vars.keys())[0] _testdim = sorted(_dims.keys())[0] -def create_test_data(store=None): - obj = Dataset() if store is None else Dataset.load_store(store) + +def create_test_data(): + obj = Dataset() obj['time'] = ('time', pd.date_range('2000-01-01', periods=1000)) for k, d in sorted(_dims.items()): obj[k] = (k, np.arange(d)) @@ -32,12 +33,9 @@ def create_test_data(store=None): return obj -class DataTest(TestCase): - def get_store(self): - return backends.InMemoryDataStore() - +class TestDataset(TestCase): def test_repr(self): - data = create_test_data(self.get_store()) + data = create_test_data() self.assertEqual('', repr(data)) @@ -51,7 +49,7 @@ def test_init(self): Dataset({'a': var1, 'x': var3}) def test_groupby(self): - data = create_test_data(self.get_store()) + data = create_test_data() for n, (t, sub) in enumerate(list(data.groupby('dim1'))[:3]): self.assertEqual(data['dim1'][n], t) self.assertXArrayEqual(data['var1'][n], sub['var1']) @@ -142,7 +140,7 @@ def test_attributes(self): self.assertRaises(ValueError, 
b.attributes.__setitem__, 'foo', dict()) def test_indexed_by(self): - data = create_test_data(self.get_store()) + data = create_test_data() slicers = {'dim1': slice(None, None, 2), 'dim2': slice(0, 2)} ret = data.indexed_by(**slicers) @@ -181,13 +179,12 @@ def test_indexed_by(self): self.assertItemsEqual({'dim2': 5, 'dim3': 10}, ret.dimensions) def test_labeled_by(self): - data = create_test_data(self.get_store()) + data = create_test_data() int_slicers = {'dim1': slice(None, None, 2), 'dim2': slice(0, 2)} loc_slicers = {'dim1': slice(None, None, 2), 'dim2': slice(0, 1)} self.assertEqual(data.indexed_by(**int_slicers), data.labeled_by(**loc_slicers)) - data['time'] = ('time', np.arange(1000, dtype=np.int32), - {'units': 'days since 2000-01-01'}) + data['time'] = ('time', pd.date_range('2000-01-01', periods=1000)) self.assertEqual(data.indexed_by(time=0), data.labeled_by(time='2000-01-01')) self.assertEqual(data.indexed_by(time=slice(10)), @@ -199,7 +196,7 @@ def test_labeled_by(self): time=pd.date_range('2000-01-01', periods=3))) def test_variable_indexing(self): - data = create_test_data(self.get_store()) + data = create_test_data() v = data['var1'] d1 = data['dim1'] d2 = data['dim2'] @@ -212,7 +209,7 @@ def test_variable_indexing(self): self.assertXArrayEqual(v[:3, :2], v[range(3), range(2)]) def test_select(self): - data = create_test_data(self.get_store()) + data = create_test_data() ret = data.select(_testvar) self.assertXArrayEqual(data[_testvar], ret[_testvar]) self.assertTrue(_vars.keys()[1] not in ret.variables) @@ -223,7 +220,7 @@ def test_unselect(self): pass def test_copy(self): - data = create_test_data(self.get_store()) + data = create_test_data() var = data.variables[_testvar] var.attributes['foo'] = 'hello world' var_copy = var.__deepcopy__() @@ -239,7 +236,7 @@ def test_copy(self): self.assertNotEqual(id(var.attributes), id(var_copy.attributes)) def test_rename(self): - data = create_test_data(self.get_store()) + data = create_test_data() newnames = {'var1': 'renamed_var1', 'dim2': 'renamed_dim2'} renamed = data.renamed(newnames) @@ -264,7 +261,7 @@ def test_rename(self): self.assertTrue('dim2' not in renamed.dimensions) def test_merge(self): - data = create_test_data(self.get_store()) + data = create_test_data() ds1 = data.select('var1') ds2 = data.select('var3') expected = data.select('var1', 'var3') @@ -276,9 +273,8 @@ def test_merge(self): ds1.merge(ds2.renamed({'var3': 'var1'})) def test_getitem(self): - data = create_test_data(self.get_store()) - data['time'] = ('time', np.arange(1000, dtype=np.int32), - {'units': 'days since 2000-01-01'}) + data = create_test_data() + data['time'] = ('time', pd.date_range('2000-01-01', periods=1000)) self.assertIsInstance(data['var1'], DatasetArray) self.assertXArrayEqual(data['var1'], data.variables['var1']) self.assertItemsEqual(data['var1'].dataset.variables, @@ -292,7 +288,7 @@ def test_getitem(self): def test_setitem(self): # assign a variable var = XArray(['dim1'], np.random.randn(100)) - data1 = create_test_data(self.get_store()) + data1 = create_test_data() data1['A'] = var data2 = data1.copy() data2['A'] = var @@ -306,13 +302,6 @@ def test_setitem(self): with self.assertRaisesRegexp(TypeError, 'variables must be of type'): data2['C'] = var.data - def test_write_store(self): - expected = create_test_data() - store = self.get_store() - expected.dump_to_store(store) - actual = Dataset.load_store(store) - self.assertEquals(expected, actual) - def test_to_dataframe(self): x = np.random.randn(10) y = np.random.randn(10) @@ 
-337,49 +326,165 @@ def test_to_dataframe(self): self.assertTrue(expected.equals(actual)) -class NetCDF4DataTest(DataTest): +def create_masked_and_scaled_data(): + x = np.array([np.nan, np.nan, 10, 10.1, 10.2]) + encoding = {'_FillValue': -1, 'add_offset': 10, + 'scale_factor': np.float32(0.1), 'dtype': np.int16} + return Dataset({'x': ('t', x, {}, encoding)}) + + +def create_encoded_masked_and_scaled_data(): + attributes = {'_FillValue': -1, 'add_offset': 10, + 'scale_factor': np.float32(0.1)} + return Dataset({'x': XArray('t', [-1, -1, 0, 1, 2], attributes)}) + + +class DatasetIOCases(object): + def get_store(self): + raise NotImplementedError + + def roundtrip(self, data, **kwargs): + raise NotImplementedError + + def test_write_store(self): + expected = create_test_data() + store = self.get_store() + expected.dump_to_store(store) + actual = Dataset.load_store(store) + self.assertDatasetEqual(expected, actual) + + def test_roundtrip_test_data(self): + expected = create_test_data() + actual = self.roundtrip(expected) + self.assertDatasetEqual(expected, actual) + + def test_roundtrip_string_data(self): + expected = Dataset({'x': ('t', ['abc', 'def'])}) + actual = self.roundtrip(expected) + self.assertDatasetEqual(expected, actual) + + def test_roundtrip_mask_and_scale(self): + decoded = create_masked_and_scaled_data() + encoded = create_encoded_masked_and_scaled_data() + self.assertDatasetEqual(decoded, self.roundtrip(decoded)) + self.assertDatasetEqual(encoded, + self.roundtrip(decoded, decode_cf=False)) + self.assertDatasetEqual(decoded, self.roundtrip(encoded)) + self.assertDatasetEqual(encoded, + self.roundtrip(encoded, decode_cf=False)) + + def test_roundtrip_example_1_netcdf(self): + expected = open_dataset(os.path.join(_test_data_path, 'example_1.nc')) + actual = self.roundtrip(expected) + self.assertDatasetEqual(expected, actual) + + +class NetCDF4DataTest(DatasetIOCases, TestCase): def get_store(self): f, self.tmp_file = tempfile.mkstemp(suffix='.nc') os.close(f) return backends.NetCDF4DataStore(self.tmp_file, mode='w') - def test_dump_and_open_dataset(self): - data = create_test_data(self.get_store()) + def tearDown(self): + if hasattr(self, 'tmp_file') and os.path.exists(self.tmp_file): + os.remove(self.tmp_file) + + def roundtrip(self, data, **kwargs): f, tmp_file = tempfile.mkstemp(suffix='.nc') os.close(f) data.dump(tmp_file) + roundtrip_data = open_dataset(tmp_file, **kwargs) + os.remove(tmp_file) + return roundtrip_data + + def test_open_encodings(self): + # Create a netCDF file with explicit time units + # and make sure it makes it into the encodings + # and survives a round trip + f, tmp_file = tempfile.mkstemp(suffix='.nc') + os.close(f) + + ds = nc4.Dataset(tmp_file, 'w') + ds.createDimension('time', size=10) + ds.createVariable('time', np.int32, dimensions=('time',)) + units = 'days since 1999-01-01' + ds.variables['time'].setncattr('units', units) + ds.variables['time'][:] = np.arange(10) + 4 + ds.close() + + expected = Dataset() + + time = pd.date_range('1999-01-05', periods=10) + encoding = {'units': units, 'dtype': np.dtype('int32')} + expected['time'] = ('time', time, {}, encoding) - expected = data.copy() actual = open_dataset(tmp_file) - self.assertEquals(expected, actual) + + self.assertXArrayEqual(actual['time'], expected['time']) + self.assertDictEqual(actual['time'].encoding, expected['time'].encoding) + os.remove(tmp_file) - def tearDown(self): - if hasattr(self, 'tmp_file') and os.path.exists(self.tmp_file): - os.remove(self.tmp_file) + def 
test_dump_and_open_encodings(self): + # Create a netCDF file with explicit time units + # and make sure it makes it into the encodings + # and survives a round trip + f, tmp_file = tempfile.mkstemp(suffix='.nc') + os.close(f) + ds = nc4.Dataset(tmp_file, 'w') + ds.createDimension('time', size=10) + ds.createVariable('time', np.int32, dimensions=('time',)) + units = 'days since 1999-01-01' + ds.variables['time'].setncattr('units', units) + ds.variables['time'][:] = np.arange(10) + 4 + ds.close() -class ScipyDataTest(DataTest): - def get_store(self): - fobj = StringIO() - return backends.ScipyDataStore(fobj, 'w') + xray_dataset = open_dataset(tmp_file) + os.remove(tmp_file) + xray_dataset.dump(tmp_file) - def test_dump_and_open_dataset(self): - data = create_test_data(self.get_store()) - serialized = data.dumps() + ds = nc4.Dataset(tmp_file, 'r') - expected = data.copy() - actual = open_dataset(StringIO(serialized)) - self.assertEquals(expected, actual) + self.assertEqual(ds.variables['time'].getncattr('units'), units) + self.assertArrayEqual(ds.variables['time'], np.arange(10) + 4) - def test_open_and_reopen_existing(self): - data = open_dataset(os.path.join(_test_data_path, 'example_1.nc')) - serialized = data.dumps() + ds.close() + os.remove(tmp_file) - expected = data.copy() - actual = open_dataset(StringIO(serialized)) - self.assertEquals(expected, actual) + def test_mask_and_scale(self): + f, tmp_file = tempfile.mkstemp(suffix='.nc') + os.close(f) - def test_repr(self): - # scipy.io.netcdf does not keep track of dimension order :( - pass + nc = nc4.Dataset(tmp_file, mode='w') + nc.createDimension('t', 5) + nc.createVariable('x', 'int16', ('t',), fill_value=-1) + v = nc.variables['x'] + v.set_auto_maskandscale(False) + v.add_offset = 10 + v.scale_factor = 0.1 + v[:] = np.array([-1, -1, 0, 1, 2]) + nc.close() + + # first make sure netCDF4 reads the masked and scaled data correctly + nc = nc4.Dataset(tmp_file, mode='r') + expected = np.ma.array([-1, -1, 10, 10.1, 10.2], + mask=[True, True, False, False, False]) + actual = nc.variables['x'][:] + self.assertArrayEqual(expected, actual) + + # now check xray + ds = open_dataset(tmp_file) + expected = create_masked_and_scaled_data() + self.assertDatasetEqual(expected, ds) + os.remove(tmp_file) + + +class ScipyDataTest(DatasetIOCases, TestCase): + def get_store(self): + fobj = StringIO() + return backends.ScipyDataStore(fobj, 'w') + + def roundtrip(self, data, **kwargs): + serialized = data.dumps() + return open_dataset(StringIO(serialized), **kwargs) diff --git a/test/test_utils.py b/test/test_utils.py index 646484bdc6a..d1bf68bd5e7 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -107,25 +107,25 @@ def test_ordered_dict_intersection(self): def test_dict_equal(self): x = OrderedDict() x['a'] = 3 - x['b'] = np.array([1,2,3]) + x['b'] = np.array([1, 2, 3]) y = OrderedDict() - y['b'] = np.array([1.0,2.0,3.0]) + y['b'] = np.array([1.0, 2.0, 3.0]) y['a'] = 3 self.assertTrue(utils.dict_equal(x, y)) # two nparrays are equal - y['b'] = [1,2,3] # np.array not the same as a list + y['b'] = [1, 2, 3] # np.array not the same as a list self.assertFalse(utils.dict_equal(x, y)) # nparray != list - x['b'] = [1.0,2.0,3.0] - self.assertTrue(utils.dict_equal(x,y)) # list vs. list + x['b'] = [1.0, 2.0, 3.0] + self.assertTrue(utils.dict_equal(x, y)) # list vs. 
list x['c'] = None - self.assertFalse(utils.dict_equal(x,y)) # new key in x + self.assertFalse(utils.dict_equal(x, y)) # new key in x x['c'] = np.nan y['c'] = np.nan - self.assertFalse(utils.dict_equal(x,y)) # as intended, nan != nan + self.assertFalse(utils.dict_equal(x, y)) # as intended, nan != nan x['c'] = np.inf y['c'] = np.inf - self.assertTrue(utils.dict_equal(x,y)) # inf == inf + self.assertTrue(utils.dict_equal(x, y)) # inf == inf y = dict(y) - self.assertTrue(utils.dict_equal(x,y)) # different dictionary types are fine + self.assertTrue(utils.dict_equal(x, y)) # different dictionary types are fine def test_frozen(self): x = utils.Frozen(self.x) diff --git a/test/test_xarray.py b/test/test_xarray.py index f91642e6404..f5323db3a74 100644 --- a/test/test_xarray.py +++ b/test/test_xarray.py @@ -123,10 +123,11 @@ def test_1d_math(self): self.assertArrayEqual((x * v).data, x ** 2) self.assertArrayEqual(v - y, v - 1) self.assertArrayEqual(y - v, 1 - v) - # verify attributes + # verify math-safe attributes v2 = XArray(['x'], x, {'units': 'meters'}) - self.assertXArrayEqual(v2, +v2) - self.assertXArrayEqual(v2, 0 + v2) + self.assertXArrayEqual(v, +v2) + v3 = XArray(['x'], x, {'something': 'else'}) + self.assertXArrayEqual(v3, +v3) # binary ops with all variables self.assertArrayEqual(v + v, 2 * v) w = XArray(['x'], y, {'foo': 'bar'})
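# A minimal sketch (illustrative values; the import path assumes the
# src/xray layout used in this diff) of the math-safe attribute handling
# exercised by the test above: arithmetic now routes attributes through
# _math_safe_attributes, which drops 'units' but keeps everything else.

import numpy as np
from xray.xarray import XArray

v = XArray(['x'], np.arange(3.0), {'units': 'meters', 'long_name': 'distance'})
w = +v
assert 'units' not in w.attributes
assert w.attributes['long_name'] == 'distance'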