From a51f2515b7fa34c2d746941eca31cb56292903d2 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Thu, 16 Jan 2025 12:25:58 +0000 Subject: [PATCH 01/25] Big rework and expand docs. --- docs/userdocs/user_guide/_snippets.rst | 89 +++ docs/userdocs/user_guide/data_objects.rst | 274 +++++++++ .../userdocs/user_guide/design_principles.rst | 1 + docs/userdocs/user_guide/general_topics.rst | 200 +++++++ docs/userdocs/user_guide/howtos.rst | 525 ++++++++++++++++++ docs/userdocs/user_guide/known_issues.rst | 7 + docs/userdocs/user_guide/user_guide.rst | 6 +- 7 files changed, 1099 insertions(+), 3 deletions(-) create mode 100644 docs/userdocs/user_guide/_snippets.rst create mode 100644 docs/userdocs/user_guide/data_objects.rst create mode 100644 docs/userdocs/user_guide/general_topics.rst create mode 100644 docs/userdocs/user_guide/howtos.rst diff --git a/docs/userdocs/user_guide/_snippets.rst b/docs/userdocs/user_guide/_snippets.rst new file mode 100644 index 0000000..f291d76 --- /dev/null +++ b/docs/userdocs/user_guide/_snippets.rst @@ -0,0 +1,89 @@ +Snippets +======== + +Notes and writeups of handy description areas, that don't yet have a home. + +Data component (NameMap) dictionaries +------------------------------------- +For all of these properties, dictionary-style behaviour means that its ``.keys()`` +is a sequence of the content names, and ``.values()`` is a sequence of the contained +objects. + + +NcData +------ +The :class:`~ncdata.NcData` class represents either a dataset or group, +the structures of these are identical. + +NcAttributes +------------ +attributes are stored as NcAttribute objects, rather than simple name: value maps. +thus an 'attribute' of a NcVariable or NcData is an attribute object, not a value. 
+ +Thus: + + >>> variable.attributes['x'] + NcAttribute('x', [1., 2., 7.]) + +The attribute has a ``.value`` property, but it is most usefully accessed with the +:meth:`~ncdata.NcAttribute.as_python_value()` method : + + >>> attr = NcAttribute('b', [1.]) + >>> attr.value + array([1.]) + >>> attr.as_python_value() + array(1.) + + >>> attr = NcAttribute('a', "this") + >>> attr.value + array('this', dtype='>> attr.as_python_value() + 'this' + +From within a parent object's ``.attributes`` dictionary, + + +Component Dictionaries +---------------------- +ordering +- insert, remove, rename effects +re-ordering + + +As described :ref:`above `, sub-components are stored under their names +in a dictionary container. + +Since all components have a name, and are stored by name in the parent property +dictionary (e.g. ``variable.attributes`` or ``data.dimensions``), the component +dictionaries have an :meth:`~ncdata.NameMap.add` method, which works with the component +name. +supported operations +^^^^^^^^^^^^^^^^^^^^ +standard dict methods : del, getitem, setitem, clear, append, extend +extra methods : add, addall + +ordering +^^^^^^^^ +For Python dictionaries in general, +since `announced in Python 3.7 `_, +the order of the entries is now a significant and stable feature of Python dictionaries. +There +Also as for Python dictionaries generally, there is no particular assistance for +managing or using the order. 
The following may give some indication: + +extract 'n'th item: ``data.variables[list(elelments.keys())[n]]`` +sort the list: + # get all the contents, sorted by name + content = list(data.attributes.values()) + content = sorted(content, key= lambda v: v.name) + # clear the container -- necessary to forget the old ordering + data.attributes.clear() + # add all back in, in the new order + data.attributes.addall(content) + +New entries are added last, and renamed entries retain their + +The :meth:`~ncdata.utils/dataset_differences` method reports differences in the +ordering of components (unless turned off). + + diff --git a/docs/userdocs/user_guide/data_objects.rst b/docs/userdocs/user_guide/data_objects.rst new file mode 100644 index 0000000..3c937c3 --- /dev/null +++ b/docs/userdocs/user_guide/data_objects.rst @@ -0,0 +1,274 @@ +Core Data Objects +================= +Ncdata uses Python objects to represent netCDF data, and allows the user to freely +inspect and/or modify it, aiming to do this is the most natural and pythonic way. + +.. _data-model: + +Data Classes +------------ +The data model components are elements of the +`NetCDF Classic Data Model`_ , plus **groups** (from the 'enhanced' netCDF model). + +That is, a Dataset(File) consists of just Dimensions, Variables, Attributes and +Groups. + +.. note:: + We are not, as yet, explicitly supporting the NetCDF4 extensions to variable-length + and user types. See : :ref:`data-types` + +The core ncdata classes representing these Data Model components are +:class:`~ncdata.NcData`, :class:`~ncdata.NcDimension`, :class:`~ncdata.NcVariable` and +:class:`~ncdata.NcAttribute`. + +Notes : + +* There is no "NcGroup" class : :class:`~ncdata.NcData` is used for both the "group" and + "dataset" (aka file). + +* All data objects have a ``.name`` property, but this can be empty (``None``) when it is not + contained in a parent object as a component. See :ref:`components-and-containers`, + below. 
+ + +:class:`~ncdata.NcData` +^^^^^^^^^^^^^^^^^^^^^^^ +This represents a dataset containing variables, attributes and groups. +It is also used to represent groups. + +:class:`~ncdata.NcDimension` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +This represents a dimension, defined in terms of name, length, and whether "unlimited" +(or not). + +:class:`~ncdata.NcVariable` +^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Represents a data variable, with dimensions and, optionally, data and attributes. + +Note that ``.dimensions`` is simply a list of names (strings) : they are not +:class:`~ncdata.NcDimension` objects, and not linked to actual dimensions of the +dataset, so *actual* dimensions are only identified dynamically, when they need to be. + +Variables can be created with either real (numpy) or lazy (dask) arrays, or no data at +all. + +A variable has a ``.dtype``, which may be set if creating with no data. +However, at present, after creation ``.data`` and ``.dtype`` can be reassigned and there +is no further checking of any sort. + +.. _variable-dtypes: + +Variable Data Arrays +"""""""""""""""""""" +When a variable does have a ``.data`` property, this will be an array, with at least +the usual ``shape``, ``dtype`` and ``__getitem__`` properties. In practice we assume +for now that we will always have real (numpy) or lazy (dask) arrays. + +When data is exchanged with an actual file, it is simply written if real, and streamed +(via :meth:`dask.array.store`) if lazy. + +When data is exchanged with supported data analysis packages (i.e. Iris or Xarray, so +far), these arrays are transferred directly without copying or making duplicates (such +as numpy views). +This is a core principle (see :ref:`design-principles`), but may require special support in +those packages. + +See also : :ref:`data-types` + +:class:`~ncdata.NcAttribute` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Represents an attribute, with name and value. 
The value is always either a scalar +or a 1-D numpy array -- this is enforced as a computed property (read and write). + +.. _attribute-dtypes: + +Attribute Values +"""""""""""""""" +In actual netCDF data, the value of an attribute is effectively limited to a one-dimensional +array of certain valid netCDF types, and one-element arrays are exactly equivalent to scalar values. + +In ncdata, the ``.value`` of an :class:`ncdata.NcAttribute` must always be a numpy array, and +when creating one the provided ``.value`` is cast with :func:`numpy.asanyarray`. + +However you are not prevented from setting an attributes ``.value`` to something other than +an array, which may cause an error. So for now, if setting the value of an existing attribute, +ensure you always write compatible numpy data, or use :meth:`ncdata.NameMap.set_attrval` which is safe. + +For *reading* attributes, it is best to use :meth:`ncdata.NameMap.get_attrval` or (equivalently) +:meth:`ncdata.NcAttribute.as_python_value()` : These consistently return either +``None`` (if missing); a numpy scalar; or array; or a Python string. Those results are +intended to be equivalent to what you should get from storing in an actual file and reading back, +including re-interpreting a length-one vector as a scalar value. + +.. attention:: + The correct handling and (future) discrimination of string data as character arrays ("char" in netCDF terms) + and/or variable-length strings ("string" type) is still to be determined. + + For now, we are converting **all** string attributes to python strings. + + There is **also** a longstanding known problem with the low-level C (and FORTRAN) interface, which forbids the + creation of vector character attributes, which appear as single concatenated strings. So for now, **all** + string-type attributes appear as single Python strings (you never get an array of strings or list of strings). + +See also : :ref:`data-types` + +.. 
_correctness-checks: + +Correctness and Consistency +--------------------------- +In practice, to support flexibility in construction and manipulation, it is +not practical for ncdata structures to represent valid netCDF at +all times, since this would makes changing things awkward. +For example, if a group refers to a dimension *outside* the group, you could not simply +extract it from the dataset because it is not valid in isolation. + +Thus, we do allow that ncdata structures represent *invalid* netCDF data. +For example, circular references, missing dimensions or naming mismatches. +Effectively there are a set of data validity rules, which are summarised in the +:func:`ncdata.utils.save_errors` routine. + +In practice, there a minimal set of runtime rules for creating ncdata objects, and +additional requirements when ncdata is converted to actual netCDF. For example, +variables can be initially created with no data. But if subsequently written to a file, +data must be assigned first. + +.. Note:: + These issues are not necessarily all fully resolved. Caution required ! + +.. _components-and-containers: + +Components, Containers and Names +-------------------------------- +Each dimension, variable, attribute or group normally exists as a component in a +parent dataset (or group), where it is stored in a "container" property of the parent, +i.e. either its ``.dimensions``, ``.variables``, ``.attributes`` or ``.groups``. + +Each of the "container" properties is a :class:`~ncdata._core.NameMap` object, which +is a dictionary type mapping a string (name) to a specific type of components. +The dictionary``.keys()`` are a sequence of component names, and its ``.values()`` are +the corresponding contained components. + +Every component object also has a ``.name`` property. By this, it is implicit that you +**could** have a difference between the name by which the object is indexed in its +container, and its ``.name``. This is to be avoided ! 
+ +The :meth:`~ncdata.NameMap` container class is provided with convenience methods which +aim to make this easier, such as :meth:`~ncdata.NameMap.add` and +:meth:`~ncdata.NameMap.rename`. + +NcData and NcVariable ".attributes" components +---------------------------------------------- +Note that the contents of a ".attributes" are :class:`~ncdata.NcAttributes` objects, +not attribute values. + +Thus to fetch an attribute you might write, for example one of these : + +.. code-block:: + + units1 = dataset.variables['var1'].get_attrval('units') + units1 = dataset.variables['var1'].attributes['units'].as_python_value() + +but **not** ``unit = dataset.variables['x'].attributes['attr1']`` + +And not ``unit = dataset.variables['x'].attributes['attr1']`` + +Or, likewise, to ***set*** values, one of + +.. code-block:: + + dataset.variables['var1'].set_attrval('units', "K") + dataset.variables['var1'].attributes['units'] = NcAttribute("units", K) + +but **not** ``dataset.variables['x'].attributes['units'].value = "K"`` + + +Container ordering +------------------ +The order of elements of a container is technically significant, and does constitute a +potential difference between datasets (or files). + +The :meth:`ncdata.NameMap.rename` method preserves the order of an element, +while :meth:`ncdata.NameMap.add` adds the new components at the end. + +The :func:`ncdata.utils.dataset_differences` utility provides various keywords allowing +you to ignore ordering in comparisons, when required. + + +Container methods +----------------- +The :class:`~ncdata.NameMap` class also provides a variety of manipulation methods, +both normal dictionary operations and some extra ones. + +The most notable ones are : ``del``, ``pop``, ``add``, ``addall``, ``rename`` and of +course ``__setitem__`` . + +See :ref:`common_operations` section. + +.. 
_data-constructors: + +Core Object Constructors +------------------------ +The ``__init__`` methods of the core classes are designed to make in-line definition of +new objects in user code reasonably legible. So, when initialising one of the container +properties, the keyword/args defining component parts use the utility method +:meth:`ncdata.NameMap.from_items` so that you can specify a group of components in a variety of ways : +either a pre-created container or a similar dictionary-like object : + +.. code-block:: python + + >>> ds1 = NcData(groups={ + ... 'x':NcData('x'), + ... 'y':NcData('y') + ... }) + >>> print(ds1) + + groups: + + + > + +or **more usefully**, just a *list* of suitable data objects, like this... + +.. code-block:: python + + >>> ds2 = NcData( + ... variables=[ + ... NcVariable('v1', ('x',), data=[1,2]), + ... NcVariable('v2', ('x',), data=[2,3]) + ... ] + ... ) + >>> print(ds2) + + variables: + + + > + +Or, in the **special case of attributes**, a regular dictionary of ``name: value`` form +will be automatically converted to a NameMap of ``name: NcAttribute(name: value)`` : + +.. code-block:: python + + >>> var = NcVariable( + ... 'v3', + ... attributes={'x':'this', 'b':1.4, 'arr': [1, 2, 3]} + ... ) + >>> print(var) + ): v3() + v3:x = 'this' + v3:b = 1.4, + v3:arr = array([1, 2, 3]) + > + + +Relationship to File Storage +---------------------------- +Note that file-specific storage aspects, such as chunking, data-paths or compression +strategies, are not recorded in the core objects. However, array representations in +variable and attribute data (notably dask lazy arrays) may hold such information. +The concept of "unlimited" dimensions is arguably an exception. However, this is a +core provision in the NetCDF data model itself (see "Dimension" in the `NetCDF Classic Data Model`_). + +.. 
_NetCDF Classic Data Model: https://docs.unidata.ucar.edu/netcdf-c/current/netcdf_data_model.html#classic_model diff --git a/docs/userdocs/user_guide/design_principles.rst b/docs/userdocs/user_guide/design_principles.rst index 657483a..3ac2e19 100644 --- a/docs/userdocs/user_guide/design_principles.rst +++ b/docs/userdocs/user_guide/design_principles.rst @@ -11,6 +11,7 @@ Purpose * allow analysis packages (Iris, Xarray) to exchange data efficiently, including lazy data operations and streaming +.. _design-principles: Design Principles ----------------- diff --git a/docs/userdocs/user_guide/general_topics.rst b/docs/userdocs/user_guide/general_topics.rst new file mode 100644 index 0000000..661278a --- /dev/null +++ b/docs/userdocs/user_guide/general_topics.rst @@ -0,0 +1,200 @@ +.. _common_operations: + +Common Operations +================= +A group of common operations are available on all the core component types, +i.e. the operations of extract/remove/insert/rename/copy on the ``.datasets``, ``.groups``, +``.dimensions``, ``.variables`` and ``.attributes`` properties of the core objects. + +Most of these are hopoefully "obvious" Pythonic methods of the container objects. + +Extract and Remove +------------------ +These are implemented as :meth:`~ncdata.NameMap.__delitem__` and :meth:`~ncdata.NameMap.pop` +methods, which work in the usual way. + +Examples : + +* ``var_x = dataset.variables.pop("x")`` +* ``del data.variables["x"]`` + +Insert / Add +------------ +A new content (component) can be added under its own name with the +:meth:`~ncdata.NameMap.add` method. + +Example : ``dataset.variables.add(NcVariable("x", dimensions=["x"], data=my_data))`` + +An :meth:`~ncdata.NcAttribute` can also be added or set (if already present) with the special +:meth:`~ncdata.NameMap.set_attrval` method. + +Example : ``dataset.variables["x"].set_attrval["units", "m s-1")`` + +Rename +------ +A component can be renamed with the :meth:`~ncdata.NameMap.rename` method. 
This changes
+both the name in the container **and** the component's own name -- it is not recommended
+ever to set ``component.name`` directly, as this obviously can become inconsistent.
+
+Example : ``dataset.variables.rename("x", "y")``
+
+.. warning::
+    Renaming a dimension will not rename references to it (i.e. in variables), which
+    obviously may cause problems.
+    We may add a utility to do this safely in future.
+
+Copy
+----
+All core objects support a ``.copy()`` method, which however does not copy array content
+(e.g. variable data or attribute arrays). See for instance :meth:`ncdata.NcData.copy`.
+
+There is also a utility function :func:`ncdata.utils.ncdata_copy`, which is effectively
+the same as the NcData object copy.
+
+
+Creation
+--------
+The constructors should allow reasonably readable inline creation of data.
+See here : :ref:`data-constructors`
+
+Ncdata is deliberately not very fussy about 'correctness', since it is not tied to an actual
+dataset which must "make sense". See : :ref:`correctness-checks` .
+
+Hence, there is no great need to install things in the 'right' order (e.g. dimensions
+before variables which need them). You can create objects in one go, like this :
+
+.. code-block::
+
+    data = NcData(
+        dimensions=[
+            NcDimension("y", 2),
+            NcDimension("x", 3),
+        ],
+        variables=[
+            NcVariable("y", dimensions=["y"], data=[10, 11]),
+            NcVariable("x", dimensions=["x"], data=[20, 21, 22]),
+            NcVariable("dd", dimensions=["y", "x"], data=[[0, 1, 2], [3, 4, 5]])
+        ]
+    )
+
+
+or iteratively, like this :
+
+..
code-block::
+
+    data = NcData()
+    dims = [("y", 2), ("x", 3)]
+    data.variables.addall([
+        NcVariable(nn, dimensions=[nn], data=np.arange(ll))
+        for nn, ll in dims
+    ])
+    data.variables.add(
+        NcVariable("dd", dimensions=["y", "x"],
+                   data=np.arange(6).reshape(2,3))
+    )
+    data.dimensions.addall([NcDimension(nn, ll) for nn, ll in dims])
+
+Note : here, the variables were created before the dimensions, which is allowed.
+
+
+Equality Checks
+---------------
+We provide a simple ``==`` check for all the core objects but this can be very costly,
+at least for variables, because it will check all the data, even in lazy arrays (!).
+
+You can use :func:`ncdata.utils.dataset_differences` for much more nuanced and controllable
+checking.
+
+
+Validity Checking
+-----------------
+See : :ref:`correctness-checks`
+
+General Topics
+==============
+Assorted discussion topics.
+
+.. _data-types:
+
+Data Types (dtypes)
+-------------------
+:ref:`Variable data <variable-dtypes>` and :ref:`attribute values <attribute-dtypes>`
+all use a subset of numpy **dtypes**, compatible with netcdf datatypes.
+These are effectively those defined by `netcdf4-python <https://unidata.github.io/netcdf4-python/>`_, and this
+therefore also effectively determines what we see in `dask arrays <dask-array_>`_ .
+
+However, at present ncdata directly supports only the `NetCDF Classic Data Model`_ (plus groups,
+see : :ref:`data-model`).
+So, this does **not** include the user-defined, enumerated or variable-length datatypes.
+
+.. attention::
+
+    In practice, we have found that at least variables of the variable-length "string" datatype do seem to function
+    correctly at present, but this is not officially supported, and not currently tested.
+
+    We hope to extend support to the more general `NetCDF Enhanced Data Model`_ in future.
+
+As of January 2025, there is no explicit support for these extended data types.
+
+.. _NetCDF Classic Data Model: https://docs.unidata.ucar.edu/netcdf-c/current/netcdf_data_model.html#classic_model
+
+.. _NetCDF Enhanced Data Model: https://docs.unidata.ucar.edu/netcdf-c/current/netcdf_data_model.html#enhanced_model
+
+
+..
_character-data: + +Character Data +-------------- +NetCDF can can contain string and character data in at least 3 different contexts : + +1. in variable data arrays +2. in attribute values +3. in names of components (i.e. dimensions / variables / attributes / groups ) + +The first case (3.) is, effectively, quite separate. +Since NetCDF version 4, the names of items within files are fully unicode compliant and can +use virtually ***any*** characters, with the exception of the forward slash "/" +( since in some technical cases a component name needs to specified as a "path-like" compound ) + +.. _thread-safety: + +Thread Safety +------------- +In short, it turns out that thread safety can be an issue whenever "lazy" data is being read, which occurs whenever +data is being plotted, calculated or written to a new output file. + +Whenever data is being "computed" (in Dask terms : see `Dask compute `_), that was loaded using more than +one of the Iris, Xarray and ncdata.netcdf4 packages, then :mod:`ncdata.threadlock_sharing` must be used to avoid +possible errors. + +A Fuller Explanation.. +^^^^^^^^^^^^^^^^^^^^^^ +In practice, Iris, Xarray and Ncdata are all capable of scanning netCDF files and interpreting +their metadata, while **not** reading all the core variable data contained in them. + +The file load generates `Dask.arrray `_ objects representing sections of +variable data for calculation on later request, with certain key benefits : + +1. no data loading or calculation happens until needed +2. the work is divided into sectional 'tasks', of which only some may ultimately be needed +3. it may be possible to perform multiple sections of calculation (including data fetch) in parallel +4. it may be possible to localise operations (fetch or calculate) near to data distributed across a cluster + +Usually, the most efficient parallelisation of array operations is by multi-threading, +since that can use memory sharing of large data arrays in memory. 
However, the python netCDF4 library is **not threadsafe**, +therefore the "netcdf fetch" call in each input operation must be guarded by a mutex. + +So Xarray, Iris and ncdata all create data objects with Dask arrays, which reference input data chunks fetching sections +of the input files. Each of those uses a mutex to stop it accessing the netCDF4 interface at the same time as +any of the others. + +This works beautifully **until** ncdata connects lazy data loaded with Iris (say) with lazy data loaded from Xarray, +which unfortunately are using their own *separate* mutexes to protect the *same* netcdf library. Then, when we attempt +to calculate or save this result, we may get sporadic and unpredictable system-level errors, even a core-dump. + +So, the function of :mod:`ncdata.threadlock_sharing` is to **connect** the thread-locking schemes of the separate libraries, +so that they cannot accidentally overlap an access call from the other package in a different thread, +just as they already cannot overlap one of their own. + +.. _dask-array: https://docs.dask.org/en/stable/array.html +.. _dask-compute: https://docs.dask.org/en/latest/generated/dask.array.Array.compute.html \ No newline at end of file diff --git a/docs/userdocs/user_guide/howtos.rst b/docs/userdocs/user_guide/howtos.rst new file mode 100644 index 0000000..f58a700 --- /dev/null +++ b/docs/userdocs/user_guide/howtos.rst @@ -0,0 +1,525 @@ +How-To Questions +================ +Short goal-focussed descriptions of how to achieve specific things. +These are mostly presented as example code snippets, but also link to other +documentation to describe concepts and technical details. + +**"Why Not Just..."** sections highlight warnings for what *not* to do, +i.e. wrong turns and gotchas, with brief descriptions of why. + + +.. _howto_access: + +Access a data object +-------------------- +Index by component names to get the object which represents a particular element. + +.. 
code-block:: python + + >>> dataset.attributes["experiment"] + NcAttribute("'experiment', 'A301.7') + >>> dataset.dimensions["x"] + NcDimension('x', 3) + >>> dataset.variables['vx'].attributes['units'] + NcAttribute("'unit', 'm s-1') + +Variable, attributes, dimensions and sub-groups are all stored by name like this, +in a parent property which is a "component container" dictionary. + +.. Warning:: + + The :attr:`~ncdata.NcVariable.dimensions` property of a :class:`~ncdata.NcVariable` + is different : it is *not* a dictionary of :class:`~ncdata.NcDimension` objects, + but just a *list of dimension names*. + + +.. _howto_add_something: + +Add a data object +----------------- +Use the :meth:`~ncdata.NameMap.add` method of a component-container property to insert +a new item. + + >>> data.dimensions.add(NcDimension("y", 4)) + >>> data.dimensions + {'x': NcDimension('x', 3) 'y': NcDimension('y', 3)} + +The item must be of the correct type, in this case a :class:`~ncdata.NcDimension`. +If not, an error will be raised. + +.. Warning:: + + **Why Not Just...** ``data.dimensions["y"] = NcDimension("y", 4)`` ? + + This does actually work, but the user must ensure that the dictionary key always + matches the name of the component added. Using :meth:`~ncdata.NameMap.add` is thus + safe, and actually *simpler*, since all components have a definite name anyway. + + +.. _howto_remove_something: + +Remove a data object +-------------------- +The standard Python ``del`` operator can be applied to a component property to remove +something by its name. + + >>> data.dimensions + {'x': NcDimension('x', 3) 'y': NcDimension('y', 3)} + >>> del data.dimensions['x'] + >>> data.dimensions + {'y': NcDimension('y', 3)} + + +.. _howto_rename_something: + +Rename a data object +-------------------- +Use the :meth:`~ncdata.NameMap.rename` method to rename a component. + +.. 
code-block::
+
+    >>> data.dimensions
+    {'x': NcDimension('x', 3), 'y': NcDimension('y', 3)}
+    >>> data.dimensions.rename('x', 'q')
+    >>> data.dimensions
+    {'q': NcDimension('q', 3), 'y': NcDimension('y', 3)}
+
+Note that this affects both the element's container key *and* its ``.name``.
+
+
+.. Warning::
+
+    Renaming a **dimension** can cause problems, so must be done with care.
+    See :ref:`howto_rename_dimension`.
+
+.. Warning::
+
+    **Why Not Just...** ``dim = data.dimensions['x']; dim.name = "q"`` ?
+
+    This would break the expected ``key == elements[key].name`` rule.
+    We don't prevent this, but it is usually a mistake.
+    :func:`~ncdata.utils.save_errors` detects this type of problem.
+
+
+.. _howto_rename_dimension:
+
+Rename a dimension
+------------------
+Simply using ``ncdata.dimensions.rename()`` can cause problems, because you must then
+**also** replace the name where it occurs in the dimensions of any variables.
+
+.. Note::
+
+    **To-Do** : there should be a utility for this, but as yet it does not exist.
+    See `Issue#87 <https://github.com/pp-mo/ncdata/issues/87>`_.
+
+
+.. _howto_read_attr:
+
+Read an attribute value
+-----------------------
+To get an attribute of a dataset, group or variable, use the
+:meth:`ncdata.NcData.get_attrval` or :meth:`ncdata.NcVariable.get_attrval`
+method, which returns either a single (scalar) number, a numeric array, or a string.
+
+.. code-block:: python
+
+    >>> variable.get_attrval("x")
+    3.0
+    >>> dataset.get_attrval("context")
+    "Results from experiment A301.7"
+    >>> dataset.variables["q"].get_attrval("level_settings")
+    [1.0, 2.5, 3.7]
+
+**Given an isolated** :class:`ncdata.NcAttribute` **instance** :
+
+Its value is best read with the :meth:`ncdata.NcAttribute.as_python_value` method,
+which produces the same results as the above.
+
+    >>> variable.attributes[myname].as_python_value()
+    3.0
+
+.. Note::
+
+    **Why Not Just...** use ``NcAttribute.value`` ?
+
+    For example
+
+    ..
code-block:: python + + >>> data.variables["x"].attributes["q"].value + [1] + + The ``.value`` is always stored as a :class:`~numpy.ndarray` array, but this is not + how it is stored in netCDF. The ``get_python_value()`` returns the attribute + as a straightforward value, compatible with what is seen in ``ncdump`` output, + and results from the ``netCDF4`` module. + + +.. _howto_write_attr: + +Change an attribute value +------------------------- +To set an attribute of a dataset, group or variable, use the +:meth:`ncdata.NcData.set_attrval` or :meth:`ncdata.NcVariable.set_attrval` method. + +All attributes are writeable, and the type can be freely changed. + +.. code-block:: python + + >>> variable.set_attr("x", 3.) + >>> variable.get_attr("x") + 3.0 + >>> variable.set_attr("x", "string-value") + >>> variable.get_attr("x") + "string-value" + +.. Note:: + + **Why Not Just...** set ``NcAttribute.value`` directly ? + + For example + + .. code-block:: python + + >>> data.variables["x"].attributes["q"].value = 4.2 + + This is generally unwise, because the ``.value`` should always be a numpy + :class:`~numpy.ndarray` array, with a suitable ``dtype``, but the + :class:`~ncdata.Ncattribute` type does not currently enforce this. + The ``set_attrval`` method both converts for convenience, and ensures that the + value is stored in a valid form. + + +.. _howto_create_attr: + +Create an attribute +------------------- +To create an attribute on a dataset, group or variable, just set its value with the +:meth:`ncdata.NcData.set_attrval` or :meth:`ncdata.NcVariable.set_attrval` method. +This works just like :ref:`howto_write_attr` : i.e. it makes no difference whether the +attribute already exists or not. + +.. code-block:: python + + >>> variable.set_attr("x", 3.) + +.. Note:: + + Assigning attributes when *creating* a dataset, variable or group is somewhat + simpler, discussed :ref:`here `. + + +.. 
_howto_create_variable: + +Create a variable +----------------- +Use the :meth:`NcVariable() ` constructor to create a new +variable with a name, dimensions, and optional data and attributes. + +A minimal example: + +.. code-block:: python + + >>> var = NcVariable("data", ("x_axis",)) + >>> print(var) + ): data(x_axis)> + >>> print(var.data) + None + >>> + +A more rounded example, including a data array: + +.. code-block:: python + + >>> var = NcVariable("vyx", ("y", "x"), + ... data=[[1, 2, 3], [0, 1, 1]], + ... attributes=[NcAttribute('a', 1), NcAttribute('b', 'setting=off')] + ... ) + >>> print(var) + + >>> print(var.data) + [[1 2 3] + [0 1 1]] + >>> + + + +.. _howto_access_vardata: + +Read or write variable data +--------------------------- +The :attr:`~ncdata.NcVariable.data` property of a :class:`~ncdata.NcVariable` usually +holds a data array. + +.. code-block:: python + + >>> var.data = np.array([1, 2]) + >>> print(var.data) + +This may be either a :class:`numpy.ndarray` (real) or a :class:`dask.array.Array` +(lazy) array. If the data is converted from another source (file, iris or xarray), +it is usually lazy. + +It can be freely overwritten by the user. + +.. Warning:: + + If not ``None``, the ``.data`` should **always** be an array of the correct shape. + + The :func:`~ncdata.utils.save_errors` function checks that all variables have + valid dimensions, and that ``.data`` arrays match the dimensions. + + + +Save data to a new file +----------------------- +Use the :func:`ncdata.netcdf4.to_nc4` function to write data to a file: + +.. code-block:: python + + >>> from ncdata.netcdf4 import to_nc4 + >>> to_nc4(data, filepath) + + +Read from or write to Iris cubes +-------------------------------- +Use :func:`ncdata.iris.to_iris` and :func:`ncdata.iris.from_iris`. + +.. 
code-block:: python + + >>> from ncdata.iris import from_iris, to_iris + >>> cubes = iris.load(file) + >>> ncdata = from_iris(cubes) + >>> + >>> cubes2 = to_iris(ncdata) + +Note that: + +* :func:`ncdata.iris.to_iris` calls :func:`iris.load` +* :func:`ncdata.iris.from_iris` calls :func:`iris.save` + +Extra kwargs are passed on to the iris load/save routine. + +Since an :class:`~ncdata.NcData` is like a complete file, or dataset, it is written to +or read from multiple cubes, in a :class:`~iris.cube.CubeList`. + + +Read from or write to Xarray datasets +------------------------------------- +Use :func:`ncdata.xarray.to_xarray` and :func:`ncdata.xarray.from_xarray`. + +.. code-block:: python + + >>> from ncdata.xarray import from_xarray, to_xarray + >>> dataset = xarray.open_dataset(filepath) + >>> ncdata = from_xarray(dataset) + >>> + >>> ds2 = to_xarray(ncdata) + +Note that: + +* :func:`ncdata.xarray.to_xarray` calls :func:`xarray.Dataset.load_store`. + +* :func:`ncdata.xarray.from_xarray` calls :func:`xarray.Dataset.dump_to_store` + +Any additional kwargs are passed on to the xarray load/save routine. + +An NcData writes or reads as an :class:`xarray.Dataset`. + + + +Convert data directly from Iris to Xarray, or vice versa +-------------------------------------------------------- +Use :func:`ncdata.iris_xarray.cubes_to_xarray` and +:func:`ncdata.iris_xarray.cubes_from_xarray`. + +.. code-block:: python + + >>> from ncdata.iris_xarray import cubes_from_xarray, cubes_to_xarray + >>> cubes = iris.load(filepath) + >>> dataset = cubes_to_xarray(cubes) + >>> + >>> cubes2 = cubes_from_xarray(dataset) + +These functions are simply a convenient shorthand for combined use of +:func:`ncdata.xarray.from_xarray` then :func:`ncdata.iris.to_iris`, +or :func:`ncdata.iris.from_iris` then :func:`ncdata.xarray.to_xarray`. + +Extra keyword controls for the relevant iris and xarray load and save routines can be +passed using specific dictionary keywords, e.g. + +.. 
code-block:: python + + >>> cubes = cubes_from_xarray( + ... dataset, + ... iris_load_kwargs={'constraints': 'air_temperature'}, + ... xr_save_kwargs={'unlimited_dims': ('time',)}, + ... ) + ... + +Combine data from different input files into one output +------------------------------------------------------- +This can be + + +Create a brand-new dataset +-------------------------- +Use the :meth:`NcData() <~ncdata.NcData.__init__>` constructor to create a new dataset. + +Contents and components can be attached on creation ... + +.. code-block:: python + + >>> data = NcData( + >>> dimensions=[NcDimension("y", 2), NcDimension("x", 3)], + >>> variables=[ + >>> NcVariable("y", ("y",), data=[0, 1]), + >>> NcVariable("x", ("x",), data=[0, 1, 2]), + >>> NcVariable( + >>> "vyx", ("y", "x"), + >>> data=np.zeros((2, 3)), + >>> attributes=[ + >>> NcAttribute("long_name", "rate"), + >>> NcAttribute("units", "m s-1") + >>> ] + >>> )], + >>> attributes=[NcAttribute("history", "imaginary")]) + ... + >>> print(data) + + dimensions: + y = 2 + x = 3 + + variables: + + ... + +... or added iteratively ... + +.. 
code-block:: python + + >>> data = NcData() + >>> ny, nx = 2, 3 + >>> data.dimensions.add(NcDimension("y", ny)) + >>> data.dimensions.add(NcDimension("x", nx)) + >>> data.variables.add(NcVariable("y", ("y",))) + >>> data.variables.add(NcVariable("x", ("x",))) + >>> data.variables.add(NcVariable("vyx", ("y", "x"))) + >>> vx, vy, vyx = [data.variables[k] for k in ("x", "y", "vyx")] + >>> vx.data = np.arange(nx) + >>> vy.data = np.arange(ny) + >>> vyx.data = np.zeros((ny, nx)) + >>> vyx.set_attrval("long_name", "rate") + >>> vyx.set_attrval("units", "m s-1") + >>> data.set_attrval("history", "imaginary") + + +Remove or rewrite specific attributes +------------------------------------- + + +Save selected variables to a new file +------------------------------------- +Load input with :func:`ncdata.netcdf4.from_nc4`; use :meth:`ncdata.NameMap.add` to add +selected elements into a new :class:`ncdata.NcData`, and then save it +with :func:`ncdata.netcdf4.to_nc4`. + +For a simple case with no groups, it could look something like this: + +.. code-block:: python + + >>> input = from_nc4(input_filepath) + >>> output = NcData() + >>> for varname in ('data1', 'data2', 'dimx', 'dimy'): + >>> var = input.variables[varname] + >>> output.variables.add(var) + >>> for dimname in var.dimensions: + >>> if dimname not in output.dimensions: + >>> output.dimensions.add(input.dimensions[dimname]) + ... + >>> to_nc4(output, output_filepath) + +Sometimes it's simpler to load the input, delete content **not** wanted, then re-save. +It's perfectly safe to do that, since the original file will be unaffected. + +.. code-block:: python + + >>> data = from_nc4(input_filepath) + >>> for name in ('extra1', 'extra2', 'unwanted'): + >>> del data.variables[name] + ... 
+ >>> del data.dimensions['pressure'] + >>> to_nc4(data, output_filepath) + + +Adjust file content before loading into Iris/Xarray +--------------------------------------------------- +Use :func:`~ncdata.netcdf4.from_nc4`, and then :func:`~ncdata.iris.to_iris` or +:func:`~ncdata.xarray.to_xarray`. You can thus adjust file content at the file level, +to avoid loading problems. + +For example, to replace an invalid coordinate name in iris input : + +.. code-block:: python + + >>> from ncdata.netcdf4 import from_nc4 + >>> from ncdata.iris import to_iris + >>> ncdata = from_nc4(input_filepath) + >>> for var in ncdata.variables.values(): + >>> coords = var.get_attrval('coordinates') or "" + >>> if "old_varname" in coords: + >>> coords = coords.replace("old_varname", "new_varname") + >>> var.set_attrval("coordinates", coords) + ... + >>> cubes = to_iris(ncdata) + +or, to replace a mis-used special attribute in xarray input : + +.. code-block:: python + + >>> from ncdata.netcdf4 import from_nc4 + >>> from ncdata.xarray import to_xarray + >>> ncdata = from_nc4(input_filepath) + >>> for var in ncdata.variables.values(): + >>> if "_fillvalue" in var.attributes: + >>> var.attributes.rename("_fillvalue", "_FillValue") + ... + >>> dataset = to_xarray(ncdata) + + +Adjust Iris/Xarray save output before writing to a file +------------------------------------------------------- +Use :func:`~ncdata.iris.from_iris` or :func:`~ncdata.xarray.from_xarray`, and then +:func:`~ncdata.netcdf4.to_nc4`. You can thus make changes to the saved output which +would be difficult to overcome if first written to an actual file. + +For example, to force an additional unlimited dimension in iris output : + +.. code-block:: python + + >>> from ncdata.iris import from_iris + >>> from ncdata.netcdf4 import to_nc4 + >>> ncdata = from_iris(cubes) + >>> ncdata.dimensions['timestep'].unlimited = True + >>> to_nc4(ncdata, "output.nc") + +or, to convert xarray data variable output to masked integers : + +.. 
code-block:: python + + >>> import numpy as np + >>> from ncdata.xarray import from_xarray + >>> from ncdata.netcdf4 import to_nc4 + >>> ncdata = from_xarray(dataset) + >>> var = ncdata.variables['experiment'] + >>> mask = np.isnan(var.data) + >>> data = var.data.astype(np.int16) + >>> data[mask] = -9999 + >>> var.data = data + >>> var.set_attrval("_FillValue", -9999) + >>> to_nc4(ncdata, "output.nc") + diff --git a/docs/userdocs/user_guide/known_issues.rst b/docs/userdocs/user_guide/known_issues.rst index 12c8b1e..8430e61 100644 --- a/docs/userdocs/user_guide/known_issues.rst +++ b/docs/userdocs/user_guide/known_issues.rst @@ -22,6 +22,13 @@ To be fixed * `issue#66 `_ +.. _todo: + +Incomplete Documentation +^^^^^^^^^^^^^^^^^^^^^^^^ +(PLACEHOLDER: documentation is incomplete, please fix me !) + + Identified Design Limitations ----------------------------- diff --git a/docs/userdocs/user_guide/user_guide.rst b/docs/userdocs/user_guide/user_guide.rst index 9b0b938..65500da 100644 --- a/docs/userdocs/user_guide/user_guide.rst +++ b/docs/userdocs/user_guide/user_guide.rst @@ -18,8 +18,8 @@ For the present, please see the following : :maxdepth: 2 design_principles - (TODO : empty) Data object descriptions - (TODO : empty) General topics - (TODO : empty) How-tos + data_objects + general_topics + howtos known_issues ../../change_log From 5e815437b6bc9d66b42568c0c36ee90d690fb5a0 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Thu, 16 Jan 2025 17:04:28 +0000 Subject: [PATCH 02/25] Lots more improvements + move sections. 
--- docs/change_log.rst | 10 +- docs/details/details_index.rst | 4 + docs/details/interface_support.rst | 14 +- .../user_guide => details}/known_issues.rst | 2 +- docs/details/threadlock_sharing.rst | 60 +++-- docs/index.rst | 5 +- .../userdocs/user_guide/common_operations.rst | 116 ++++++++++ docs/userdocs/user_guide/data_objects.rst | 6 +- docs/userdocs/user_guide/general_topics.rst | 215 +++++------------- docs/userdocs/user_guide/howtos.rst | 29 +++ docs/userdocs/user_guide/user_guide.rst | 16 +- 11 files changed, 271 insertions(+), 206 deletions(-) rename docs/{userdocs/user_guide => details}/known_issues.rst (95%) create mode 100644 docs/userdocs/user_guide/common_operations.rst diff --git a/docs/change_log.rst b/docs/change_log.rst index 2080abf..e846bda 100644 --- a/docs/change_log.rst +++ b/docs/change_log.rst @@ -1,22 +1,22 @@ Versions and Change Notes ========================= -Project Status -------------- +Project Development Status +-------------------------- We intend to follow `PEP 440 `_, or (older) `SemVer `_ versioning principles. This means the version string has the basic form **"major.minor.bugfix[special-types]"**. -Current release version is at **"v0.1"**. +Current release version is at **"v0.2"**. -This is a first complete implementation, -with functional operational of all public APIs. +This is a complete implementation, with functional operation of all public APIs. The code is however still experimental, and APIs are not stable (hence no major version yet). Change Notes ------------ +Summary of key features by release number Unreleased ^^^^^^^^^^ diff --git a/docs/details/details_index.rst b/docs/details/details_index.rst index c3864b3..3d443cb 100644 --- a/docs/details/details_index.rst +++ b/docs/details/details_index.rst @@ -1,8 +1,12 @@ Detail Topics ============= +Detail reference topics + .. 
toctree:: :maxdepth: 2 + ../change_log + ./known_issues ./interface_support ./threadlock_sharing ./developer_notes diff --git a/docs/details/interface_support.rst b/docs/details/interface_support.rst index f2fedcc..f1cd53d 100644 --- a/docs/details/interface_support.rst +++ b/docs/details/interface_support.rst @@ -35,6 +35,7 @@ array has the actual variable dtype, and the "scale_factor" and The existence of a "_FillValue" attribute controls how.. TODO +.. _file-storage: File storage control ^^^^^^^^^^^^^^^^^^^^ @@ -44,13 +45,21 @@ control the data compression and translation facilities of the NetCDF file library. If required, you should use :mod:`iris` or :mod:`xarray` for this. +Although file-specific storage aspects, such as chunking, data-paths or compression +strategies, are not recorded in the core objects. However, array representations in +variable and attribute data (notably dask lazy arrays) may hold such information. + +The concept of "unlimited" dimensions is also, arguably an exception. However, this is a +core provision in the NetCDF data model itself (see "Dimension" in the `NetCDF Classic Data Model`_). + Dask chunking control ^^^^^^^^^^^^^^^^^^^^^ Loading from netcdf files generates variables whose data arrays are all Dask lazy arrays. These are created with the "chunks='auto'" setting. -There is currently no control for this : If required, load via Iris or Xarray -instead. + +There is simple user override API available to control this on a per-dimension basis. +See :func:`ncdata.netcdf4.from_nc4`. 
Xarray Compatibility @@ -94,3 +103,4 @@ see : `support added in v3.7.0 `_ with deferred access +to bulk file data for later access, with certain key benefits : + +* no data loading or calculation happens until needed +* the work is divided into sectional ‘tasks’, of which only some may ultimately be needed +* it may be possible to perform multiple sections of calculation (including data fetch) in parallel +* it may be possible to localise operations (fetch or calculate) near to data distributed across a cluster + +Usually, the most efficient parallelisation of array operations is by multi-threading, since that can use memory +sharing of large data arrays in memory. + +However, the python netCDF4 library (and the underlying C library) is not threadsafe +(re-entrant) by design, neither does it implement any thread locking itself, therefore +the “netcdf fetch” call in each input operation must be guarded by a mutex. +Thus contention is possible unless controlled by the calling packages. + +*Each* of Xarray, Iris and ncdata itself create input data tasks to fetch sections of +the input files. Each uses a mutex lock around netcdf accesses in those tasks, to stop +them accessing the netCDF4 interface at the same time as any of the others. + +This works beautifully until ncdata connects lazy data loaded with Iris (say) with +lazy data loaded from Xarray, which unfortunately are using their own separate mutexes +to protect the same netcdf library. Then, when we attempt to calculate or save this +result, we may get sporadic and unpredictable system-level errors, even a core-dump. + +So, the function of :mod:`ncdata.threadlock_sharing` is to connect the thread-locking +schemes of the separate libraries, so that they cannot accidentally overlap an access +call in a different thread *from the other package*, just as they already cannot +overlap *one of their own*. 
diff --git a/docs/index.rst b/docs/index.rst index 09246a4..c161708 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -38,8 +38,9 @@ User Documentation User Guide <./userdocs/user_guide/user_guide> -Reference ---------- +Reference Documentation +----------------------- + .. toctree:: :maxdepth: 2 diff --git a/docs/userdocs/user_guide/common_operations.rst b/docs/userdocs/user_guide/common_operations.rst new file mode 100644 index 0000000..da465a2 --- /dev/null +++ b/docs/userdocs/user_guide/common_operations.rst @@ -0,0 +1,116 @@ +.. _common_operations: + +Common Operations +================= +A group of common operations are available on all the core component types, +i.e. the operations of extract/remove/insert/rename/copy on the ``.datasets``, ``.groups``, +``.dimensions``, ``.variables`` and ``.attributes`` properties of the core objects. + +Most of these are hopefully "obvious" Pythonic methods of the container objects. + +Extract and Remove +------------------ +These are implemented as :meth:`~ncdata.NameMap.__delitem__` and :meth:`~ncdata.NameMap.pop` +methods, which work in the usual way. + +Examples : + +* ``var_x = dataset.variables.pop("x")`` +* ``del data.variables["x"]`` + +Insert / Add +------------ +A new content (component) can be added under its own name with the +:meth:`~ncdata.NameMap.add` method. + +Example : ``dataset.variables.add(NcVariable("x", dimensions=["x"], data=my_data))`` + +An :class:`~ncdata.NcAttribute` can also be added or set (if already present) with the special +:meth:`~ncdata.NameMap.set_attrval` method. + +Example : ``dataset.variables["x"].set_attrval("units", "m s-1")`` + +Rename +------ +A component can be renamed with the :meth:`~ncdata.NameMap.rename` method. This changes +both the name in the container **and** the component's own name -- it is not recommended +ever to set ``component.name`` directly, as this obviously can become inconsistent. + +Example : ``dataset.variables.rename("x", "y")`` + +.. 
warning:: + Renaming a dimension will not rename references to it (i.e. in variables), which + obviously may cause problems. + We may add a utility to do this safely in future. + +Copying +------- +All core objects support a ``.copy()`` method, which however does not copy array content +(e.g. variable data or attribute arrays). See for instance :meth:`ncdata.NcData.copy`. + +There is also a utility function :func:`ncdata.utils.ncdata_copy`, this is effectively +the same as the NcData object copy. + + +Equality Checking +----------------- +We provide a simple, comprehensive ``==`` check for :class:`~ncdata.NcDimension` and +:class:`~ncdata.NcAttribute` objects, but not at present :class:`~ncdata.NcVariable` or +:class:`~ncdata.NcData`. + +So, using ``==`` on :class:`~ncdata.NcVariable` or :class:`~ncdata.NcData` objects +will only do an identity check -- that is, it tests ``id(A) == id(B)``, or ``A is B``. + +However, these objects **can** be properly compared with the dataset comparison +utilities, :func:`ncdata.utils.dataset_differences` and +:func:`ncdata.utils.variable_differences` : By default, these operations are very +comprehensive and may be very costly, for instance comparing large data arrays, but they +also allow more nuanced and controllable checking, e.g. to skip data array comparisons +or ignore variable ordering. + + +Object Creation +--------------- +The constructors should allow reasonably readable inline creation of data. +See here : :ref:`data-constructors` + +Ncdata is deliberately not very fussy about 'correctness', since it is not tied to an actual +dataset which must "make sense". See : :ref:`correctness-checks` . + +Hence, there is no great need to install things in the 'right' order (e.g. dimensions +before variables which need them). You can create objects in one go, like this : + +.. 
code-block:: + + data = NcData( + dimensions=[ + NcDimension("y", 2), + NcDimension("x", 3), + ], + variables=[ + NcVariable("y", dimensions=["y"], data=[10, 11]), + NcVariable("x", dimensions=["y"], data=[20, 21, 22]), + NcVariable("dd", dimensions=["y", "x"], data=[[0, 1, 2], [3, 4, 5]]) + ] + ) + + +or iteratively, like this : + +.. code-block:: + + data = NcData() + dims = [("y", 2), ("x", 3)] + data.variables.addall([ + NcVariable(nn, dimensions=[nn], data=np.arange(ll)) + for ll, nn in dims + ]) + data.variables.add( + NcVariable("dd", dimensions=["y", "x"], + data=np.arange(6).reshape(2,3)) + ) + data.dimensions.addall([NcDimension(nn, ll) for nn, ll in dims]) + +Note : here, the variables were created before the dimensions + + diff --git a/docs/userdocs/user_guide/data_objects.rst b/docs/userdocs/user_guide/data_objects.rst index 3c937c3..df605e4 100644 --- a/docs/userdocs/user_guide/data_objects.rst +++ b/docs/userdocs/user_guide/data_objects.rst @@ -265,10 +265,6 @@ will be automatically converted to a NameMap of ``name: NcAttribute(name: value) Relationship to File Storage ---------------------------- -Note that file-specific storage aspects, such as chunking, data-paths or compression -strategies, are not recorded in the core objects. However, array representations in -variable and attribute data (notably dask lazy arrays) may hold such information. -The concept of "unlimited" dimensions is arguably an exception. However, this is a -core provision in the NetCDF data model itself (see "Dimension" in the `NetCDF Classic Data Model`_). +See :ref:`file-storage` .. _NetCDF Classic Data Model: https://docs.unidata.ucar.edu/netcdf-c/current/netcdf_data_model.html#classic_model diff --git a/docs/userdocs/user_guide/general_topics.rst b/docs/userdocs/user_guide/general_topics.rst index 661278a..063a3a8 100644 --- a/docs/userdocs/user_guide/general_topics.rst +++ b/docs/userdocs/user_guide/general_topics.rst @@ -1,118 +1,13 @@ -.. 
_common_operations: - -Common Operations -================= -A group of common operations are available on all the core component types, -i.e. the operations of extract/remove/insert/rename/copy on the ``.datasets``, ``.groups``, -``.dimensions``, ``.variables`` and ``.attributes`` properties of the core objects. - -Most of these are hopoefully "obvious" Pythonic methods of the container objects. - -Extract and Remove ------------------- -These are implemented as :meth:`~ncdata.NameMap.__delitem__` and :meth:`~ncdata.NameMap.pop` -methods, which work in the usual way. - -Examples : - -* ``var_x = dataset.variables.pop("x")`` -* ``del data.variables["x"]`` - -Insert / Add ------------- -A new content (component) can be added under its own name with the -:meth:`~ncdata.NameMap.add` method. - -Example : ``dataset.variables.add(NcVariable("x", dimensions=["x"], data=my_data))`` - -An :meth:`~ncdata.NcAttribute` can also be added or set (if already present) with the special -:meth:`~ncdata.NameMap.set_attrval` method. - -Example : ``dataset.variables["x"].set_attrval["units", "m s-1")`` - -Rename ------- -A component can be renamed with the :meth:`~ncdata.NameMap.rename` method. This changes -both the name in the container **and** the component's own name -- it is not recommended -ever to set ``component.name`` directly, as this obviously can become inconsistent. - -Example : ``dataset.variables.rename("x", "y")`` - -.. warning:: - Renaming a dimension will not rename references to it (i.e. in variables), which - obviously may cause problems. - We may add a utility to do this safely this in future. - -Copy ----- -All core objects support a ``.copy()`` method, which however does not copy array content -(e.g. variable data or attribute arrays). See for instance :meth:`ncdata.NcData.copy`. - -There is also a utility function :func:`ncdata.utils.ncdata_copy`, this is effectively -the same as the NcData object copy. 
- - -Creation --------- -The constructors should allow reasonably readable inline creation of data. -See here : :ref:`data-constructors` - -Ncdata is deliberately not very fussy about 'correctness', since it is not tied to an actual -dataset which must "make sense". see : :ref:`correctness-checks` . - -Hence, there is no great need to install things in the 'right' order (e.g. dimensions -before variables which need them). You can create objects in one go, like this : - -.. code-block:: - - data = NcData( - dimensions=[ - NcDimension("y", 2), - NcDimension("x", 3), - ], - variables=[ - NcVariable("y", dimensions=["y"], data=[10, 11]), - NcVariable("x", dimensions=["y"], data=[20, 21, 22]), - NcVariable("dd", dimensions=["y", "x"], data=[[0, 1, 2], [3, 4, 5]]) - ] - ) - - -or iteratively, like this : - -.. code-block:: - - data = NcData() - dims = [("y", 2), ("x", 3)] - data.variables.addall([ - NcVariable(nn, dimensions=[nn], data=np.arange(ll)) - for ll, nn in dims - ]) - data.variables.add( - NcVariable("dd", dimensions=["y", "x"], - data=np.arange(6).reshape(2,3)) - ) - data.dimensions.addall([NcDimension(nn, ll) for nn, ll in dims]) - -Note : here, the variables were created before the dimensions - - -Equality Checks ---------------- -We provide a simple ``==`` check for all the core objects but this can be very costly, -at least for variables, because it will check all the data, even in lazy arrays (!). - -You can use :func:`ncdata.utils.dataset_differences` for much more nuanced and controllable -checking. +.. _general_topics: +General Topics +============== +Odd discussion topics realting to core ncdata classes + data management Validity Checking ----------------- See : :ref:`correctness-checks` -General Topics -============== -Odd discussion topics .. _data-types: @@ -123,8 +18,8 @@ all use a subset of numpy **dtypes**, compatible with netcdf datatypes. 
These are effectively those defined by `netcdf4-python `_, and this therefore also effectively determines what we see in `dask arrays `_ . -However, at present ncdata directly supports only the `NetCDF Classic Data Model`_ (plus groups, -see : :ref:`data-model`). +However, at present ncdata directly supports only the so-called "Primitive Types" of the +`NetCDF Enhanced Data Model`_ : :ref:`data-model`. So, this does ***not*** include the user-defined, enumerated or variable-length datatypes. .. attention:: @@ -134,67 +29,67 @@ So, this does ***not*** include the user-defined, enumerated or variable-length We hope to extend support to the more general `NetCDF Enhanced Data Model`_ in future. -As-of January 2025 there is +For reference, the currently supported and tested datatypes are : + +* unsigned byte = numpy "u1" +* unsigned short = numpy "u2" +* unsigned int = numpy "u4" +* unsigned long = numpy "u8" +* byte = numpy "i1" +* short = numpy "i2" +* int = numpy "i4" +* long = numpy "i8" +* float = numpy "f4" +* double = numpy "f8" +* char = numpy "U1" .. _NetCDF Classic Data Model: https://docs.unidata.ucar.edu/netcdf-c/current/netcdf_data_model.html#classic_model .. _NetCDF Enhanced Data Model: https://docs.unidata.ucar.edu/netcdf-c/current/netcdf_data_model.html#enhanced_model -.. _character-data: +.. _string-and-character-data: -Character Data -------------- +Character and String Data Handling +---------------------------------- NetCDF can can contain string and character data in at least 3 different contexts : -1. in variable data arrays -2. in attribute values -3. in names of components (i.e. dimensions / variables / attributes / groups ) +Characters in Data Component Names +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(i.e. groups, variables, attributes or dimensions) -The first case (3.) is, effectively, quite separate. 
-Since NetCDF version 4, the names of items within files are fully unicode compliant and can -use virtually ***any*** characters, with the exception of the forward slash "/" +Since NetCDF version 4, the names of components within files are fully unicode compliant +and can use virtually ***any*** characters, with the exception of the forward slash "/" ( since in some technical cases a component name needs to specified as a "path-like" compound ) -.. _thread-safety: +Characters in Variable Data +^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Character data in variable *data* arrays are generally stored as fixed-length arrays of +characters (i.e. fixed-width strings), and no unicode interpretation is applied by the +libraries (neither netCDF4 or ncdata). In this case, the strings appear in Python as +numpy character arrays of dtype "`_), that was loaded using more than -one of the Iris, Xarray and ncdata.netcdf4 packages, then :mod:`ncdata.threadlock_sharing` must be used to avoid -possible errors. - -A Fuller Explanation.. -^^^^^^^^^^^^^^^^^^^^^^ -In practice, Iris, Xarray and Ncdata are all capable of scanning netCDF files and interpreting -their metadata, while **not** reading all the core variable data contained in them. - -The file load generates `Dask.arrray `_ objects representing sections of -variable data for calculation on later request, with certain key benefits : - -1. no data loading or calculation happens until needed -2. the work is divided into sectional 'tasks', of which only some may ultimately be needed -3. it may be possible to perform multiple sections of calculation (including data fetch) in parallel -4. it may be possible to localise operations (fetch or calculate) near to data distributed across a cluster - -Usually, the most efficient parallelisation of array operations is by multi-threading, -since that can use memory sharing of large data arrays in memory. 
However, the python netCDF4 library is **not threadsafe**, -therefore the "netcdf fetch" call in each input operation must be guarded by a mutex. - -So Xarray, Iris and ncdata all create data objects with Dask arrays, which reference input data chunks fetching sections -of the input files. Each of those uses a mutex to stop it accessing the netCDF4 interface at the same time as -any of the others. - -This works beautifully **until** ncdata connects lazy data loaded with Iris (say) with lazy data loaded from Xarray, -which unfortunately are using their own *separate* mutexes to protect the *same* netcdf library. Then, when we attempt -to calculate or save this result, we may get sporadic and unpredictable system-level errors, even a core-dump. - -So, the function of :mod:`ncdata.threadlock_sharing` is to **connect** the thread-locking schemes of the separate libraries, -so that they cannot accidentally overlap an access call from the other package in a different thread, -just as they already cannot overlap one of their own. - -.. _dask-array: https://docs.dask.org/en/stable/array.html -.. _dask-compute: https://docs.dask.org/en/latest/generated/dask.array.Array.compute.html \ No newline at end of file +Whenever you combine variable data loaded using more than **one** data-format package +(i.e. at present, Iris and Xarray and Ncdata itself), you can potentially get +multi-threading contention errors in netCDF4 library access. This may result in +problems ranging from sporadic value changes to segmentation faults or other system +errors. + +In these cases you should always use the :mod:`ncdata.threadlock_sharing` module to +avoid such problems. See :ref:`thread-safety`. diff --git a/docs/userdocs/user_guide/howtos.rst b/docs/userdocs/user_guide/howtos.rst index f58a700..87b32f4 100644 --- a/docs/userdocs/user_guide/howtos.rst +++ b/docs/userdocs/user_guide/howtos.rst @@ -274,6 +274,35 @@ It can be freely overwritten by the user. 
valid dimensions, and that ``.data`` arrays match the dimensions. + +Read data from a NetCDF file +---------------------------- +Use the :func:`ncdata.netcdf4.from_nc4` function to load a dataset from a netCDF file. + +.. code-block:: python + + >>> from ncdata.netcdf4 import from_nc4 + >>> ds = from_nc4(filepath) + >>> print(ds) + + + +Control chunking in a netCDF read +--------------------------------- +Use the ``dim_chunks`` argument in the :func:`ncdata.netcdf4.from_nc4` function. + +.. code-block:: python + + >>> from ncdata.netcdf4 import from_nc4 + >>> ds = from_nc4(filepath, dim_chunks={"time": 3}) + >>> print(ds.variables["x"].data.chunksize) + (3,) + Save data to a new file ----------------------- diff --git a/docs/userdocs/user_guide/user_guide.rst b/docs/userdocs/user_guide/user_guide.rst index 65500da..12b6cbe 100644 --- a/docs/userdocs/user_guide/user_guide.rst +++ b/docs/userdocs/user_guide/user_guide.rst @@ -1,25 +1,13 @@ User Documentation ================== -Beyond the basic introduction +Detailed explanations, beyond the basic tutorial-style introductions (for which, see :ref:`getting_started`) -.. warning:: - The User Guide is still very incomplete. - -The User Guide is still mostly work-in-progress. -For the present, please see the following : - - * :ref:`Introduction ` - * tested `example scripts in the project repo `_ - * example code snippets in the `project README `_ - - .. toctree:: :maxdepth: 2 design_principles data_objects + common_operations general_topics howtos - known_issues - ../../change_log From 8b3c52a4ef18ff7d6bfed7ca74c74e6f3281a94d Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Sat, 25 Jan 2025 00:36:21 +0000 Subject: [PATCH 03/25] More fixes to correctness, consistency, readability. Add example for string data fix. 
--- docs/details/interface_support.rst | 16 +---- .../userdocs/getting_started/introduction.rst | 6 +- docs/userdocs/user_guide/data_objects.rst | 59 +++++++++---------- docs/userdocs/user_guide/general_topics.rst | 18 +++++- docs/userdocs/user_guide/howtos.rst | 38 ++++++++++++ 5 files changed, 88 insertions(+), 49 deletions(-) diff --git a/docs/details/interface_support.rst b/docs/details/interface_support.rst index f1cd53d..e3ce880 100644 --- a/docs/details/interface_support.rst +++ b/docs/details/interface_support.rst @@ -14,17 +14,7 @@ Datatypes ^^^^^^^^^ Ncdata supports all the regular datatypes of netcdf, but *not* the variable-length and user-defined datatypes. - -This means, notably, that all string variables will have the basic numpy type -'S1', equivalent to netcdf 'NC_CHAR'. Thus, multi-character string variables -must always have a definite "string-length" dimension. - -Attribute values, by contrast, are treated as Python strings with the normal -variable length support. Their basic dtype can be any numpy string dtype, -but will be converted when required. - -The NetCDF C library and netCDF4-python do not support arrays of strings in -attributes, so neither does NcData. +Please see : :ref:`data-types`. Data Scaling, Masking and Compression @@ -45,7 +35,7 @@ control the data compression and translation facilities of the NetCDF file library. If required, you should use :mod:`iris` or :mod:`xarray` for this. -Although file-specific storage aspects, such as chunking, data-paths or compression +File-specific storage aspects, such as chunking, data-paths or compression strategies, are not recorded in the core objects. However, array representations in variable and attribute data (notably dask lazy arrays) may hold such information. @@ -58,7 +48,7 @@ Dask chunking control Loading from netcdf files generates variables whose data arrays are all Dask lazy arrays. These are created with the "chunks='auto'" setting. 
-There is simple user override API available to control this on a per-dimension basis. +However, there is a simple per-dimension chunking control available on loading. See :func:`ncdata.netcdf4.from_nc4`. diff --git a/docs/userdocs/getting_started/introduction.rst b/docs/userdocs/getting_started/introduction.rst index fec4b00..f7dec43 100644 --- a/docs/userdocs/getting_started/introduction.rst +++ b/docs/userdocs/getting_started/introduction.rst @@ -21,7 +21,7 @@ The following code snippets demonstrate the absolute basics. Likewise, internal consistency is not checked, so it is possible to create data that cannot be stored in an actual file. - See :func:`ncdata.utils.save_errors`. + See :ref:`correctness-checks`. We may revisit this in later releases to make data manipulation "safer". @@ -109,7 +109,7 @@ which behaves like a dictionary:: Attributes ^^^^^^^^^^ Variables live in the ``attributes`` property of a :class:`~ncdata.NcData` -or :class:`~ncdata.Variable`:: +or :class:`~ncdata.NcVariable`:: >>> var.set_attrval('a', 1) NcAttribute('a', 1) @@ -249,7 +249,7 @@ Thread safety >>> from ndata.threadlock_sharing import enable_lockshare >>> enable_lockshare(iris=True, xarray=True) - See details at :mod:`ncdata.threadlock_sharing` + See details at :ref:`thread-safety`. Working with NetCDF files diff --git a/docs/userdocs/user_guide/data_objects.rst b/docs/userdocs/user_guide/data_objects.rst index df605e4..f794895 100644 --- a/docs/userdocs/user_guide/data_objects.rst +++ b/docs/userdocs/user_guide/data_objects.rst @@ -8,7 +8,8 @@ inspect and/or modify it, aiming to do this is the most natural and pythonic way Data Classes ------------ The data model components are elements of the -`NetCDF Classic Data Model`_ , plus **groups** (from the 'enhanced' netCDF model). +`NetCDF Classic Data Model`_ , plus **groups** (from the +`"enhanced" netCDF data model `_ ). That is, a Dataset(File) consists of just Dimensions, Variables, Attributes and Groups. 
@@ -87,28 +88,25 @@ Attribute Values In actual netCDF data, the value of an attribute is effectively limited to a one-dimensional array of certain valid netCDF types, and one-element arrays are exactly equivalent to scalar values. -In ncdata, the ``.value`` of an :class:`ncdata.NcAttribute` must always be a numpy array, and -when creating one the provided ``.value`` is cast with :func:`numpy.asanyarray`. +The ``.value`` of an :class:`ncdata.NcAttribute` must always be a numpy scalar or 1-dimensional array. -However you are not prevented from setting an attributes ``.value`` to something other than -an array, which may cause an error. So for now, if setting the value of an existing attribute, -ensure you always write compatible numpy data, or use :meth:`ncdata.NameMap.set_attrval` which is safe. +When assigning a ``.value``, or creating a new :class:`ncdata.NcAttribute`, the value +is cast with :func:`numpy.asanyarray`, and if this fails, or yields a multidimensional array +then an error is raised. -For *reading* attributes, it is best to use :meth:`ncdata.NameMap.get_attrval` or (equivalently) -:meth:`ncdata.NcAttribute.as_python_value()` : These consistently return either -``None`` (if missing); a numpy scalar; or array; or a Python string. Those results are -intended to be equivalent to what you should get from storing in an actual file and reading back, +When *reading* attributes, for consistent results it is best to use the +:meth:`ncdata.NcVariable.get_attrval` method or (equivalently) :meth:`ncdata.NcAttribute.as_python_value` : +These return either ``None`` (if missing); a numpy scalar; or array; or a Python string. +These are intended to be equivalent to what you would get from storing in an actual file and reading back, including re-interpreting a length-one vector as a scalar value. .. 
attention:: - The correct handling and (future) discrimination of string data as character arrays ("char" in netCDF terms) - and/or variable-length strings ("string" type) is still to be determined. + The correct handling and (future) discrimination of attribute values which are character arrays + ("char" in netCDF terms) and/or variable-length strings ("string" type) is still to be determined. + ( We do not yet properly support any variable-length types. ) - For now, we are converting **all** string attributes to python strings. - - There is **also** a longstanding known problem with the low-level C (and FORTRAN) interface, which forbids the - creation of vector character attributes, which appear as single concatenated strings. So for now, **all** - string-type attributes appear as single Python strings (you never get an array of strings or list of strings). + For now, we are simply converting **all** string-like attributes by + :meth:`ncdata.NcAttribute.as_python_value` to python strings. See also : :ref:`data-types` @@ -116,21 +114,21 @@ See also : :ref:`data-types` Correctness and Consistency --------------------------- -In practice, to support flexibility in construction and manipulation, it is -not practical for ncdata structures to represent valid netCDF at -all times, since this would makes changing things awkward. -For example, if a group refers to a dimension *outside* the group, you could not simply -extract it from the dataset because it is not valid in isolation. - -Thus, we do allow that ncdata structures represent *invalid* netCDF data. +In order to allow flexibility in construction and manipulation, it is not practical +for ncdata structures to represent valid netCDF at all times, since this would makes +changing things awkward. +For example, if a group refers to a dimension *outside* the group, strict correctness +would not allow you to simply extract it from the dataset, because it is not valid in isolation. 
+Thus, we do allow ncdata structures to represent *invalid* netCDF data. For example, circular references, missing dimensions or naming mismatches. -Effectively there are a set of data validity rules, which are summarised in the -:func:`ncdata.utils.save_errors` routine. -In practice, there a minimal set of runtime rules for creating ncdata objects, and -additional requirements when ncdata is converted to actual netCDF. For example, -variables can be initially created with no data. But if subsequently written to a file, -data must be assigned first. +In practice, there are a minimal set of rules which apply when initially creating +ncdata objects, and additional requirements which apply when creating actual netCDF files. +For example, a variable can be initially created with no data. But if subsequently written +to a file, some data must be defined. + +The full set of data validity rules are summarised in the +:func:`ncdata.utils.save_errors` routine. .. Note:: These issues are not necessarily all fully resolved. Caution required ! @@ -268,3 +266,4 @@ Relationship to File Storage See :ref:`file-storage` .. _NetCDF Classic Data Model: https://docs.unidata.ucar.edu/netcdf-c/current/netcdf_data_model.html#classic_model +.. _NetCDF Enhanced Data Model: https://docs.unidata.ucar.edu/netcdf-c/current/netcdf_data_model.html#enhanced_model \ No newline at end of file diff --git a/docs/userdocs/user_guide/general_topics.rst b/docs/userdocs/user_guide/general_topics.rst index 063a3a8..2351636 100644 --- a/docs/userdocs/user_guide/general_topics.rst +++ b/docs/userdocs/user_guide/general_topics.rst @@ -2,7 +2,7 @@ General Topics ============== -Odd discussion topics realting to core ncdata classes + data management +Odd discussion topics relating to core ncdata classes + data management Validity Checking ----------------- @@ -72,16 +72,28 @@ may contain zero bytes so that they convert to variable-width (Python) strings u maximum width. 
The string (maximum) length is a separate dimension, which is recorded as a normal netCDF dimension like any other. +.. note:: + + Although it is not tested, it has proved possible (and useful) at present to load + files with variables containing variable-length string data, but it is + necessary to supply an explicit user chunking to workaround limitations in Dask. + Please see the :ref:`howto example `. + Characters in Attribute Values ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Character data in string *attribute* values can be written simply as Python strings. They are stored in an :class:`~ncdata.NcAttribute`'s ``.value`` as a character array of dtype ">> var.set_attrval("_FillValue", -9999) >>> to_nc4(ncdata, "output.nc") + +.. _howto_load_variablewidth_strings: + +Load a file containing variable-width string variables +------------------------------------------------------ +You must supply a ``dim_chunks`` keyword to the :meth:`ncdata.netcdf.from_nc4` method, +specifying how to chunk the dimension(s) which the string variable uses. + +.. code-block:: python + + >>> from ncdata.netcdf4 import from_nc4 + >>> # if we have a "string" type variable using the "date" dimension + >>> # : don't chunk that dimension. + >>> dataset = from_nc4(filepath, dim_chunks={"date": -1}) + +This is needed to avoid a Dask error like +``"auto-chunking with dtype.itemsize == 0 is not supported, please pass in `chunks` +explicitly."`` + +When you have done this, Dask will return the variable data as a numpy *object* array containing Python strings. +You probably still need to (manually) convert that to something more tractable to work with it effectively. + +For example, something like : + +.. 
code-block:: python + + >>> var = dataset.variables['name'] + >>> data = var.data.compute() + >>> maxlen = max(len(s) for s in var.data) + + >>> # convert to fixed-width character array + >>> data = np.array([[s.ljust(maxlen, "\0") for s in var.data]]) + >>> print(data.shape, data.dtype) + (1010, 12) >> dataset.dimensions.add(NcDimension('name_strlen', maxlen)) + >>> var.dimensions = var.dimensions + ("name_strlen",) + >>> var.data = data From 0e8316529ab1f67c9a11885bf82373444608a396 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Thu, 6 Feb 2025 12:47:17 +0000 Subject: [PATCH 04/25] Overhaul all API docstrings. --- docs/userdocs/user_guide/data_objects.rst | 4 +- docs/userdocs/user_guide/howtos.rst | 11 +++-- lib/ncdata/_core.py | 10 ++-- lib/ncdata/dataset_like.py | 47 +++++++++++-------- lib/ncdata/iris.py | 23 +++++----- lib/ncdata/iris_xarray.py | 12 ++--- lib/ncdata/netcdf4.py | 5 ++ lib/ncdata/threadlock_sharing.py | 27 +++++++---- lib/ncdata/utils/_compare_nc_datasets.py | 56 ++++++++++++++++++----- lib/ncdata/xarray.py | 21 ++++++--- 10 files changed, 140 insertions(+), 76 deletions(-) diff --git a/docs/userdocs/user_guide/data_objects.rst b/docs/userdocs/user_guide/data_objects.rst index f794895..bd811cc 100644 --- a/docs/userdocs/user_guide/data_objects.rst +++ b/docs/userdocs/user_guide/data_objects.rst @@ -143,7 +143,7 @@ i.e. either its ``.dimensions``, ``.variables``, ``.attributes`` or ``.groups``. Each of the "container" properties is a :class:`~ncdata._core.NameMap` object, which is a dictionary type mapping a string (name) to a specific type of components. -The dictionary``.keys()`` are a sequence of component names, and its ``.values()`` are +The dictionary ``.keys()`` are a sequence of component names, and its ``.values()`` are the corresponding contained components. Every component object also has a ``.name`` property. 
By this, it is implicit that you @@ -180,6 +180,8 @@ Or, likewise, to ***set*** values, one of but **not** ``dataset.variables['x'].attributes['units'].value = "K"`` +.. _container-ordering: + Container ordering ------------------ The order of elements of a container is technically significant, and does constitute a diff --git a/docs/userdocs/user_guide/howtos.rst b/docs/userdocs/user_guide/howtos.rst index 6583113..7f014c4 100644 --- a/docs/userdocs/user_guide/howtos.rst +++ b/docs/userdocs/user_guide/howtos.rst @@ -558,12 +558,12 @@ or, to convert xarray data variable output to masked integers : Load a file containing variable-width string variables ------------------------------------------------------ You must supply a ``dim_chunks`` keyword to the :meth:`ncdata.netcdf.from_nc4` method, -specifying how to chunk the dimension(s) which the string variable uses. +specifying how to chunk all dimension(s) which the "string" type variable uses. .. code-block:: python >>> from ncdata.netcdf4 import from_nc4 - >>> # if we have a "string" type variable using the "date" dimension + >>> # This file has a netcdf "string" type variable, with dimensions ('date',). >>> # : don't chunk that dimension. >>> dataset = from_nc4(filepath, dim_chunks={"date": -1}) @@ -571,10 +571,11 @@ This is needed to avoid a Dask error like ``"auto-chunking with dtype.itemsize == 0 is not supported, please pass in `chunks` explicitly."`` -When you have done this, Dask will return the variable data as a numpy *object* array containing Python strings. -You probably still need to (manually) convert that to something more tractable to work with it effectively. +When you do this, Dask returns the variable data as a numpy *object* array, containing +Python strings. You will probably also want to (manually) convert that to something +more tractable, to work with it effectively. -For example, something like : +For example, something like this : .. 
code-block:: python diff --git a/lib/ncdata/_core.py b/lib/ncdata/_core.py index 71bc06d..dd61e5d 100644 --- a/lib/ncdata/_core.py +++ b/lib/ncdata/_core.py @@ -497,12 +497,10 @@ def copy(self): class NcAttribute: """ - An object representing a netcdf variable or dataset attribute. + An object representing a netcdf variable, group or dataset attribute. - Associates a name to a value which is a numpy scalar or 1-D array. - - We expect the value to be 0- or 1-dimensional, and an allowed dtype. - However none of this is checked. + Associates a name to a value which is always a numpy scalar or 1-D array, of an + allowed dtype. See :ref:`attribute-dtypes`. In an actual netcdf dataset, a "scalar" is actually just an array of length 1. """ @@ -511,7 +509,7 @@ def __init__(self, name: str, value): # noqa: D107 #: attribute name self.name: str = name # Attribute values are arraylike, have dtype - #: attribute value + #: attribute value, constrained to a suitable numpy array object. self.value: np.ndarray = value @property diff --git a/lib/ncdata/dataset_like.py b/lib/ncdata/dataset_like.py index 04ca62a..82b7383 100644 --- a/lib/ncdata/dataset_like.py +++ b/lib/ncdata/dataset_like.py @@ -1,29 +1,32 @@ r""" -An adaptor layer making a NcData appear like a :class:`netCDF4.Dataset`. +An adaptor layer for :mod:`ncdata` to emulate :mod:`netCDF4`. -Allows an :class:`~ncdata.NcData` to masquerade as a +Primarily, allows an :class:`ncdata.NcData` to masquerade as a :class:`netCDF4.Dataset` object. Note: This is a low-level interface, exposed publicly for extended experimental uses. - If you only want to convert **Iris** data to+from :class:`~ncdata.NcData`, + If you only want to convert **Iris** data to + from :class:`~ncdata.NcData`, please use the functions in :mod:`ncdata.iris` instead. ---- -These classes contain :class:`~ncdata.NcData` and :class:`~ncdata.NcVariable`\\s, but -emulate the access APIs of a :class:`netCDF4.Dataset` / :class:`netCDF4.Variable`. 
+These classes contain :class:`~ncdata.NcData`, :class:`~ncdata.NcDimension`, and +:class:`~ncdata.NcVariable` objects, but emulate the access APIs of +:class:`netCDF4.Dataset` :class:`netCDF4.Dimension` and :class:`netCDF4.Variable`. This is provided primarily to support a re-use of the :mod:`iris.fileformats.netcdf` -file format load + save, to convert cubes to+from ncdata objects (and hence, especially, -convert Iris :class:`~iris.cube.Cube`\\s to+from an Xarray :class:`~xarray.Dataset`). +file format load + save, to convert cubes to + from ncdata objects (and hence, +especially, to convert Iris :class:`~iris.cube.Cube`\s to + from an Xarray +:class:`~xarray.Dataset` +). Notes ----- Currently only supports what is required for Iris load/save capability. -It *should* be possible to use these objects with other packages expecting a -:class:`netCDF4.Dataset` object, however the API simulation is far from complete, so -this may need to be extended, in future, to support other such uses. +In principle, it *should* be possible to use these objects with other packages +expecting a :class:`netCDF4.Dataset` object. However the API simulation is far from +complete, so this module may need to be extended, in future, to support other such uses. """ from typing import Any, Dict, List @@ -85,7 +88,7 @@ class Nc4DatasetLike(_Nc4DatalikeWithNcattrs): It can be both read and written (modified) via its emulated :class:`netCDF4.Dataset`-like API. - The core NcData content, 'self._ncdata', is a :class:`ncdata.NcData`. + The core, contained content object, ``self._ncdata``, is a :class:`ncdata.NcData`. This completely defines the parent object state. If not provided on init, a new, empty dataset is created. 
@@ -97,7 +100,7 @@ class Nc4DatasetLike(_Nc4DatalikeWithNcattrs): file_format = "NETCDF4" def __init__(self, ncdata: NcData = None): - """Create an Nc4DatasetLike, wrapping an NcData.""" + """Create an Nc4DatasetLike, wrapping an :class:`~ncdata.NcData`.""" if ncdata is None: ncdata = NcData() # an empty dataset #: the contained dataset. If not provided, a new, empty dataset is created. @@ -195,18 +198,24 @@ class Nc4VariableLike(_Nc4DatalikeWithNcattrs): """ An object which contains a :class:`ncdata.NcVariable` and emulates a :class:`netCDF4.Variable`. - The core NcData content, 'self._ncdata', is a :class:`NcVariable`. + The core, contained content object, ``self._ncdata``, is a :class:`~ncdata.NcVariable`. This completely defines the parent object state. - The property "_data_array" is detected by Iris to do direct data transfer + The property ``._data_array`` is detected by Iris to do direct data transfer (copy-free and lazy-preserving). + At present, this object emulates only the *default* read/write behaviour of a - netCDF4 Variable, i.e. the underlying NcVariable contains a 'raw' data array, and - the _data_array property interface applies/removes any scaling and masking as it is - "seen" from the outside. + :class:`netCDF4.Variable`, i.e. : + + * the underlying NcVariable contains a 'raw' data array, which may be real + (i.e. numpy) or lazy (i.e. dask). + * The ``._data_array`` property read/write interface then applies/removes any + scaling and masking as it is to be "seen" from the outside. + That suits how *Iris* reads netCFD4 data, but it won't work if the user wants to control the masking/saving behaviour, as you can do in netCDF4. - Thus, at present, we do *not* provide any of set_auto_mask/scale/maskandscale. + Thus, at present, we do *not* provide any of the + ``set_auto_mask/scale/maskandscale()`` methods. """ @@ -447,7 +456,7 @@ class Nc4DimensionLike: """ An object which emulates a :class:`netCDF4.Dimension` object. 
- The core NcData content, 'self._ncdata', is a :class:`ncdata.NcDimension`. + The core, contained content object, ``self._ncdata``, is a :class:`ncdata.NcDimension`. This completely defines the parent object state. """ diff --git a/lib/ncdata/iris.py b/lib/ncdata/iris.py index 0db0bae..cb94a67 100644 --- a/lib/ncdata/iris.py +++ b/lib/ncdata/iris.py @@ -1,13 +1,18 @@ r""" Interface routines for converting data between ncdata and Iris. -Convert :class:`~ncdata.NcData` to and from Iris :class:`~iris.cube.Cube`\\s. - -This uses the :class:`ncdata.dataset_like` interface ability to mimic netCDF4.Dataset -objects, which are used like files to load and save Iris data. -This means that all we need to know of Iris is its netcdf load+save interfaces. +Convert :class:`~ncdata.NcData`\s to and from Iris :class:`~iris.cube.Cube`\s. """ +# +# NOTE: This uses the :mod:`ncdata.dataset_like` interface ability to mimic a +# :class:`netCDF4.Dataset` object, which can then be loaded like a file into Iris. +# The Iris netcdf loader now has specific support for loading an open dataset object, +# see : https://github.com/SciTools/iris/pull/5214. +# This means that, hopefully, all we need to know of Iris itself is the load and save, +# though we do specifically target the netcdf format interface. +# + from typing import Any, AnyStr, Dict, Iterable, Union import iris @@ -19,10 +24,6 @@ __all__ = ["from_iris", "to_iris"] -# -# The primary conversion interfaces -# - def to_iris(ncdata: NcData, **iris_load_kwargs: Dict[AnyStr, Any]) -> CubeList: """ @@ -40,7 +41,7 @@ def to_iris(ncdata: NcData, **iris_load_kwargs: Dict[AnyStr, Any]) -> CubeList: Returns ------- - cubes : CubeList + cubes : iris.cube.CubeList loaded results """ dslike = Nc4DatasetLike(ncdata) @@ -61,7 +62,7 @@ def from_iris( cubes : :class:`iris.cube.Cube`, or iterable of Cubes cube or cubes to "save" to an NcData object. iris_save_kwargs : dict - additional keys passed to :func:`iris.save` operation. 
+ additional keys passed to :func:`iris.fileformats.netcdf.save` operation. Returns ------- diff --git a/lib/ncdata/iris_xarray.py b/lib/ncdata/iris_xarray.py index 9a6a288..0a6891e 100644 --- a/lib/ncdata/iris_xarray.py +++ b/lib/ncdata/iris_xarray.py @@ -1,11 +1,12 @@ r""" Interface routines for converting data between Xarray and Iris. -Convert :class:`~xarray.Dataset` to and from Iris :class:`~iris.cube.Cube`\\s. +Convert :class:`~xarray.Dataset`\s to and from Iris :class:`~iris.cube.Cube`\s. By design, these transformations should be equivalent to saving data from one package -to a netcdf file, and re-loading into the other package. There is also support for -passing additional keywords to the appropriate load/save routines. +to a netcdf file, and re-loading into the other package. But without actually saving +or loading data, of course. There is also support for passing additional keywords to +the relevant load/save routines. """ import xarray @@ -57,7 +58,7 @@ def cubes_to_xarray( cubes, iris_save_kwargs=None, xr_load_kwargs=None ) -> xarray.Dataset: r""" - Convert Iris :class:`iris.cube.Cube`\\s to an xarray :class:`xarray.Dataset`. + Convert Iris :class:`iris.cube.Cube`\s to an xarray :class:`xarray.Dataset`. Equivalent to saving the dataset to a netcdf file, and loading that with Xarray. @@ -70,8 +71,7 @@ def cubes_to_xarray( source data iris_save_kwargs : dict - additional keywords passed to :func:`iris.save`, and to - :func:`iris.fileformats.netcdf.saver.save` + additional keywords passed to :func:`iris.fileformats.netcdf.save`. 
xr_load_kwargs : dict additional keywords passed to :meth:`xarray.Dataset.load_store` diff --git a/lib/ncdata/netcdf4.py b/lib/ncdata/netcdf4.py index 8c77ff2..5ddef22 100644 --- a/lib/ncdata/netcdf4.py +++ b/lib/ncdata/netcdf4.py @@ -318,6 +318,11 @@ def from_nc4( (160, 15) >>> + See also : :ref:`howto_load_variablewidth_strings` : This illustrates a particular + case which **does** encounter an error with dask "auto" chunking, and therefore + also fails with a plain "from_nc4" call. The ``dim_chunks`` keyword enables you to + work around the problem. + """ if dim_chunks is None: dim_chunks = {} diff --git a/lib/ncdata/threadlock_sharing.py b/lib/ncdata/threadlock_sharing.py index c29dbe5..4c8bfdb 100644 --- a/lib/ncdata/threadlock_sharing.py +++ b/lib/ncdata/threadlock_sharing.py @@ -8,8 +8,9 @@ Most commonly, this occurs when netcdf file data is read to compute a Dask array, or written in a Dask delayed write operation. -All 3 data-format packages can map variable data into Dask lazy arrays. Iris and -Xarray can also create delayed write operations (but ncdata currently does not). +All 3 data-format packages (ncdata, Iris and xarray) can map variable data into Dask +lazy arrays on file load. Iris and Xarray can also create delayed write operations +(but ncdata currently does not). However, those mechanisms cannot protect an operation of that package from overlapping with one in *another* package. @@ -17,12 +18,12 @@ This module can ensure that all of the enabled packages use the *same* thread lock, so that any and all of them can safely co-operate in parallel operations. 
-sample code:: +sample usages:: from ncdata.threadlock_sharing import enable_lockshare, disable_lockshare from ncdata.xarray import from_xarray - from ncdata.iris import from_iris - from ncdata.netcdf4 import to_nc4 + from ncdata.iris import from_iris, to_iris + from ncdata.netcdf4 import to_nc4, from_nc4 enable_lockshare(iris=True, xarray=True) @@ -36,10 +37,16 @@ or:: with lockshare_context(iris=True): - ncdata = NcData(source_filepath) - ncdata.variables['x'].attributes['units'] = 'K' - cubes = ncdata.iris.to_iris(ncdata) - iris.save(cubes, output_filepath) + ncdata = from_nc4(source_filepath) + my_adjust_process(ncdata) + data_cube = to_iris(ncdata).extract("main_var") + grid_cube = iris.load_cube(grid_path, "grid_cube") + result_cube = data_cube.regrid(grid_cube) + iris.save(result_cube, output_filepath) + +.. NOTE:: + This solution is at present still experimental, and not itself fully thread-safe, + so probably only suitable for top-level global application. """ from contextlib import contextmanager @@ -69,7 +76,7 @@ def enable_lockshare(iris: bool = False, xarray: bool = False): Notes ----- - If an 'enable_lockshare' call was already established, the function does nothing, + If an ``enable_lockshare`` call was already established, the function does nothing, i.e. it is not possible to modify an existing share. Instead, you must call :func:`disable_lockshare` to cancel the current sharing, before you can establish a new one. diff --git a/lib/ncdata/utils/_compare_nc_datasets.py b/lib/ncdata/utils/_compare_nc_datasets.py index 655babf..fe3e469 100644 --- a/lib/ncdata/utils/_compare_nc_datasets.py +++ b/lib/ncdata/utils/_compare_nc_datasets.py @@ -32,30 +32,60 @@ def dataset_differences( suppress_warnings: bool = False, ) -> List[str]: r""" - Compare netcdf data objects. + Compare two netcdf datasets. - Accepts paths, pathstrings, open :class:`netCDF4.Dataset`\\s or :class:`NcData` objects. 
+ Accepts paths, pathstrings, open :class:`netCDF4.Dataset`\s or :class:`NcData` + objects. File paths are opened with :mod:`netCDF4`. Parameters ---------- - dataset_or_path_1, dataset_or_path_2 : str or Path or netCDF4.Dataset or NcData - two datasets to compare, either NcData or netCDF4 - check_dims_order, check_vars_order, check_attrs_order, check_groups_order : bool, default True - If False, no error results from the same contents in a different order, - however unless `suppress_warnings` is True, the error string is issued as a warning. - check_names: bool, default False + dataset_or_path_1 : str or Path or netCDF4.Dataset or NcData + First dataset to compare : either an open :class:`netCDF4.Dataset`, a path to + open one, or an :class:`~ncdata.NcData` object. + + dataset_or_path_2 : str or Path or netCDF4.Dataset or NcData + Second dataset to compare : either an open :class:`netCDF4.Dataset`, a path to + open one, or an :class:`~ncdata.NcData` object. + + check_dims_order : bool, default True + If False, no error results from the same dimensions appearing in a different + order. However, unless `suppress_warnings` is True, the error string is issued + as a warning. + + check_vars_order : bool, default True + If False, no error results from the same variables appearing in a different + order. However unless `suppress_warnings` is True, the error string is issued + as a warning. + + check_attrs_order : bool, default True + If False, no error results from the same attributes appearing in a different + order. However unless `suppress_warnings` is True, the error string is issued + as a warning. + + check_groups_order : bool, default True + If False, no error results from the same groups appearing in a different order. + However unless `suppress_warnings` is True, the error string is issued as a + warning. 
+ + check_names : bool, default False Whether to warn if the names of the top-level datasets are different - check_dims_unlimited: bool, default True + + check_dims_unlimited : bool, default True Whether to compare the 'unlimited' status of dimensions + check_var_data : bool, default True If True, all variable data is also checked for equality. If False, only dtype and shape are compared. - NOTE: comparison of large arrays is done in-memory, so may be highly inefficient. - show_n_first_different: int, default 2 + NOTE: comparison of arrays is done in-memory, so could be highly inefficient + for large variable data. + + show_n_first_different : int, default 2 Number of value differences to display. + suppress_warnings : bool, default False When False (the default), report changes in content order as Warnings. When True, ignore changes in ordering. + See also : :ref:`container-ordering`. Returns ------- @@ -68,6 +98,7 @@ def dataset_differences( ds2_was_path = not hasattr(dataset_or_path_2, "variables") ds1, ds2 = None, None try: + # convert path-likes to netCDF4.Dataset if ds1_was_path: ds1 = nc.Dataset(dataset_or_path_1) else: @@ -78,6 +109,9 @@ def dataset_differences( else: ds2 = dataset_or_path_2 + # NOTE: Both ds1 and ds2 are now *either* NcData *or* netCDF4.Dataset + # _isncdata() will be used to distinguish. + errs = _group_differences( ds1, ds2, diff --git a/lib/ncdata/xarray.py b/lib/ncdata/xarray.py index bc8e876..ecdd9d5 100644 --- a/lib/ncdata/xarray.py +++ b/lib/ncdata/xarray.py @@ -1,12 +1,15 @@ -""" +r""" Interface routines for converting data between ncdata and xarray. -Converts :class:`~ncdata.NcData` to and from Xarray :class:`~xarray.Dataset` objects. - -This embeds a certain amount of Xarray knowledge (and dependency), hopefully a minimal -amount. The structure of an NcData object makes it fairly painless. +Converts :class:`ncdata.NcData`\s to and from :class:`xarray.Dataset` objects. 
""" + +# NOTE: This embeds a certain amount of Xarray knowledge (and dependency). +# Hopefully a minimal amount. +# The structure of an NcData object makes it fairly painless. +# + from pathlib import Path from typing import AnyStr, Union @@ -159,12 +162,14 @@ def to_xarray(ncdata: NcData, **xarray_load_kwargs) -> xr.Dataset: """ Convert :class:`~ncdata.NcData` to an xarray :class:`~xarray.Dataset`. + Behaves (ideally, somewhat) like an :func:`xarray.load_dataset` call. + Parameters ---------- ncdata : NcData source data - kwargs : dict + xarray_load_kwargs : dict additional xarray "load keywords", passed to :meth:`xarray.Dataset.load_store` Returns @@ -182,12 +187,14 @@ def from_xarray( """ Convert an xarray :class:`xarray.Dataset` to a :class:`NcData`. + Behaves (ideally, somewhat) like an :meth:`xarray.Dataset.to_netcdf` call. + Parameters ---------- xrds : :class:`xarray.Dataset` source data - kwargs : dict + xarray_save_kwargs : dict additional xarray "save keywords", passed to :meth:`xarray.Dataset.dump_to_store` From dce4b7296fc0af245f6a0ea5cde6b8170828c519 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Thu, 6 Feb 2025 17:35:11 +0000 Subject: [PATCH 05/25] Update docs/userdocs/user_guide/data_objects.rst Co-authored-by: Martin Yeo <40734014+trexfeathers@users.noreply.github.com> --- docs/userdocs/user_guide/data_objects.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/userdocs/user_guide/data_objects.rst b/docs/userdocs/user_guide/data_objects.rst index bd811cc..d737ed6 100644 --- a/docs/userdocs/user_guide/data_objects.rst +++ b/docs/userdocs/user_guide/data_objects.rst @@ -1,7 +1,7 @@ Core Data Objects ================= Ncdata uses Python objects to represent netCDF data, and allows the user to freely -inspect and/or modify it, aiming to do this is the most natural and pythonic way. +inspect and/or modify it, aiming to do this in the most natural and pythonic way. .. 
_data-model: From de38b8996b48c06bf1e378b355d822354649ac2b Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Thu, 6 Feb 2025 17:36:29 +0000 Subject: [PATCH 06/25] Update docs/userdocs/user_guide/data_objects.rst Co-authored-by: Martin Yeo <40734014+trexfeathers@users.noreply.github.com> --- docs/userdocs/user_guide/data_objects.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/userdocs/user_guide/data_objects.rst b/docs/userdocs/user_guide/data_objects.rst index d737ed6..051537a 100644 --- a/docs/userdocs/user_guide/data_objects.rst +++ b/docs/userdocs/user_guide/data_objects.rst @@ -115,7 +115,7 @@ See also : :ref:`data-types` Correctness and Consistency --------------------------- In order to allow flexibility in construction and manipulation, it is not practical -for ncdata structures to represent valid netCDF at all times, since this would makes +for ncdata structures to represent valid netCDF at all times, since this would make changing things awkward. For example, if a group refers to a dimension *outside* the group, strict correctness would not allow you to simply extract it from the dataset, because it is not valid in isolation. From 10a6beeaf7e9b59a161a6f366feb1ff1b55cf285 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Thu, 6 Feb 2025 17:41:05 +0000 Subject: [PATCH 07/25] Update docs/userdocs/user_guide/common_operations.rst Co-authored-by: Martin Yeo <40734014+trexfeathers@users.noreply.github.com> --- docs/userdocs/user_guide/common_operations.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/userdocs/user_guide/common_operations.rst b/docs/userdocs/user_guide/common_operations.rst index da465a2..d53d757 100644 --- a/docs/userdocs/user_guide/common_operations.rst +++ b/docs/userdocs/user_guide/common_operations.rst @@ -6,7 +6,7 @@ A group of common operations are available on all the core component types, i.e. 
the operations of extract/remove/insert/rename/copy on the ``.datasets``, ``.groups``, ``.dimensions``, ``.variables`` and ``.attributes`` properties of the core objects. -Most of these are hopoefully "obvious" Pythonic methods of the container objects. +Most of these are hopefully "obvious" Pythonic methods of the container objects. Extract and Remove ------------------ From cf7929610daf4e9ee55098d15c724b9196a534f5 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Thu, 6 Feb 2025 17:47:23 +0000 Subject: [PATCH 08/25] Update docs/userdocs/user_guide/common_operations.rst Co-authored-by: Martin Yeo <40734014+trexfeathers@users.noreply.github.com> --- docs/userdocs/user_guide/common_operations.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/userdocs/user_guide/common_operations.rst b/docs/userdocs/user_guide/common_operations.rst index d53d757..adc5fe9 100644 --- a/docs/userdocs/user_guide/common_operations.rst +++ b/docs/userdocs/user_guide/common_operations.rst @@ -41,7 +41,7 @@ Example : ``dataset.variables.rename("x", "y")`` .. warning:: Renaming a dimension will not rename references to it (i.e. in variables), which obviously may cause problems. - We may add a utility to do this safely this in future. + We may add a utility to do this safely in future. 
Copying ------- From 2356d125526ac3e04e9469d699760212b06cc2fb Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Thu, 6 Feb 2025 18:09:40 +0000 Subject: [PATCH 09/25] Update docs/userdocs/user_guide/common_operations.rst Co-authored-by: Martin Yeo <40734014+trexfeathers@users.noreply.github.com> --- docs/userdocs/user_guide/common_operations.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/userdocs/user_guide/common_operations.rst b/docs/userdocs/user_guide/common_operations.rst index adc5fe9..a5c225f 100644 --- a/docs/userdocs/user_guide/common_operations.rst +++ b/docs/userdocs/user_guide/common_operations.rst @@ -63,7 +63,7 @@ will only do an identity check -- that is, it tests ``id(A) == id(B)``, or ``A i However, these objects **can** be properly compared with the dataset comparison utilities, :func:`ncdata.utils.dataset_differences` and -:func:`ncdata.utils.variable_differences` : By default, these operations are very +:func:`ncdata.utils.variable_differences`. By default, these operations are very comprehensive and may be very costly for instance comparing large data arrays, but they also allow more nuanced and controllable checking, e.g. to skip data array comparisons or ignore variable ordering. From 872aa19db90d72a69c18ad4eed274e50fb80ae74 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Thu, 6 Feb 2025 18:10:03 +0000 Subject: [PATCH 10/25] Update docs/userdocs/user_guide/common_operations.rst Co-authored-by: Martin Yeo <40734014+trexfeathers@users.noreply.github.com> --- docs/userdocs/user_guide/common_operations.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/userdocs/user_guide/common_operations.rst b/docs/userdocs/user_guide/common_operations.rst index a5c225f..91316cd 100644 --- a/docs/userdocs/user_guide/common_operations.rst +++ b/docs/userdocs/user_guide/common_operations.rst @@ -69,7 +69,7 @@ also allow more nuanced and controllable checking, e.g. 
to skip data array compa or ignore variable ordering. -Onject Creation +Object Creation --------------- The constructors should allow reasonably readable inline creation of data. See here : :ref:`data-constructors` From 33232da6d1a4bbcb002ca8b8486d1683603ababd Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Thu, 6 Feb 2025 18:10:31 +0000 Subject: [PATCH 11/25] Update docs/userdocs/user_guide/general_topics.rst Co-authored-by: Martin Yeo <40734014+trexfeathers@users.noreply.github.com> --- docs/userdocs/user_guide/general_topics.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/userdocs/user_guide/general_topics.rst b/docs/userdocs/user_guide/general_topics.rst index 2351636..0695028 100644 --- a/docs/userdocs/user_guide/general_topics.rst +++ b/docs/userdocs/user_guide/general_topics.rst @@ -29,7 +29,7 @@ So, this does ***not*** include the user-defined, enumerated or variable-length We hope to extend support to the more general `NetCDF Enhanced Data Model`_ in future. 
-For reference, the currently supported + tested datatypes are currently : +For reference, the currently supported + tested datatypes are : * unsigned byte = numpy "u1" * unsigned short = numpy "u2" From 28b3ca306b9f99f7687ed1932f1fe0115bd7af0d Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Thu, 6 Feb 2025 18:11:39 +0000 Subject: [PATCH 12/25] Update docs/userdocs/user_guide/general_topics.rst Co-authored-by: Martin Yeo <40734014+trexfeathers@users.noreply.github.com> --- docs/userdocs/user_guide/general_topics.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/userdocs/user_guide/general_topics.rst b/docs/userdocs/user_guide/general_topics.rst index 0695028..36fc4dc 100644 --- a/docs/userdocs/user_guide/general_topics.rst +++ b/docs/userdocs/user_guide/general_topics.rst @@ -52,7 +52,7 @@ For reference, the currently supported + tested datatypes are : Character and String Data Handling ---------------------------------- -NetCDF can can contain string and character data in at least 3 different contexts : +NetCDF can contain string and character data in at least 3 different contexts : Characters in Data Component Names ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ From eea69fb5fe679ff9df74787661c51993684993f9 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Thu, 6 Feb 2025 18:12:22 +0000 Subject: [PATCH 13/25] Update docs/userdocs/user_guide/general_topics.rst Co-authored-by: Martin Yeo <40734014+trexfeathers@users.noreply.github.com> --- docs/userdocs/user_guide/general_topics.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/userdocs/user_guide/general_topics.rst b/docs/userdocs/user_guide/general_topics.rst index 36fc4dc..9620261 100644 --- a/docs/userdocs/user_guide/general_topics.rst +++ b/docs/userdocs/user_guide/general_topics.rst @@ -87,7 +87,7 @@ character array of dtype " Date: Fri, 7 Feb 2025 21:04:42 +0000 Subject: [PATCH 14/25] Review changes: links, indents, rewording. 
--- docs/details/known_issues.rst | 2 +- docs/userdocs/user_guide/howtos.rst | 25 +++++++++++++------------ lib/ncdata/threadlock_sharing.py | 6 +++--- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/docs/details/known_issues.rst b/docs/details/known_issues.rst index 2bab1e3..d6071cd 100644 --- a/docs/details/known_issues.rst +++ b/docs/details/known_issues.rst @@ -43,7 +43,7 @@ There are no current plans to address these, but could be considered in future * notably, includes compound and variable-length types * ..and especially **variable-length strings in variables**. - see : :ref:`string-and-character-data`, ref:`data-types` + see : :ref:`string-and-character-data`, :ref:`data-types` Features planned diff --git a/docs/userdocs/user_guide/howtos.rst b/docs/userdocs/user_guide/howtos.rst index 7f014c4..c57269e 100644 --- a/docs/userdocs/user_guide/howtos.rst +++ b/docs/userdocs/user_guide/howtos.rst @@ -454,23 +454,24 @@ Remove or rewrite specific attributes Save selected variables to a new file ------------------------------------- -Load input with :func:`ncdata.netcdf4.from_nc4`; use :meth:`ncdata.NameMap.add` to add -selected elements into a new :class:`ncdata.Ncdata`, and then save it -with :func:`ncdata.netcdf4.to_nc4`. +Load an input dataset with :func:`ncdata.netcdf4.from_nc4`; make a new empty dataset +with :class:`~ncdata.NcData`\ (); use ``dataset.dimensions.add()``, +``dataset.variables.add()`` and similar to add/copy selected elements into it; then +save it with :func:`ncdata.netcdf4.to_nc4`. For a simple case with no groups, it could look something like this: .. 
code-block:: python
 
-    >>> input = from_nc4(input_filepath)
-    >>> output = NcData()
+    >>> ds_in = from_nc4(input_filepath)
+    >>> ds_out = NcData()
     >>> for varname in ('data1', 'data2', 'dimx', 'dimy'):
-    >>>    var = input.variables[varname]
-    >>>    output.variables.add(var)
-    >>>    for name in var.dimensions if name not in output.dimensions:
-    >>>        output.dimensions.add(input.dimensions[dimname])
+    >>>    var = ds_in.variables[varname]
+    >>>    ds_out.variables.add(var)
+    >>>    for name in (n for n in var.dimensions if n not in ds_out.dimensions):
+    >>>        ds_out.dimensions.add(ds_in.dimensions[name])
     ...
-    >>> to_nc4(output, output_filepath)
+    >>> to_nc4(ds_out, output_filepath)
 
 Sometimes it's simpler to load the input, delete content **not** wanted, then re-save.
 It's perfectly safe to do that, since the original file will be unaffected.
@@ -479,7 +480,7 @@ It's perfectly safe to do that, since the original file will be unaffected.
 
     >>> data = from_nc4(input_filepath)
     >>> for name in ('extra1', 'extra2', 'unwanted'):
-    >>>    del data.variables[varname]
+    >>>    del data.variables[name]
     ...
     >>> del data.dimensions['pressure']
     >>> to_nc4(data, output_filepath)
@@ -557,7 +558,7 @@ or, to convert xarray data variable output to masked integers :
 
 Load a file containing variable-width string variables
 ------------------------------------------------------
-You must supply a ``dim_chunks`` keyword to the :meth:`ncdata.netcdf.from_nc4` method,
+You must supply a ``dim_chunks`` keyword to the :meth:`ncdata.netcdf4.from_nc4` method,
 specifying how to chunk all dimension(s) which the "string" type variable uses.
 
 .. code-block:: python
diff --git a/lib/ncdata/threadlock_sharing.py b/lib/ncdata/threadlock_sharing.py
index 4c8bfdb..51ef237 100644
--- a/lib/ncdata/threadlock_sharing.py
+++ b/lib/ncdata/threadlock_sharing.py
@@ -44,9 +44,9 @@ result_cube = data_cube.regrid(grid_cube)
 iris.save(result_cube, output_filepath)
 
-.. 
NOTE:: - This solution is at present still experimental, and not itself fully thread-safe, - so probably only suitable for top-level global application. +.. WARNING:: + The solution in this module is at present still experimental, and not itself + thread-safe. So probably can only be applied at the outer level of an operation. """ from contextlib import contextmanager From 3433c29fbb8dc63a678ac28a63d88fb9b96eb3f0 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Tue, 11 Feb 2025 01:27:14 +0000 Subject: [PATCH 15/25] Completion of original review comments (mostly, a few from new set). --- docs/details/character_handling.rst | 61 ++++++++ docs/details/details_index.rst | 1 + docs/details/interface_support.rst | 31 +++- docs/details/known_issues.rst | 8 ++ docs/details/threadlock_sharing.rst | 2 +- docs/userdocs/user_guide/_snippets.rst | 89 ------------ .../userdocs/user_guide/common_operations.rst | 39 ++++- docs/userdocs/user_guide/data_objects.rst | 6 +- docs/userdocs/user_guide/general_topics.rst | 73 ++++------ docs/userdocs/user_guide/howtos.rst | 133 ++++++++++++------ lib/ncdata/dataset_like.py | 2 +- 11 files changed, 250 insertions(+), 195 deletions(-) create mode 100644 docs/details/character_handling.rst delete mode 100644 docs/userdocs/user_guide/_snippets.rst diff --git a/docs/details/character_handling.rst b/docs/details/character_handling.rst new file mode 100644 index 0000000..fcd29bf --- /dev/null +++ b/docs/details/character_handling.rst @@ -0,0 +1,61 @@ +.. _string-and-character-data: + +Character and String Data Handling +---------------------------------- +NetCDF can contain string and character data in at least 3 different contexts : + +Characters in Data Component Names +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +That is, names of groups, variables, attributes or dimensions. +Component names in the API are just native Python strings. + +Since NetCDF version 4, the names of components within files are fully unicode +compliant, using UTF-8. 
+ +These names can use virtually **any** characters, with the exception of the forward +slash "/", since in some technical cases a component name needs to specified as a +"path-like" compound. + + +Characters in Attribute Values +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Character data in string *attribute* values can likewise be read and written simply as +Python strings. + +However they are actually *stored* in an :class:`~ncdata.NcAttribute`'s +``.value`` as a character array of dtype "`. + +.. warning:: + + The netCDF4 package will perform automatic character encoding/decoding of a + character variable if it has a special ``_Encoding`` attribute. Ncdata does not + currently allow for this. See : :ref:`known-issues` + diff --git a/docs/details/details_index.rst b/docs/details/details_index.rst index 3d443cb..3e77dc0 100644 --- a/docs/details/details_index.rst +++ b/docs/details/details_index.rst @@ -8,6 +8,7 @@ Detail reference topics ../change_log ./known_issues ./interface_support + ./character_handling ./threadlock_sharing ./developer_notes diff --git a/docs/details/interface_support.rst b/docs/details/interface_support.rst index e3ce880..091582b 100644 --- a/docs/details/interface_support.rst +++ b/docs/details/interface_support.rst @@ -17,13 +17,24 @@ variable-length and user-defined datatypes. Please see : :ref:`data-types`. -Data Scaling, Masking and Compression -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Ncdata does not implement scaling and offset within data arrays : The ".data" +Data Scaling and Masking +^^^^^^^^^^^^^^^^^^^^^^^^ +Ncdata does not implement scaling and offset within variable data arrays : The ".data" array has the actual variable dtype, and the "scale_factor" and "add_offset" attributes are treated like any other attribute. -The existence of a "_FillValue" attribute controls how.. TODO +Likewise, Ncdata does not use masking within its variable data arrays, so that variable +data arrays contain "raw" data, which include any "fill" values -- i.e. 
at any missing +data points you will have a "fill" value rather than a masked point. + +The use of "scale_factor", "add_offset" and "_FillValue" attributes are standard +conventions described in the NetCDF documentation itself, and implemented by NetCDF +library software including the Python netCDF4 library. To ignore these default +interpretations, ncdata has to actually turn these features "off". The rationale for +this, however, is that the low-level unprocessed data content, equivalent to actual +file storage, may be more likely to form a stable common basis of equivalence, particularly +between different system architectures. + .. _file-storage: @@ -33,14 +44,20 @@ The :func:`ncdata.netcdf4.to_nc4` cannot control compression or storage options provided by :meth:`netCDF4.Dataset.createVariable`, which means you can't control the data compression and translation facilities of the NetCDF file library. -If required, you should use :mod:`iris` or :mod:`xarray` for this. +If required, you should use :mod:`iris` or :mod:`xarray` for this, i.e. use +:meth:`xarray.Dataset.to_netcdf` or :func:`iris.save` instead of +:func:`ncdata.netcdf4.to_nc4`, as these provide more special options for controlling +netcdf file creation. File-specific storage aspects, such as chunking, data-paths or compression strategies, are not recorded in the core objects. However, array representations in variable and attribute data (notably dask lazy arrays) may hold such information. -The concept of "unlimited" dimensions is also, arguably an exception. However, this is a -core provision in the NetCDF data model itself (see "Dimension" in the `NetCDF Classic Data Model`_). +The concept of "unlimited" dimensions is also, you might think, outside the abstract +model of NetCDF data and not of concern to Ncdata . 
However, in fact this concept is +present as a core property of dimensions in the classic NetCDF data model (see +"Dimension" in the `NetCDF Classic Data Model`_), so that is why it **is** an essential +property of an NcDimension also. Dask chunking control diff --git a/docs/details/known_issues.rst b/docs/details/known_issues.rst index d6071cd..af15887 100644 --- a/docs/details/known_issues.rst +++ b/docs/details/known_issues.rst @@ -1,3 +1,5 @@ +.. _known-issues: + Outstanding Issues ================== @@ -21,6 +23,12 @@ To be fixed * `issue#66 `_ +* in conversion to/from netCDF4 files + + * netCDF4 performs automatic encoding/decoding of byte data to characters, triggered + by the existence of an ``_Encoding`` attribute on a character type variable. + Ncdata does not currently account for this, and may fail to read/write correctly. + .. _todo: diff --git a/docs/details/threadlock_sharing.rst b/docs/details/threadlock_sharing.rst index ef8e30b..47c8b1f 100644 --- a/docs/details/threadlock_sharing.rst +++ b/docs/details/threadlock_sharing.rst @@ -12,7 +12,7 @@ created from netcdf file data, which it is either computing or storing to an output netcdf file. In short, this is not needed when all your data is loaded with only **one** of the data -packages (Iris, Xarray or ncata). The problem only occurs when you try to +packages (Iris, Xarray or ncdata). The problem only occurs when you try to realise/calculate/save results which combine data loaded from a mixture of sources. sample code:: diff --git a/docs/userdocs/user_guide/_snippets.rst b/docs/userdocs/user_guide/_snippets.rst deleted file mode 100644 index f291d76..0000000 --- a/docs/userdocs/user_guide/_snippets.rst +++ /dev/null @@ -1,89 +0,0 @@ -Snippets -======== - -Notes and writeups of handy description areas, that don't yet have a home. 
- -Data component (NameMap) dictionaries -------------------------------------- -For all of these properties, dictionary-style behaviour means that its ``.keys()`` -is a sequence of the content names, and ``.values()`` is a sequence of the contained -objects. - - -NcData ------- -The :class:`~ncdata.NcData` class represents either a dataset or group, -the structures of these are identical. - -NcAttributes ------------- -attributes are stored as NcAttribute objects, rather than simple name: value maps. -thus an 'attribute' of a NcVariable or NcData is an attribute object, not a value. - -Thus: - - >>> variable.attributes['x'] - NcAttribute('x', [1., 2., 7.]) - -The attribute has a ``.value`` property, but it is most usefully accessed with the -:meth:`~ncdata.NcAttribute.as_python_value()` method : - - >>> attr = NcAttribute('b', [1.]) - >>> attr.value - array([1.]) - >>> attr.as_python_value() - array(1.) - - >>> attr = NcAttribute('a', "this") - >>> attr.value - array('this', dtype='>> attr.as_python_value() - 'this' - -From within a parent object's ``.attributes`` dictionary, - - -Component Dictionaries ----------------------- -ordering -- insert, remove, rename effects -re-ordering - - -As described :ref:`above `, sub-components are stored under their names -in a dictionary container. - -Since all components have a name, and are stored by name in the parent property -dictionary (e.g. ``variable.attributes`` or ``data.dimensions``), the component -dictionaries have an :meth:`~ncdata.NameMap.add` method, which works with the component -name. -supported operations -^^^^^^^^^^^^^^^^^^^^ -standard dict methods : del, getitem, setitem, clear, append, extend -extra methods : add, addall - -ordering -^^^^^^^^ -For Python dictionaries in general, -since `announced in Python 3.7 `_, -the order of the entries is now a significant and stable feature of Python dictionaries. 
-There -Also as for Python dictionaries generally, there is no particular assistance for -managing or using the order. The following may give some indication: - -extract 'n'th item: ``data.variables[list(elelments.keys())[n]]`` -sort the list: - # get all the contents, sorted by name - content = list(data.attributes.values()) - content = sorted(content, key= lambda v: v.name) - # clear the container -- necessary to forget the old ordering - data.attributes.clear() - # add all back in, in the new order - data.attributes.addall(content) - -New entries are added last, and renamed entries retain their - -The :meth:`~ncdata.utils/dataset_differences` method reports differences in the -ordering of components (unless turned off). - - diff --git a/docs/userdocs/user_guide/common_operations.rst b/docs/userdocs/user_guide/common_operations.rst index 91316cd..43a0ae3 100644 --- a/docs/userdocs/user_guide/common_operations.rst +++ b/docs/userdocs/user_guide/common_operations.rst @@ -45,11 +45,42 @@ Example : ``dataset.variables.rename("x", "y")`` Copying ------- -All core objects support a ``.copy()`` method, which however does not copy array content -(e.g. variable data or attribute arrays). See for instance :meth:`ncdata.NcData.copy`. +All core objects support a ``.copy()`` method. See for instance +:meth:`ncdata.NcData.copy`. -There is also a utility function :func:`ncdata.utils.ncdata_copy`, this is effectively -the same as the NcData object copy. +These however do *not* copy variable data arrays (either real or lazy), but produce new +(copied) variables referencing the same arrays. So, for example: + +.. code-block:: + + >>> Construct a simple test dataset + >>> ds = NcData( + ... dimensions=[NcDimension('x', 12)], + ... variables=[NcVariable('vx', ['x'], np.ones(12))] + ... ) + + >>> # Make a copy + >>> ds_copy = ds.copy() + + >>> # The new dataset has a new matching variable with a matching data array + >>> # The variables are different .. 
+ >>> ds_copy.variables['vx'] is ds.variables['vx'] + False + >>> # ... but the arrays are THE SAME ARRAY + >>> ds_copy.variables['vx'].data is ds.variables['vx'].data + True + + >>> # So changing one actually CHANGES THE OTHER ... + >>> ds.variables['vx'].data[6:] = 777 + >>> ds_copy.variables['vx'].data + array([1., 1., 1., 1., 1., 1., 777., 777., 777., 777., 777., 777.]) + +If needed you can of course replace variable data with copies yourself, since you can +freely assign to ``.data``. +For real data, this is just ``var.data = var.data.copy()``. + +There is also a utility function :func:`ncdata.utils.ncdata_copy` : This is +effectively the same thing as the NcData object :meth:`~ncdata.NcData.copy` method. Equality Checking diff --git a/docs/userdocs/user_guide/data_objects.rst b/docs/userdocs/user_guide/data_objects.rst index 051537a..22d4f6c 100644 --- a/docs/userdocs/user_guide/data_objects.rst +++ b/docs/userdocs/user_guide/data_objects.rst @@ -34,7 +34,7 @@ Notes : :class:`~ncdata.NcData` ^^^^^^^^^^^^^^^^^^^^^^^ -This represents a dataset containing variables, attributes and groups. +This represents a dataset containing variables, dimensions, attributes and groups. It is also used to represent groups. :class:`~ncdata.NcDimension` @@ -168,9 +168,7 @@ Thus to fetch an attribute you might write, for example one of these : but **not** ``unit = dataset.variables['x'].attributes['attr1']`` -And not ``unit = dataset.variables['x'].attributes['attr1']`` - -Or, likewise, to ***set*** values, one of +Or, likewise, to **set** values, one of .. code-block:: diff --git a/docs/userdocs/user_guide/general_topics.rst b/docs/userdocs/user_guide/general_topics.rst index 9620261..e7ba042 100644 --- a/docs/userdocs/user_guide/general_topics.rst +++ b/docs/userdocs/user_guide/general_topics.rst @@ -18,16 +18,18 @@ all use a subset of numpy **dtypes**, compatible with netcdf datatypes. 
These are effectively those defined by `netcdf4-python `_, and this therefore also effectively determines what we see in `dask arrays `_ . -However, at present ncdata directly supports only the so-called "Primitive Types" of the -`NetCDF Enhanced Data Model`_ : :ref:`data-model`. -So, this does ***not*** include the user-defined, enumerated or variable-length datatypes. +However, at present ncdata directly supports only the so-called "Primitive Types" of the NetCDF "Enhanced Data Model". +So, it does **not** include the user-defined, enumerated or variable-length datatypes. .. attention:: - In practice, we have found that at least variables of the variable-length "string" datatype do seem to function + In practice, we have found that at least variables of the variable-length "string" datatype **do** seem to function correctly at present, but this is not officially supported, and not currently tested. - We hope to extend support to the more general `NetCDF Enhanced Data Model`_ in future. + See also : :ref:`howto_load_variablewidth_strings` + + We hope to extend support to the more general `NetCDF Enhanced Data Model`_ in future + For reference, the currently supported + tested datatypes are : @@ -43,55 +45,26 @@ For reference, the currently supported + tested datatypes are : * double = numpy "f8" * char = numpy "U1" -.. _NetCDF Classic Data Model: https://docs.unidata.ucar.edu/netcdf-c/current/netcdf_data_model.html#classic_model - -.. _NetCDF Enhanced Data Model: https://docs.unidata.ucar.edu/netcdf-c/current/netcdf_data_model.html#enhanced_model - - -.. _string-and-character-data: - -Character and String Data Handling ----------------------------------- -NetCDF can contain string and character data in at least 3 different contexts : -Characters in Data Component Names -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(i.e. 
groups, variables, attributes or dimensions) +Character and String Data +------------------------- +String and character data occurs in at least 3 different places : -Since NetCDF version 4, the names of components within files are fully unicode compliant -and can use virtually ***any*** characters, with the exception of the forward slash "/" -( since in some technical cases a component name needs to specified as a "path-like" compound ) +1. in names of components (e.g. variables) +2. in string attributes +3. in character-array data variables -Characters in Variable Data -^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Character data in variable *data* arrays are generally stored as fixed-length arrays of -characters (i.e. fixed-width strings), and no unicode interpretation is applied by the -libraries (neither netCDF4 or ncdata). In this case, the strings appear in Python as -numpy character arrays of dtype "`. +NetCDF4 does also have provision for variable-length strings as an elemental type, +which you can have arrays of, but ncdata does not yet properly support this. -Characters in Attribute Values -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Character data in string *attribute* values can be written simply as Python -strings. They are stored in an :class:`~ncdata.NcAttribute`'s ``.value`` as a -character array of dtype ">> variable.get_attr("x") + >>> variable.get_attrval("x") 3.0 - >>> dataset.get_attr("context") + >>> dataset.get_attrval("context") "Results from experiment A301.7" - >>> dataset.variables["q"].get_attr("level_settings") + >>> dataset.variables["q"].get_attrval("level_settings") [1.0, 2.5, 3.7] **Given an isolated** :class:`ncdata.NcAttribute` **instance** : @@ -139,7 +139,7 @@ which produces the same results as the above. >>> variable.attributes[myname].get_python_value() 3.0 -.. Note:: +.. Warning:: **Why Not Just...** use ``NcAttribute.value`` ? @@ -168,27 +168,24 @@ All attributes are writeable, and the type can be freely changed. .. 
code-block:: python >>> variable.set_attr("x", 3.) - >>> variable.get_attr("x") + >>> variable.get_attrval("x") 3.0 >>> variable.set_attr("x", "string-value") - >>> variable.get_attr("x") + >>> variable.get_attrval("x") "string-value" -.. Note:: - - **Why Not Just...** set ``NcAttribute.value`` directly ? - - For example +**Or** if you already have an attribute object in hand, you can simply set +``attribute.value`` directly : this a property with controlled access, so the +assigned value is cast with :func:`numpy.asarray`. - .. code-block:: python +For example - >>> data.variables["x"].attributes["q"].value = 4.2 +.. code-block:: python - This is generally unwise, because the ``.value`` should always be a numpy - :class:`~numpy.ndarray` array, with a suitable ``dtype``, but the - :class:`~ncdata.Ncattribute` type does not currently enforce this. - The ``set_attrval`` method both converts for convenience, and ensures that the - value is stored in a valid form. + >>> attr = data.variables["x"].attributes["q"] + >>> attr.value = 4.2 + >>> print(attr.value) + array(4.2) .. _howto_create_attr: @@ -259,6 +256,7 @@ holds a data array. >>> var.data = np.array([1, 2]) >>> print(var.data) + array([1, 2]) This may be either a :class:`numpy.ndarray` (real) or a :class:`dask.array.Array` (lazy) array. If the data is converted from another source (file, iris or xarray), @@ -392,7 +390,24 @@ passed using specific dictionary keywords, e.g. Combine data from different input files into one output ------------------------------------------------------- -This can be +This can be easily done by pasting elements from two sources into one output dataset. + +You can freely modify a loaded dataset, since it is no longer connected to the input +file. + +Just be careful that any shared dimensions match. + +.. 
code-block:: python
+
+    >>> from ncdata.netcdf4 import from_nc4, to_nc4
+    >>> data = from_nc4('input1.nc')
+    >>> data2 = from_nc4('input2.nc')
+    >>> # Add some known variables from file2 into file1
+    >>> wanted = ('x1', 'x2', 'x3')
+    >>> for name in wanted:
+    ...     data.variables.add(data.variables[name])
+    ...
+    >>> to_nc4(data, 'output.nc')
 
 
 Create a brand-new dataset
@@ -404,20 +419,20 @@ Contents and components can be attached on creation ...
 .. code-block:: python
 
     >>> data = NcData(
-    >>>     dimensions=[NcDimension("y", 2), NcDimension("x", 3)],
-    >>>     variables=[
-    >>>         NcVariable("y", ("y",), data=[0, 1]),
-    >>>         NcVariable("x", ("x",), data=[0, 1, 2]),
-    >>>         NcVariable(
-    >>>             "vyx", ("y", "x"),
-    >>>             data=np.zeros((2, 3)),
-    >>>             attributes=[
-    >>>                 NcAttribute("long_name", "rate"),
-    >>>                 NcAttribute("units", "m s-1")
-    >>>             ]
-    >>>         )],
-    >>>     attributes=[NcAttribute("history", "imaginary")])
-    ...
+    ...     dimensions=[NcDimension("y", 2), NcDimension("x", 3)],
+    ...     variables=[
+    ...         NcVariable("y", ("y",), data=[0, 1]),
+    ...         NcVariable("x", ("x",), data=[0, 1, 2]),
+    ...         NcVariable(
+    ...             "vyx", ("y", "x"),
+    ...             data=np.zeros((2, 3)),
+    ...             attributes=[
+    ...                 NcAttribute("long_name", "rate"),
+    ...                 NcAttribute("units", "m s-1")
+    ...             ]
+    ...         )],
+    ...     attributes=[NcAttribute("history", "imaginary")]
+    ...     )
 
     >>> print(data)
     dimensions:
@@ -426,7 +441,16 @@ Contents and components can be attached on creation ...
         variables:
-            ...
+            
+            
+            
+        global attributes:
+            :history = 'imaginary'
+        >
+
+    >>>
 
 ... or added iteratively ...
@@ -450,6 +474,31 @@ Contents and components can be attached on creation ...
 
 Remove or rewrite specific attributes
 -------------------------------------
+Load an input dataset with :func:`ncdata.netcdf4.from_nc4`.
+
+Then you can modify, add or remove global and variable attributes at will.
+
+For example :
+
+.. 
code-block:: python
+
+    >>> from ncdata.netcdf4 import from_nc4, to_nc4
+    >>> ds = from_nc4('input.nc4')
+    >>> history = ds.get_attrval("history") if "history" in ds.attributes else ""
+    >>> ds.set_attrval("history", history + ": modified to SPEC-FIX.A")
+    >>> removes = ("grid_x", "review")
+    >>> for name in removes:
+    ...     if name in ds.attributes:
+    ...         del ds.attributes[name]
+    ...
+    >>> for var in ds.variables.values():
+    ...     if "coords" in var.attributes:
+    ...         var.attributes.rename("coords", "coordinates")  # common non-CF problem
+    ...     units = var.get_attrval("units")
+    ...     if units and units == "ppm":
+    ...         var.set_attrval("units", "1.e-6")  # another common non-CF problem
+    ...
+    >>> to_nc4(ds, "output_fixed.nc")
 
 
 Save selected variables to a new file
diff --git a/lib/ncdata/dataset_like.py b/lib/ncdata/dataset_like.py
index 82b7383..55af241 100644
--- a/lib/ncdata/dataset_like.py
+++ b/lib/ncdata/dataset_like.py
@@ -212,7 +212,7 @@ class Nc4VariableLike(_Nc4DatalikeWithNcattrs):
     * The ``._data_array`` property read/write interface then applies/removes any
       scaling and masking as it is to be "seen" from the outside.
 
-    That suits how *Iris* reads netCFD4 data, but it won't work if the user wants to
+    That suits how *Iris* reads netCDF4 data, but it won't work if the user wants to
     control the masking/saving behaviour, as you can do in netCDF4.
     Thus, at present, we do *not* provide any of the
     ``set_auto_mask/scale/maskandscale()`` methods.
 
From 06cd859483934d398909b6982ea90d66fb37e691 Mon Sep 17 00:00:00 2001
From: Patrick Peglar 
Date: Wed, 12 Feb 2025 00:57:02 +0000
Subject: [PATCH 16/25] Fixes to data types documentation. 
--- docs/userdocs/user_guide/general_topics.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/userdocs/user_guide/general_topics.rst b/docs/userdocs/user_guide/general_topics.rst
index e7ba042..f8872fa 100644
--- a/docs/userdocs/user_guide/general_topics.rst
+++ b/docs/userdocs/user_guide/general_topics.rst
@@ -36,14 +36,14 @@ For reference, the currently supported + tested datatypes are :
 * unsigned byte = numpy "u1"
 * unsigned short = numpy "u2"
 * unsigned int = numpy "u4"
-* unsigned long = numpy "u4"
+* unsigned int64 = numpy "u8"
 * byte = numpy "i1"
 * short = numpy "i2"
 * int = numpy "i4"
-* long = numpy "i8"
+* int64 = numpy "i8"
 * float = numpy "f4"
 * double = numpy "f8"
-* char = numpy "U1"
+* char = numpy "S1"
 
 
 Character and String Data
From 4e563c1211688285dd261b9680f280af0496cd27 Mon Sep 17 00:00:00 2001
From: Patrick Peglar 
Date: Wed, 12 Feb 2025 01:04:46 +0000
Subject: [PATCH 17/25] Fix external link.

---
 docs/userdocs/user_guide/data_objects.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/userdocs/user_guide/data_objects.rst b/docs/userdocs/user_guide/data_objects.rst
index 22d4f6c..dc81ab4 100644
--- a/docs/userdocs/user_guide/data_objects.rst
+++ b/docs/userdocs/user_guide/data_objects.rst
@@ -9,7 +9,7 @@ Data Classes
 ------------
 The data model components are elements of the
 `NetCDF Classic Data Model`_ , plus **groups** (from the
-`"enhanced" netCDF data model `_ ).
+`"enhanced" netCDF data model`_ ).
 That is, a Dataset(File) consists of just Dimensions, Variables, Attributes and
 Groups.
 
@@ -266,4 +266,4 @@ Relationship to File Storage
 See :ref:`file-storage`
 
 .. _NetCDF Classic Data Model: https://docs.unidata.ucar.edu/netcdf-c/current/netcdf_data_model.html#classic_model
-.. _NetCDF Enhanced Data Model: https://docs.unidata.ucar.edu/netcdf-c/current/netcdf_data_model.html#enhanced_model \ No newline at end of file
+.. 
_"enhanced" netCDF data model: https://docs.unidata.ucar.edu/netcdf-c/current/netcdf_data_model.html#enhanced_model \ No newline at end of file From 41701f9a994adc21a8dd6655c95d4b3488d300ae Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 12 Feb 2025 01:16:02 +0000 Subject: [PATCH 18/25] Fix list of core object container properties. --- docs/userdocs/user_guide/common_operations.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/userdocs/user_guide/common_operations.rst b/docs/userdocs/user_guide/common_operations.rst index 43a0ae3..d81e519 100644 --- a/docs/userdocs/user_guide/common_operations.rst +++ b/docs/userdocs/user_guide/common_operations.rst @@ -3,8 +3,8 @@ Common Operations ================= A group of common operations are available on all the core component types, -i.e. the operations of extract/remove/insert/rename/copy on the ``.datasets``, ``.groups``, -``.dimensions``, ``.variables`` and ``.attributes`` properties of the core objects. +i.e. the operations of extract/remove/insert/rename/copy on the ``.dimensions``, +``.variables``, ``.attributes`` and ``.groups`` properties of core objects. Most of these are hopefully "obvious" Pythonic methods of the container objects. From e5007f17469fc57d510c6b91bcf2b9fa50dc14ed Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 12 Feb 2025 17:39:11 +0000 Subject: [PATCH 19/25] Fix bad formatting on installation page. 
--- .../userdocs/getting_started/installation.rst | 21 ++++++++++++++++--- .../userdocs/getting_started/introduction.rst | 5 ++--- .../userdocs/user_guide/common_operations.rst | 2 +- 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/docs/userdocs/getting_started/installation.rst b/docs/userdocs/getting_started/installation.rst index 2be9443..1f5df84 100644 --- a/docs/userdocs/getting_started/installation.rst +++ b/docs/userdocs/getting_started/installation.rst @@ -4,13 +4,28 @@ Ncdata is available on PyPI and conda-forge Install from conda-forge with conda ----------------------------------- -Like this:: - conda install -c conda-forge ncdata +Like this: + +.. code-block:: bash + + $ conda install -c conda-forge ncdata Install from PyPI with pip -------------------------- -Like this:: +Like this: + +.. code-block:: bash + pip install ncdata +Check install +^^^^^^^^^^^^^ + +.. code-block:: bash + + $ python -c "from ncdata import NcData; print(NcData())" + + > + diff --git a/docs/userdocs/getting_started/introduction.rst b/docs/userdocs/getting_started/introduction.rst index f7dec43..3b9eace 100644 --- a/docs/userdocs/getting_started/introduction.rst +++ b/docs/userdocs/getting_started/introduction.rst @@ -177,13 +177,12 @@ There is also a 'rename' method of variables/attributes/groups:: > > -.. _renaming_dimensions: .. warning:: Renaming a :class:`~ncdata.NcDimension` within a :class:`~ncdata.NcData` - does *not* adjust the variables which reference it, since a variables' + does *not* adjust the variables which reference it, since a variable's :attr:`~ncdata.NcVariable.dimensions` is a simple list of names. - See : `renaming_dimensions`_ , also :func:`ncdata.utils.save_errors`. + See : `howto_rename_dimension`_ , also :func:`ncdata.utils.save_errors`. 
Converting between data formats diff --git a/docs/userdocs/user_guide/common_operations.rst b/docs/userdocs/user_guide/common_operations.rst index d81e519..9aa4cc8 100644 --- a/docs/userdocs/user_guide/common_operations.rst +++ b/docs/userdocs/user_guide/common_operations.rst @@ -28,7 +28,7 @@ Example : ``dataset.variables.add(NcVariable("x", dimensions=["x"], data=my_data An :meth:`~ncdata.NcAttribute` can also be added or set (if already present) with the special :meth:`~ncdata.NameMap.set_attrval` method. -Example : ``dataset.variables["x"].set_attrval["units", "m s-1")`` +Example : ``dataset.variables["x"].set_attrval("units", "m s-1")`` Rename ------ From a9afc60336c776885a177b0678c129b40384a291 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 12 Feb 2025 18:21:14 +0000 Subject: [PATCH 20/25] More review changes + tweaks. --- docs/details/threadlock_sharing.rst | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/docs/details/threadlock_sharing.rst b/docs/details/threadlock_sharing.rst index 47c8b1f..bebecfd 100644 --- a/docs/details/threadlock_sharing.rst +++ b/docs/details/threadlock_sharing.rst @@ -15,7 +15,9 @@ In short, this is not needed when all your data is loaded with only **one** of t packages (Iris, Xarray or ncdata). The problem only occurs when you try to realise/calculate/save results which combine data loaded from a mixture of sources. -sample code:: +sample code: + +.. code-block:: python from ncdata.threadlock_sharing import enable_lockshare, disable_lockshare from ncdata.xarray import from_xarray @@ -31,7 +33,9 @@ sample code:: disable_lockshare() -or:: +... *or* ... + +.. code-block:: python with lockshare_context(iris=True): ncdata = NcData(source_filepath) @@ -59,16 +63,17 @@ sharing of large data arrays in memory. 
However, the python netCDF4 library (and the underlying C library) is not threadsafe (re-entrant) by design, neither does it implement any thread locking itself, therefore the “netcdf fetch” call in each input operation must be guarded by a mutex. -Thus contention is possible unless controlled by the calling packages. +Thus, contention is possible unless controlled by the calling packages. -*Each* of Xarray, Iris and ncdata itself create input data tasks to fetch sections of +Each of Xarray, Iris and ncdata create input data tasks to fetch sections of data from the input files. Each uses a mutex lock around netcdf accesses in those tasks, to stop them accessing the netCDF4 interface at the same time as any of the others. -This works beautifully until ncdata connects lazy data loaded with Iris (say) with -lazy data loaded from Xarray, which unfortunately are using their own separate mutexes -to protect the same netcdf library. Then, when we attempt to calculate or save this -result, we may get sporadic and unpredictable system-level errors, even a core-dump. +This works beautifully until ncdata connects (for example) lazy data loaded *with Iris* +with lazy data loaded *from Xarray*. These would then unfortunately each be using their +own *separate* mutexes to protect the same netcdf library. So, if we then attempt to +calculate or save the result, which combines data from both sources, we could get +sporadic and unpredictable system-level errors, even a core-dump type failure. So, the function of :mod:`ncdata.threadlock_sharing` is to connect the thread-locking schemes of the separate libraries, so that they cannot accidentally overlap an access From d526b0cd40a0398360191fd070b69cea73fc34d4 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 12 Feb 2025 18:31:45 +0000 Subject: [PATCH 21/25] Include basic changelog update in the release process docs. 
--- docs/change_log.rst | 5 +++++ docs/details/developer_notes.rst | 6 ++++++ 2 files changed, 11 insertions(+) diff --git a/docs/change_log.rst b/docs/change_log.rst index e846bda..611ae32 100644 --- a/docs/change_log.rst +++ b/docs/change_log.rst @@ -1,6 +1,10 @@ +.. _change_log: + Versions and Change Notes ========================= +.. _development_status: + Project Development Status -------------------------- We intend to follow `PEP 440 `_, @@ -13,6 +17,7 @@ This is a complete implementation, with functional operational of all public API The code is however still experimental, and APIs are not stable (hence no major version yet). +.. _change_notes: Change Notes ------------ diff --git a/docs/details/developer_notes.rst b/docs/details/developer_notes.rst index 23c0708..7271394 100644 --- a/docs/details/developer_notes.rst +++ b/docs/details/developer_notes.rst @@ -28,6 +28,12 @@ Documentation build Release actions --------------- +#. Update the :ref:`change_log` page in the details section + + #. ensure all major changes + PRs are referenced in the :ref:`change_notes` section + + #. update the "latest version" stated in the :ref:`development_status` section + #. Cut a release on GitHub : this triggers a new docs version on [ReadTheDocs](https://readthedocs.org/projects/ncdata/) #. Build the distribution From 12eb3a28f69896c15a3fefd850b882a5da9d08da Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 12 Feb 2025 18:47:06 +0000 Subject: [PATCH 22/25] Fix code blocks in introduction. 
--- .../userdocs/getting_started/introduction.rst | 38 +++++++++++++++---- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/docs/userdocs/getting_started/introduction.rst b/docs/userdocs/getting_started/introduction.rst index 3b9eace..d89f743 100644 --- a/docs/userdocs/getting_started/introduction.rst +++ b/docs/userdocs/getting_started/introduction.rst @@ -31,7 +31,9 @@ Simple data creation The :class:`ncdata.NcData` object is the basic container, representing a dataset or group. It contains :attr:`~ncdata.NcData.dimensions`, :attr:`~ncdata.NcData.variables`, :attr:`~ncdata.NcData.groups`, -and :attr:`~ncdata.NcData.attributes`:: +and :attr:`~ncdata.NcData.attributes`: + +.. code-block:: python >>> from ncdata import NcData, NcDimension, NcVariable >>> data = NcData("myname") @@ -58,7 +60,9 @@ Getting data to+from files The :mod:`ncdata.netcdf4` module provides simple means of reading and writing NetCDF files via the `netcdf4-python package `_. -Simple example:: +Simple example: + +.. code-block:: python >>> from ncdata.netcdf4 import to_nc4, from_nc4 @@ -85,7 +89,9 @@ Please see `Converting between data formats`_ for more details. Variables ^^^^^^^^^ Variables live in a :attr:`ncdata.NcData.variables` attribute, -which behaves like a dictionary:: +which behaves like a dictionary: + +.. code-block:: python >>> var = NcVariable("vx", dimensions=["x"], dtype=float) >>> data.variables.add(var) @@ -109,7 +115,9 @@ which behaves like a dictionary:: Attributes ^^^^^^^^^^ Variables live in the ``attributes`` property of a :class:`~ncdata.NcData` -or :class:`~ncdata.NcVariable`:: +or :class:`~ncdata.NcVariable`: + +.. code-block:: python >>> var.set_attrval('a', 1) NcAttribute('a', 1) @@ -150,7 +158,9 @@ and :meth:`~ncdata.NcVariable.get_attrval` of NcData/NcVariable. Deletion and Renaming ^^^^^^^^^^^^^^^^^^^^^ -Use python 'del' operation to remove:: +Use python 'del' operation to remove: + +.. 
code-block:: python >>> del var.attributes['a'] >>> print(var) @@ -158,7 +168,9 @@ Use python 'del' operation to remove:: vx:b = 'this' > -There is also a 'rename' method of variables/attributes/groups:: +There is also a 'rename' method of variables/attributes/groups: + +.. code-block:: python >>> var.attributes.rename("b", "qq") >>> print(var) @@ -182,7 +194,7 @@ There is also a 'rename' method of variables/attributes/groups:: Renaming a :class:`~ncdata.NcDimension` within a :class:`~ncdata.NcData` does *not* adjust the variables which reference it, since a variable's :attr:`~ncdata.NcVariable.dimensions` is a simple list of names. - See : `howto_rename_dimension`_ , also :func:`ncdata.utils.save_errors`. + See : :ref:`howto_rename_dimension` , also :func:`ncdata.utils.save_errors`. Converting between data formats @@ -216,21 +228,31 @@ at :ref:`interface_support`. Example code snippets : +.. code-block:: python + >>> from ndata.threadlock_sharing import enable_lockshare >>> enable_lockshare(iris=True, xarray=True) +.. code-block:: python + >>> from ncdata.netcdf import from_nc4 >>> ncdata = from_nc4("datapath.nc") +.. code-block:: python + >>> from ncdata.iris import to_iris, from_iris >>> xx, yy = to_iris(ncdata, ['x_wind', 'y_wind']) >>> vv = (xx * xx + yy * yy) ** 0.5 >>> vv.units = xx.units +.. code-block:: python + >>> from ncdata.xarray import to_xarray >>> xrds = to_xarray(from_iris(vv)) >>> xrds.to_zarr(out_path) +.. code-block:: python + >>> from ncdata.iris_xarray import cubes_from_xarray >>> vv2 = cubes_from_xarray(xrds) >>> assert vv2 == vv @@ -245,6 +267,8 @@ Thread safety prevent possible errors when computing or saving lazy data. For example: + .. 
code-block:: python + >>> from ndata.threadlock_sharing import enable_lockshare >>> enable_lockshare(iris=True, xarray=True) From af28511adcdb11f804ddfb890f624b9b3223d0b7 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 19 Feb 2025 08:46:39 +0000 Subject: [PATCH 23/25] Update docs/userdocs/user_guide/general_topics.rst Co-authored-by: Martin Yeo <40734014+trexfeathers@users.noreply.github.com> --- docs/userdocs/user_guide/general_topics.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/userdocs/user_guide/general_topics.rst b/docs/userdocs/user_guide/general_topics.rst index f8872fa..ea890aa 100644 --- a/docs/userdocs/user_guide/general_topics.rst +++ b/docs/userdocs/user_guide/general_topics.rst @@ -57,7 +57,7 @@ String and character data occurs in at least 3 different places : Very briefly : * types (1) and (2) are equivalent to Python strings and may include unicode -* type (2) are equivalent to character (byte) arrays, and normally represent only +* type (3) are equivalent to character (byte) arrays, and normally represent only fixed-length strings with the length being given as a file dimension. NetCDF4 does also have provision for variable-length strings as an elemental type, From 531a3f66e5a1417d3e8c2f07ee3d81bc6267173f Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 19 Feb 2025 08:49:12 +0000 Subject: [PATCH 24/25] Update docs/userdocs/user_guide/howtos.rst Co-authored-by: Martin Yeo <40734014+trexfeathers@users.noreply.github.com> --- docs/userdocs/user_guide/howtos.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/userdocs/user_guide/howtos.rst b/docs/userdocs/user_guide/howtos.rst index 74ecf34..74caa5f 100644 --- a/docs/userdocs/user_guide/howtos.rst +++ b/docs/userdocs/user_guide/howtos.rst @@ -405,9 +405,9 @@ Just be careful that any shared dimensions match. >>> # Add some known variables from file2 into file1 >>> wanted = ('x1', 'x2', 'x3') >>> for name in wanted: - ... 
data.variables.add(data.variables[name]) + ... data.variables.add(data2.variables[name]) ... - >>> to_nc4('output.nc') + >>> to_nc4(data, 'output.nc') Create a brand-new dataset From 695d46396891d3c9faee8e79cf8ef5e1aaca6405 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 19 Feb 2025 10:33:05 +0000 Subject: [PATCH 25/25] Fix some api reference links. --- docs/details/threadlock_sharing.rst | 6 +++--- lib/ncdata/utils/_compare_nc_datasets.py | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/details/threadlock_sharing.rst b/docs/details/threadlock_sharing.rst index bebecfd..f07e026 100644 --- a/docs/details/threadlock_sharing.rst +++ b/docs/details/threadlock_sharing.rst @@ -49,11 +49,11 @@ Background In practice, Iris, Xarray and Ncdata are all capable of scanning netCDF files and interpreting their metadata, while not reading all the core variable data contained in them. -This generates objects containing `Dask arrays `_ with deferred access -to bulk file data for later access, with certain key benefits : +This generates objects containing Dask :class:`~dask.array.Array`\s, which provide +deferred access to bulk data in files, with certain key benefits : * no data loading or calculation happens until needed -* the work is divided into sectional ‘tasks’, of which only some may ultimately be needed +* the work is divided into sectional "tasks", of which only some may ultimately be needed * it may be possible to perform multiple sections of calculation (including data fetch) in parallel * it may be possible to localise operations (fetch or calculate) near to data distributed across a cluster diff --git a/lib/ncdata/utils/_compare_nc_datasets.py b/lib/ncdata/utils/_compare_nc_datasets.py index fe3e469..28f5d28 100644 --- a/lib/ncdata/utils/_compare_nc_datasets.py +++ b/lib/ncdata/utils/_compare_nc_datasets.py @@ -34,8 +34,9 @@ def dataset_differences( r""" Compare two netcdf datasets. 
- Accepts paths, pathstrings, open :class:`netCDF4.Dataset`\s or :class:`NcData` - objects. File paths are opened with :mod:`netCDF4`. + Accepts paths, pathstrings, open :class:`netCDF4.Dataset`\s or + :class:`~ncdata.NcData` objects. + File paths are opened with the :mod:`netCDF4` module. Parameters ----------