diff --git a/doc/api.rst b/doc/api.rst index d180aa66d25..34d6558ed55 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -632,6 +632,7 @@ DataArray methods DataArray.from_iris DataArray.from_series DataArray.to_cdms2 + DataArray.to_dask_dataframe DataArray.to_dataframe DataArray.to_dataset DataArray.to_dict diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 8f9d22661ac..c74134708e1 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -22,6 +22,8 @@ v2023.05.0 (unreleased) New Features ~~~~~~~~~~~~ +- Added new method :py:meth:`DataArray.to_dask_dataframe`, convert a dataarray into a dask dataframe (:issue:`7409`). + By `Deeksha `_. - Add support for lshift and rshift binary operators (`<<`, `>>`) on :py:class:`xr.DataArray` of type :py:class:`int` (:issue:`7727` , :pull:`7741`). By `Alan Brammer `_. diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 7c2d02f7b8e..7ff3a7765c9 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -57,6 +57,10 @@ from numpy.typing import ArrayLike + try: + from dask.dataframe import DataFrame as DaskDataFrame + except ImportError: + DaskDataFrame = None # type: ignore try: from dask.delayed import Delayed except ImportError: @@ -6888,6 +6892,71 @@ def resample( **indexer_kwargs, ) + def to_dask_dataframe( + self, + dim_order: Sequence[Hashable] | None = None, + set_index: bool = False, + ) -> DaskDataFrame: + """Convert this array into a dask.dataframe.DataFrame. + + Parameters + ---------- + dim_order : Sequence of Hashable or None , optional + Hierarchical dimension order for the resulting dataframe. + Array content is transposed to this order and then written out as flat + vectors in contiguous order, so the last dimension in this list + will be contiguous in the resulting DataFrame. This has a major influence + on which operations are efficient on the resulting dask dataframe. + set_index : bool, default: False + If set_index=True, the dask DataFrame is indexed by this dataset's + coordinate. Since dask DataFrames do not support multi-indexes, + set_index only works if the dataset only contains one dimension. + + Returns + ------- + dask.dataframe.DataFrame + + Examples + -------- + >>> da = xr.DataArray( + ... np.arange(4 * 2 * 2).reshape(4, 2, 2), + ... dims=("time", "lat", "lon"), + ... coords={ + ... "time": np.arange(4), + ... "lat": [-30, -20], + ... "lon": [120, 130], + ... }, + ... name="eg_dataarray", + ... attrs={"units": "Celsius", "description": "Random temperature data"}, + ... ) + >>> da.to_dask_dataframe(["lat", "lon", "time"]).compute() + lat lon time eg_dataarray + 0 -30 120 0 0 + 1 -30 120 1 4 + 2 -30 120 2 8 + 3 -30 120 3 12 + 4 -30 130 0 1 + 5 -30 130 1 5 + 6 -30 130 2 9 + 7 -30 130 3 13 + 8 -20 120 0 2 + 9 -20 120 1 6 + 10 -20 120 2 10 + 11 -20 120 3 14 + 12 -20 130 0 3 + 13 -20 130 1 7 + 14 -20 130 2 11 + 15 -20 130 3 15 + """ + if self.name is None: + raise ValueError( + "Cannot convert an unnamed DataArray to a " + "dask dataframe : use the ``.rename`` method to assign a name." + ) + name = self.name + ds = self._to_dataset_whole(name, shallow_copy=False) + return ds.to_dask_dataframe(dim_order, set_index) + # this needs to be at the end, or mypy will confuse with `str` # https://mypy.readthedocs.io/en/latest/common_issues.html#dealing-with-conflicting-names str = utils.UncachedAccessor(StringAccessor["DataArray"]) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 6f6286d4325..263653e992e 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3245,6 +3245,39 @@ def test_to_dataframe_0length(self) -> None: assert len(actual) == 0 assert_array_equal(actual.index.names, list("ABC")) + @requires_dask + def test_to_dask_dataframe(self) -> None: + arr_np = np.arange(3 * 4).reshape(3, 4) + arr = DataArray(arr_np, [("B", [1, 2, 3]), ("A", list("cdef"))], name="foo") + expected = arr.to_series() + actual = arr.to_dask_dataframe()["foo"] + + assert_array_equal(actual.values, expected.values) + + actual = arr.to_dask_dataframe(dim_order=["A", "B"])["foo"] + assert_array_equal(arr_np.transpose().reshape(-1), actual.values) + + # regression test for coords with different dimensions + + arr.coords["C"] = ("B", [-1, -2, -3]) + expected = arr.to_series().to_frame() + expected["C"] = [-1] * 4 + [-2] * 4 + [-3] * 4 + expected = expected[["C", "foo"]] + actual = arr.to_dask_dataframe()[["C", "foo"]] + + assert_array_equal(expected.values, actual.values) + assert_array_equal(expected.columns.values, actual.columns.values) + + with pytest.raises(ValueError, match="does not match the set of dimensions"): + arr.to_dask_dataframe(dim_order=["B", "A", "C"]) + + arr.name = None + with pytest.raises( + ValueError, + match="Cannot convert an unnamed DataArray", + ): + arr.to_dask_dataframe() + def test_to_pandas_name_matches_coordinate(self) -> None: # coordinate with same name as array arr = DataArray([1, 2, 3], dims="x", name="x")