diff --git a/pyproject.toml b/pyproject.toml
index d3ad1e6..b2c1d36 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,7 +30,7 @@ dependencies = [
     "pandas>=2",
     "numpy>=2",
     "xproj>=0.2.0",
-    "xarray>=2025",
+    "xarray @ git+https://github.com/dcherian/xarray.git@fix-coord-transform-indexing",
 ]
 dynamic=["version"]
 
@@ -75,6 +75,9 @@ docs = [
 
 [tool.hatch]
 version.source = "vcs"
 
+[tool.hatch.metadata]
+allow-direct-references = true
+
 [tool.hatch.build]
 hooks.vcs.version-file = "src/rasterix/_version.py"
 
diff --git a/src/rasterix/raster_index.py b/src/rasterix/raster_index.py
index 3a7485b..d260f63 100644
--- a/src/rasterix/raster_index.py
+++ b/src/rasterix/raster_index.py
@@ -310,17 +310,20 @@ def isel(  # type: ignore[override]
     def sel(self, labels, method=None, tolerance=None):
         coord_name = self.axis_transform.coord_name
         label = labels[coord_name]
-
+        transform = self.axis_transform
         if isinstance(label, slice):
-            if label.start is None:
-                label = slice(0, label.stop, label.step)
+            label = slice(
+                label.start if label.start is not None else transform.forward({coord_name: 0})[coord_name],
+                label.stop if label.stop is not None else transform.forward({coord_name: transform.size})[coord_name],
+                label.step,
+            )
             if label.step is None:
                 # continuous interval slice indexing (preserves the index)
                 pos = self.transform.reverse({coord_name: np.array([label.start, label.stop])})
                 # np.round rounds to even, this way we round upwards
                 pos = np.floor(pos[self.dim] + 0.5).astype("int")
                 new_start = max(pos[0], 0)
-                new_stop = min(pos[1], self.axis_transform.size)
+                new_stop = min(pos[1] + 1, self.axis_transform.size)
                 return IndexSelResult({self.dim: slice(new_start, new_stop)})
             else:
                 # otherwise convert to basic (array) indexing
diff --git a/src/rasterix/strategies.py b/src/rasterix/strategies.py
new file mode 100644
index 0000000..4c1f03e
--- /dev/null
+++ b/src/rasterix/strategies.py
@@ -0,0 +1,165 @@
+"""Hypothesis strategies for generating label-based indexers."""
+
+from collections.abc import Hashable
+from typing import Any
+
+import numpy as np
+import pandas as pd
+import xarray as xr
+from hypothesis import note
+from hypothesis import strategies as st
+from xarray.core.indexes import Indexes
+from xarray.testing.strategies import (
+    basic_indexers,
+    outer_array_indexers,
+    vectorized_indexers,
+)
+
+
+def pos_to_label_indexer(idx: pd.Index, idxr: int | slice | np.ndarray, *, use_scalar: bool = True) -> Any:
+    """Convert a positional indexer to a label-based indexer.
+
+    Parameters
+    ----------
+    idx : pd.Index
+        The pandas Index to use for label lookup.
+    idxr : int | slice | np.ndarray
+        The positional indexer (integer, slice, or array of integers).
+    use_scalar : bool, optional
+        If True, attempt to convert scalar values to Python scalars. Default is True.
+
+    Returns
+    -------
+    Any
+        The label-based indexer (scalar, slice, or array of labels).
+    """
+    if isinstance(idxr, slice):
+        return slice(
+            None if idxr.start is None else idx[idxr.start],
+            # FIXME: This will never go past the label range
+            None if idxr.stop is None else idx[min(idxr.stop, idx.size - 1)],
+        )
+    elif isinstance(idxr, np.ndarray):
+        # Convert array of position indices to array of label values
+        return idx[idxr].values
+    else:
+        val = idx[idxr]
+        if use_scalar:
+            try:
+                # pass python scalars occasionally
+                val = val.item()
+            except Exception:
+                note(f"casting {val!r} to item() failed")
+                pass
+        return val
+
+
+@st.composite
+def basic_label_indexers(draw, /, *, indexes: Indexes) -> dict[Hashable, float | slice]:
+    """Generate label-based indexers by converting position indexers to labels.
+
+    This works in label space by using the coordinate Index values.
+
+    Parameters
+    ----------
+    draw : callable
+        The Hypothesis draw function (automatically provided by @st.composite).
+    indexes : Indexes
+        Dictionary mapping dimension names to their associated indexes
+
+    Returns
+    -------
+    dict[Hashable, float | slice]
+        Label-based indexers as a dict with keys from the provided indexes.
+ Values are either float (for scalar labels) or slice (for label ranges). + """ + idxs = indexes.get_unique() + assert all(isinstance(idx, xr.indexes.PandasIndex) for idx in idxs) + + # FIXME: this should be indexes.sizes! + sizes = indexes.dims + + pos_indexer = draw(basic_indexers(sizes=sizes)) + pdindexes = indexes.to_pandas_indexes() + + label_indexer = { + dim: pos_to_label_indexer(pdindexes[dim], idx, use_scalar=draw(st.booleans())) + for dim, idx in pos_indexer.items() + } + return label_indexer + + +@st.composite +def outer_array_label_indexers(draw, /, *, indexes: Indexes) -> dict[Hashable, np.ndarray]: + """Generate label-based outer array indexers by converting position indexers to labels. + + This works in label space by using the coordinate Index values. + + Parameters + ---------- + draw : callable + The Hypothesis draw function (automatically provided by @st.composite). + indexes : Indexes + Dictionary mapping dimension names to their associated indexes + + Returns + ------- + dict[Hashable, np.ndarray] + Label-based indexers as a dict with keys from indexes. + Values are numpy arrays of label values for each dimension. + """ + idxs = indexes.get_unique() + assert all(isinstance(idx, xr.indexes.PandasIndex) for idx in idxs) + + # FIXME: this should be indexes.sizes! + sizes = indexes.dims + + pos_indexer = draw(outer_array_indexers(sizes=sizes)) + pdindexes = indexes.to_pandas_indexes() + + label_indexer = { + dim: pos_to_label_indexer(pdindexes[dim], idx, use_scalar=False) for dim, idx in pos_indexer.items() + } + return label_indexer + + +@st.composite +def vectorized_label_indexers(draw, /, *, indexes: Indexes, **kwargs) -> dict[Hashable, xr.DataArray]: + """Generate label-based vectorized indexers by converting position indexers to labels. + + This works in label space by using the coordinate Index values. + + Parameters + ---------- + draw : callable + The Hypothesis draw function (automatically provided by @st.composite). 
+ indexes : Indexes + Dictionary mapping dimension names to their associated indexes + **kwargs : dict + Additional keyword arguments to pass to vectorized_indexers + + Returns + ------- + dict[Hashable, xr.DataArray] + Label-based indexers as a dict with keys from indexes. + Values are DataArrays of label values for each dimension. + """ + idxs = indexes.get_unique() + assert all(isinstance(idx, xr.indexes.PandasIndex) for idx in idxs) + + # FIXME: this should be indexes.sizes! + sizes = indexes.dims + + pos_indexer = draw(vectorized_indexers(sizes=sizes, **kwargs)) + pdindexes = indexes.to_pandas_indexes() + + label_indexer = {} + for dim, idx_array in pos_indexer.items(): + # Convert each position in the array to its corresponding label + # Flatten, index, then reshape back to original shape + flat_indices = idx_array.values.ravel() + flat_labels = pdindexes[dim][flat_indices].values + label_values = flat_labels.reshape(idx_array.shape) + label_indexer[dim] = xr.DataArray(label_values, dims=idx_array.dims) + + return label_indexer diff --git a/tests/test_indexing.py b/tests/test_indexing.py new file mode 100644 index 0000000..ba22e55 --- /dev/null +++ b/tests/test_indexing.py @@ -0,0 +1,191 @@ +#!/usr/bin/env python3 +"""Property tests comparing RasterIndex with PandasIndex for indexing operations.""" + +from collections.abc import Hashable + +import numpy as np +import pytest +import xarray as xr +from affine import Affine +from hypothesis import HealthCheck, given, settings +from hypothesis import strategies as st +from xarray.testing.strategies import ( + basic_indexers, + outer_array_indexers, + vectorized_indexers, +) + +from rasterix import RasterIndex +from rasterix.strategies import ( + basic_label_indexers, + outer_array_label_indexers, + vectorized_label_indexers, +) + + +def is_mixed_scalar_slice_indexer(indexers: dict[Hashable, int | slice]) -> bool: + # TODO: Fix bug in RasterIndex with mixed scalar/slice indexing across dimensions + # When you have 
scalar indexing on one dimension (e.g., y=0) and slice indexing + # on another (e.g., x=slice(None, 1)), RasterIndex.isel() returns None for the + # scalar dimension, dropping that index. This causes xarray to incorrectly handle + # the coordinate variables - the sliced dimension's coordinate (x) maintains + # dims ('x',) even though the data has been reduced by the scalar indexing. + # This results in: "ValueError: dimensions ('x',) must have the same length as + # the number of data dimensions, ndim=0" + # + # Example failing case: raster_da.isel(y=0, x=slice(None, 1)) + # - y=0 causes RasterIndex to return None for y dimension + # - x=slice(None, 1) preserves RasterIndex for x dimension + # - Result: coordinate variable x has wrong dimensionality + # + # For now, filter out these cases using hypothesis.assume() + has_scalar = any(isinstance(v, int | np.integer) for v in indexers.values()) + has_slice = any(isinstance(v, slice) for v in indexers.values()) + return has_scalar and has_slice and len(indexers) > 1 + + +@pytest.fixture +def raster_da(): + """Create a DataArray with RasterIndex coordinates.""" + width, height = 10, 8 + transform = Affine.translation(0, 0) * Affine.scale(1.0, -1.0) + + # Create data + data = np.arange(width * height, dtype=np.float64).reshape(height, width) + + # Create RasterIndex + index = RasterIndex.from_transform( + transform, + width=width, + height=height, + x_dim="x", + y_dim="y", + ) + + # Create coordinates from index + coords = xr.Coordinates.from_xindex(index) + + # Create DataArray + da = xr.DataArray( + data, + dims=("y", "x"), + coords=coords, + name="data", + ) + + return da + + +@pytest.fixture +def pandas_da(raster_da): + """Create a DataArray with PandasIndex by converting from raster_da.""" + x_values = raster_da.x.values + y_values = raster_da.y.values + da = xr.DataArray( + raster_da.values, dims=raster_da.dims, coords={"x": x_values, "y": y_values}, name="data" + ) + return da + + +@given(data=st.data()) 
+@settings(suppress_health_check=[HealthCheck.function_scoped_fixture])
+def test_isel_basic_indexing_equivalence(data, raster_da, pandas_da):
+    """Test that isel produces identical results for RasterIndex and PandasIndex."""
+    sizes = dict(raster_da.sizes)
+    indexers = data.draw(
+        basic_indexers(sizes=sizes).filter(lambda idxr: not is_mixed_scalar_slice_indexer(idxr))
+    )
+    result_raster = raster_da.isel(indexers)
+    result_pandas = pandas_da.isel(indexers)
+    xr.testing.assert_identical(result_raster, result_pandas)
+
+
+@given(data=st.data())
+@settings(
+    deadline=None,
+    suppress_health_check=[HealthCheck.function_scoped_fixture],
+)
+def test_sel_basic_indexing_equivalence(data, raster_da, pandas_da):
+    """Test that sel produces identical results for RasterIndex and PandasIndex."""
+    indexers = data.draw(basic_label_indexers(indexes=pandas_da.xindexes))
+
+    result_raster = raster_da.sel(
+        indexers,
+        method=("nearest" if any(np.isscalar(idxr) for idxr in indexers.values()) else None),
+    )
+    result_pandas = pandas_da.sel(indexers)
+
+    if all(isinstance(idxr, slice) for idxr in indexers.values()):
+        assert all(isinstance(idx, RasterIndex) for idx in result_raster.xindexes.get_unique())
+
+    xr.testing.assert_identical(result_raster, result_pandas)
+
+
+def test_simple_isel(raster_da, pandas_da):
+    """Sanity check: simple indexing operations."""
+    # Scalar indexing
+    xr.testing.assert_identical(raster_da.isel(x=0), pandas_da.isel(x=0))
+    xr.testing.assert_identical(raster_da.isel(y=0), pandas_da.isel(y=0))
+    xr.testing.assert_identical(raster_da.isel(x=0, y=0), pandas_da.isel(x=0, y=0))
+
+    # Slice indexing
+    xr.testing.assert_identical(raster_da.isel(x=slice(2, 5)), pandas_da.isel(x=slice(2, 5)))
+    xr.testing.assert_identical(raster_da.isel(y=slice(1, 4)), pandas_da.isel(y=slice(1, 4)))
+    xr.testing.assert_identical(
+        raster_da.isel(x=slice(2, 5), y=slice(1, 4)),
+        pandas_da.isel(x=slice(2, 5), y=slice(1, 4)),
+    )
+
+    # Array indexing
+    xr.testing.assert_identical(raster_da.isel(x=[0, 2, 4]), pandas_da.isel(x=[0, 2, 4]))
+    xr.testing.assert_identical(raster_da.isel(y=[1, 3]), pandas_da.isel(y=[1, 3]))
+
+
+@given(data=st.data())
+@settings(suppress_health_check=[HealthCheck.function_scoped_fixture])
+def test_outer_array_indexing(data, raster_da, pandas_da):
+    """Test that outer array indexing produces identical results for RasterIndex and PandasIndex."""
+    sizes = dict(raster_da.sizes)
+    indexers = data.draw(outer_array_indexers(sizes=sizes))
+
+    result_raster = raster_da.isel(indexers)
+    result_pandas = pandas_da.isel(indexers)
+
+    xr.testing.assert_identical(result_raster, result_pandas)
+
+
+@given(data=st.data())
+@settings(
+    deadline=None,
+    suppress_health_check=[HealthCheck.function_scoped_fixture],
+)
+def test_outer_array_label_indexing(data, raster_da, pandas_da):
+    """Test that outer array label indexing produces identical results for RasterIndex and PandasIndex."""
+    indexers = data.draw(outer_array_label_indexers(indexes=pandas_da.xindexes))
+    result_raster = raster_da.sel(indexers, method="nearest")
+    result_pandas = pandas_da.sel(indexers, method="nearest")
+    xr.testing.assert_identical(result_raster, result_pandas)
+
+
+@given(data=st.data())
+@settings(max_examples=200, suppress_health_check=[HealthCheck.function_scoped_fixture])
+def test_vectorized_indexing(data, raster_da, pandas_da):
+    """Test that vectorized indexing produces identical results for RasterIndex and PandasIndex."""
+    sizes = dict(raster_da.sizes)
+    indexers = data.draw(vectorized_indexers(sizes=sizes))
+    result_raster = raster_da.isel(indexers)
+    result_pandas = pandas_da.isel(indexers)
+    xr.testing.assert_identical(result_raster, result_pandas)
+
+
+@given(data=st.data())
+@settings(
+    deadline=None,
+    suppress_health_check=[HealthCheck.function_scoped_fixture],
+)
+def test_vectorized_label_indexing(data, raster_da, pandas_da):
+    """Test that vectorized label indexing produces identical results for RasterIndex and PandasIndex."""
+    indexers = data.draw(vectorized_label_indexers(indexes=pandas_da.xindexes))
+    result_raster = raster_da.sel(indexers, method="nearest")
+    result_pandas = pandas_da.sel(indexers, method="nearest")
+    xr.testing.assert_identical(result_raster, result_pandas)