diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml index d635a852..cba45305 100644 --- a/.github/workflows/build-wheels.yml +++ b/.github/workflows/build-wheels.yml @@ -17,7 +17,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2022, macos-13, macos-14] + os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2022, macos-14, macos-15] steps: - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 2b0b6f61..8d91b791 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -8,6 +8,15 @@ Changelog ========= +4.1.4 - 2025-12-16 +------------------ + +**Other changes:** + +- :func:`tabmat.from_df` now avoids unnecessary copies of dense arrays, but still ensures that the results are contiguous (C or F order). +- We now use `narwhals`' v2 API for data frame handling. + + 4.1.3 - 2025-11-12 ------------------ diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py index 45fe6cb7..85b8f295 100644 --- a/src/tabmat/categorical_matrix.py +++ b/src/tabmat/categorical_matrix.py @@ -509,7 +509,7 @@ def matvec( is_int = np.issubdtype(other.dtype, np.signedinteger) if is_int: - other_m = other.astype(float) # type: ignore + other_m: np.ndarray = other.astype(float) else: other_m = other diff --git a/src/tabmat/constructor.py b/src/tabmat/constructor.py index 631358ab..f47ea25b 100644 --- a/src/tabmat/constructor.py +++ b/src/tabmat/constructor.py @@ -182,7 +182,7 @@ def from_df( if dense_dfidx: matrices.append( DenseMatrix( - df[:, dense_dfidx].to_numpy().astype(dtype), + df[:, dense_dfidx].to_numpy().astype(dtype, copy=False), column_names=np.asarray(df.columns)[dense_dfidx], term_names=np.asarray(df.columns)[dense_dfidx], ) diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py index 907230e6..7daeeff6 100644 --- a/src/tabmat/dense_matrix.py +++ b/src/tabmat/dense_matrix.py @@ -1,4 +1,5 @@ import textwrap +import warnings from typing import Optional, Union import numpy as np @@ -43,6 +44,19 @@ def __init__(self, input_array, column_names=None, term_names=None): elif input_array.ndim > 2: raise ValueError("Input array must be 1- or 2-dimensional") + # Ensure array is contiguous (C or F order) for Cython operations + # Only copy if necessary + if ( + not input_array.flags["C_CONTIGUOUS"] + and not input_array.flags["F_CONTIGUOUS"] + ): + warnings.warn( + "Input array is not contiguous; making a copy.", + UserWarning, + stacklevel=2, + ) + input_array = np.asfortranarray(input_array) + self._array = input_array width = self._array.shape[1] diff --git a/src/tabmat/formula.py b/src/tabmat/formula.py index 0f97af3a..b5b53d40 100644 --- a/src/tabmat/formula.py +++ b/src/tabmat/formula.py @@ -92,7 +92,7 @@ def _encode_numerical(self, values, metadata, encoder_state, spec, drop_rows): if drop_rows: values = values.drop(index=values.index[drop_rows]) if isinstance(values, pd.Series): - values = values.to_numpy().astype(self.dtype) + values = values.to_numpy().astype(self.dtype, copy=False) if (values != 0).mean() <= self.sparse_threshold: return _InteractableSparseVector(sps.csc_matrix(values[:, np.newaxis])) else: diff --git a/tests/test_fast_sandwich.py b/tests/test_fast_sandwich.py index 3b564cd4..bcb2108e 100644 --- a/tests/test_fast_sandwich.py +++ b/tests/test_fast_sandwich.py @@ -64,25 +64,30 @@ def test_fast_sandwich_dense(): def test_dense_sandwich_on_non_contiguous(): - """Non-regression test for #208""" + """Non-regression test for #208 + + DenseMatrix now automatically ensures arrays are contiguous, + so non-contiguous inputs are automatically copied and made contiguous. + """ rng = np.random.default_rng(seed=123) X = rng.standard_normal(size=(100, 20)) - # Xd wraps a not-contiguous array. - Xd = DenseMatrix(X[:, :10]) - Xs = SparseMatrix(csc_matrix(X[:, 10:])) - Xm = SplitMatrix([Xd, Xs]) + # Column slicing creates a non-contiguous array, but DenseMatrix + # automatically makes it contiguous (copying only if necessary). + non_contiguous_array = X[:, :10] + assert not non_contiguous_array.flags["C_CONTIGUOUS"] + assert not non_contiguous_array.flags["F_CONTIGUOUS"] - # Making the sandwich product fail. - with pytest.raises(Exception, match="The matrix X is not contiguous"): - Xm.sandwich(np.ones(X.shape[0])) + Xd = DenseMatrix(non_contiguous_array) + # The internal array should now be contiguous + assert Xd.A.flags["C_CONTIGUOUS"] or Xd.A.flags["F_CONTIGUOUS"] - # Xd wraps a copy, which makes the data contiguous. - Xd = DenseMatrix(X[:, :10].copy()) + Xs = SparseMatrix(csc_matrix(X[:, 10:])) Xm = SplitMatrix([Xd, Xs]) - # The sandwich product works without problem here. - Xm.sandwich(np.ones(X.shape[0])) + # The sandwich product should work without problem + result = Xm.sandwich(np.ones(X.shape[0])) + assert result is not None def check(A, d, cols):