Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,14 @@ Changelog
4.0.1 - 2024-06-25
------------------

**New feature:**

- Added a new function, :func:`tabmat.from_polars`, to convert a :class:`polars.DataFrame` into a :class:`tabmat.SplitMatrix`.

**Other changes:**

- Removed reference to the ``.A`` attribute and replaced it with ``.toarray()``.
- Add support between formulaic and pandas 3.0
- Add support between formulaic and pandas 3.0.
- Support PyPI release for numpy 2.0.

4.0.0 - 2024-04-23
Expand Down
3 changes: 1 addition & 2 deletions conda.recipe/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,8 @@ requirements:
run:
- python
- {{ pin_compatible('numpy') }}
- pandas
- scipy
- formulaic>=0.6
- scipy

test:
requires:
Expand Down
6 changes: 5 additions & 1 deletion environment-win.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,12 @@ channels:
- nodefaults
dependencies:
- libblas>=0=*mkl
- pandas
- formulaic>=0.6
- numpy
- pandas
- polars
- pyarrow # exclusively for polars tests
- scipy

# development tools
- click
Expand Down
6 changes: 5 additions & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,12 @@ channels:
- conda-forge
- nodefaults
dependencies:
- pandas
- formulaic>=0.6
- numpy
- pandas
- polars
- pyarrow # exclusively for polars tests
- scipy

# development tools
- click
Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@
print(f"Debug Build: {debug_build}")

if sys.platform == "win32":
allocator_libs = []
allocator_libs = [] # type: ignore
extra_compile_args = ["/openmp", "/O2"]
extra_link_args = ["/openmp"]
# make sure we can find xsimd headers
Expand Down Expand Up @@ -157,7 +157,7 @@
],
package_dir={"": "src"},
packages=find_packages(where="src"),
install_requires=["numpy", "pandas", "scipy", "formulaic>=0.6"],
install_requires=["formulaic>=0.6", "numpy", "scipy"],
python_requires=">=3.9",
ext_modules=cythonize(
ext_modules,
Expand Down
3 changes: 2 additions & 1 deletion src/tabmat/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import importlib.metadata

from .categorical_matrix import CategoricalMatrix
from .constructor import from_csc, from_formula, from_pandas
from .constructor import from_csc, from_formula, from_pandas, from_polars
from .dense_matrix import DenseMatrix
from .matrix_base import MatrixBase
from .sparse_matrix import SparseMatrix
Expand All @@ -23,6 +23,7 @@
"from_csc",
"from_formula",
"from_pandas",
"from_polars",
"as_tabmat",
"hstack",
]
135 changes: 101 additions & 34 deletions src/tabmat/categorical_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,13 +162,15 @@ def matvec(mat, vec):

"""

import importlib.util
import re
import warnings
from typing import Optional, Union

import numpy as np
import pandas as pd
from scipy import sparse as sps

from .dense_matrix import DenseMatrix
from .ext.categorical import (
matvec_complex,
matvec_fast,
Expand All @@ -191,6 +193,20 @@ def matvec(mat, vec):
setup_restrictions,
)

# pandas and polars are optional dependencies: import each one only when it
# is installed, so the rest of this module can degrade gracefully (helpers
# like _is_pandas/_is_polars fall back to returning False).
if importlib.util.find_spec("pandas"):
    import pandas as pd
if importlib.util.find_spec("polars"):
    import polars as pl


class _Categorical:
"""This class helps us avoid copies while subsetting."""

def __init__(self, indices, categories, dtype):
self.indices = indices
self.categories = categories
self.dtype = dtype


def _is_indexer_full_length(full_length: int, indexer: Union[slice, np.ndarray]):
if isinstance(indexer, np.ndarray):
Expand All @@ -203,6 +219,38 @@ def _is_indexer_full_length(full_length: int, indexer: Union[slice, np.ndarray])
return len(range(*indexer.indices(full_length))) == full_length


def _is_pandas(x) -> bool:
if importlib.util.find_spec("pandas"):
return isinstance(x, (pd.Categorical, pd.CategoricalDtype))
return False


def _is_polars(x) -> bool:
if importlib.util.find_spec("polars"):
return isinstance(x, (pl.Series, pl.Categorical, pl.Enum))
return False


def _extract_codes_and_categories(cat_vec):
    """Return ``(indices, categories)`` for any supported categorical input.

    Parameters
    ----------
    cat_vec:
        One of: an internal ``_Categorical`` holder, a pandas
        ``Categorical``, a pandas Series with categorical dtype, a polars
        ``Series``, or any array-like (with a ``dtype`` attribute) that
        pandas can factorize.

    Returns
    -------
    indices:
        Integer codes into ``categories``; ``-1`` marks missing values.
    categories:
        Array of the distinct category labels.

    Raises
    ------
    ModuleNotFoundError
        If a general array-like is passed but pandas (needed for
        ``factorize``) is not installed.
    """
    if isinstance(cat_vec, _Categorical):
        # Already factorized; reuse without copying.
        categories = cat_vec.categories
        indices = cat_vec.indices
    elif _is_pandas(cat_vec):
        # pandas.Categorical
        categories = cat_vec.categories.to_numpy()
        indices = cat_vec.codes
    elif _is_pandas(cat_vec.dtype):
        # pandas Series of categorical dtype
        categories = cat_vec.cat.categories.to_numpy()
        indices = cat_vec.cat.codes.to_numpy()
    elif _is_polars(cat_vec):
        if not _is_polars(cat_vec.dtype):
            # Plain polars Series: cast so categories/codes are available.
            cat_vec = cat_vec.cast(pl.Categorical)
        categories = cat_vec.cat.get_categories().to_numpy()
        # The physical representation holds the codes; nulls become -1.
        indices = cat_vec.to_physical().fill_null(-1).to_numpy()
    else:
        if importlib.util.find_spec("pandas") is None:
            # pandas is an optional dependency; without this guard the call
            # below would die with an unhelpful NameError on `pd`.
            raise ModuleNotFoundError(
                "Converting a general array-like to categorical requires "
                "pandas, which is not installed. Install pandas or pass an "
                "already-categorical input."
            )
        indices, categories = pd.factorize(cat_vec, sort=True)
    return indices, categories


def _row_col_indexing(
arr: np.ndarray, rows: Optional[np.ndarray], cols: Optional[np.ndarray]
) -> np.ndarray:
Expand Down Expand Up @@ -255,7 +303,7 @@ class CategoricalMatrix(MatrixBase):

def __init__(
self,
cat_vec: Union[list, np.ndarray, pd.Categorical],
cat_vec,
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mypy might actually be useful here. It is quite good with union types and narrowing them in if-statements these days.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mypy doesn't play nice with optional imports. I tried conditioning imports on type hinting to no avail.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using TYPE_CHECKING didn't work? I think it is reasonable to assume both Pandas and Polars are installed in the dev environment, isn't it?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using TYPE_CHECKING didn't work?

No. :\ I may have done it wrong though.

drop_first: bool = False,
dtype: np.dtype = np.float64,
column_name: Optional[str] = None,
Expand All @@ -264,35 +312,40 @@ def __init__(
cat_missing_method: str = "fail",
cat_missing_name: str = "(MISSING)",
):
if cat_missing_method not in ["fail", "zero", "convert"]:
if cat_missing_method not in {"fail", "zero", "convert"}:
raise ValueError(
"cat_missing_method must be one of 'fail' 'zero' or 'convert', "
f" got {cat_missing_method}"
"cat_missing_method must be one of 'fail' 'zero' or 'convert'; "
f" got {cat_missing_method}."
)

if not hasattr(cat_vec, "dtype"):
cat_vec = np.array(cat_vec) # avoid errors in pd.factorize

self._input_dtype = cat_vec.dtype
self._missing_method = cat_missing_method
self._missing_category = cat_missing_name

if isinstance(cat_vec, pd.Categorical):
self.cat = cat_vec
else:
self.cat = pd.Categorical(cat_vec)
indices, self.categories = _extract_codes_and_categories(cat_vec)

if pd.isnull(self.cat).any():
if np.any(indices == -1):
if self._missing_method == "fail":
raise ValueError(
"Categorical data can't have missing values "
"if cat_missing_method='fail'."
)

elif self._missing_method == "convert":
if self._missing_category in self.cat.categories:
if self._missing_category in self.categories:
raise ValueError(
f"Missing category {self._missing_category} already exists."
)

self.cat = self.cat.add_categories([self._missing_category])
self.categories = np.hstack(
[self.categories, self._missing_category], dtype="object"
)

indices = np.where(indices < 0, len(self.categories) - 1, indices)

self.cat[pd.isnull(self.cat)] = self._missing_category
self._has_missings = False

else:
Expand All @@ -302,38 +355,56 @@ def __init__(
self._has_missings = False

self.drop_first = drop_first
self.shape = (len(self.cat), len(self.cat.categories) - int(drop_first))
self.indices = self.cat.codes.astype(np.int32)
self.x_csc: Optional[tuple[Optional[np.ndarray], np.ndarray, np.ndarray]] = None
self.indices = indices.astype(np.int32, copy=False)
self.shape = (len(self.indices), len(self.categories) - int(drop_first))
self.x_csc = None
self.dtype = np.dtype(dtype)

self._colname = column_name
self._colname_format = column_name_format

if term_name is None:
self._term = self._colname
else:
self._term = term_name
self._colname_format = column_name_format

__array_ufunc__ = None

@property
def cat(self):
    """Return a series with same data as what was initially fed to __init__.

    This property is available for backward compatibility.

    Returns a ``polars.Series`` when the original input had a polars dtype,
    otherwise a ``pandas.Categorical``.
    """
    warnings.warn(
        "This property will be removed in the next major release.",
        category=DeprecationWarning,
    )

    if _is_polars(self._input_dtype):
        # Rebuild the original values; negative codes become nulls.
        out = self.categories[self.indices].astype("object", copy=False)
        out = np.where(self.indices < 0, None, out)
        return pl.Series(out, dtype=pl.Enum(self.categories))

    return pd.Categorical.from_codes(self.indices, categories=self.categories)

def recover_orig(self) -> np.ndarray:
    """Return a 1d numpy array with the same data as was fed to __init__.

    Entries that were missing in the input are reported through a masked
    array view.

    Test: matrix/test_categorical_matrix::test_recover_orig
    """
    values = self.categories[self.indices]

    missing_was_converted = (
        self._missing_method == "convert"
        and self._missing_category in self.categories
    )

    if self._has_missings:
        # Codes of -1 mark entries that were missing in the original input.
        masked = values.view(np.ma.MaskedArray)
        masked.mask = self.indices == -1
        return masked
    if missing_was_converted:
        # The synthetic missing category was appended as the last category,
        # so its code is len(categories) - 1.
        masked = values.view(np.ma.MaskedArray)
        masked.mask = self.indices == len(self.categories) - 1
        return masked
    return values

Expand Down Expand Up @@ -529,8 +600,6 @@ def _cross_sandwich(
R_cols: Optional[np.ndarray] = None,
) -> np.ndarray:
"""Perform a sandwich product: X.T @ diag(d) @ Y."""
from .dense_matrix import DenseMatrix

if isinstance(other, DenseMatrix):
return self._cross_dense(other._array, d, rows, L_cols, R_cols)
if isinstance(other, SparseMatrix):
Expand Down Expand Up @@ -576,8 +645,6 @@ def tocsr(self) -> sps.csr_matrix:

def to_sparse_matrix(self):
"""Return a tabmat.SparseMatrix representation."""
from .sparse_matrix import SparseMatrix

return SparseMatrix(
self.tocsr(),
column_names=self.column_names,
Expand All @@ -594,7 +661,7 @@ def unpack(self):

def astype(self, dtype, order="K", casting="unsafe", copy=True):
    """Return CategoricalMatrix cast to new type.

    NOTE(review): unlike ``numpy.ndarray.astype``, this mutates ``self``
    and returns it rather than producing a copy; the ``order``,
    ``casting`` and ``copy`` arguments are accepted for signature
    compatibility but are ignored — confirm no caller relies on
    receiving an independent copy.
    """
    self.dtype = np.dtype(dtype)
    return self

def _get_col_stds(self, weights: np.ndarray, col_means: np.ndarray) -> np.ndarray:
Expand All @@ -613,7 +680,7 @@ def __getitem__(self, item):
if isinstance(row, np.ndarray):
row = row.ravel()
return CategoricalMatrix(
self.cat[row],
_Categorical(self.indices[row], self.categories, self._input_dtype),
drop_first=self.drop_first,
dtype=self.dtype,
column_name=self._colname,
Expand Down Expand Up @@ -745,7 +812,7 @@ def multiply(self, other) -> SparseMatrix:
)

def __repr__(self):
    """Summarize the matrix by its class name and category labels."""
    class_name = type(self).__name__
    return f"{class_name}\nCategories: {self.categories}"

def get_names(
self,
Expand Down Expand Up @@ -786,19 +853,19 @@ def get_names(
raise ValueError(f"Type must be 'column' or 'term', got {type}")

if indices is None:
indices = list(range(len(self.cat.categories) - self.drop_first))
indices = list(range(len(self.categories) - self.drop_first))
if name is None and missing_prefix is None:
return [None] * (len(self.cat.categories) - self.drop_first)
return [None] * (len(self.categories) - self.drop_first)
elif name is None:
name = f"{missing_prefix}{indices[0]}-{indices[-1]}"

if type == "column":
return [
self._colname_format.format(name=name, category=cat)
for cat in self.cat.categories[self.drop_first :]
for cat in self.categories[self.drop_first :]
]
else:
return [name] * (len(self.cat.categories) - self.drop_first)
return [name] * (len(self.categories) - self.drop_first)

def set_names(self, names: Union[str, list[Optional[str]]], type: str = "column"):
"""Set column names.
Expand All @@ -820,7 +887,7 @@ def set_names(self, names: Union[str, list[Optional[str]]], type: str = "column"
if type == "column":
# Try finding the column name
base_names = []
for name, cat in zip(names, self.cat.categories[self.drop_first :]):
for name, cat in zip(names, self.categories[self.drop_first :]):
partial_name = self._colname_format.format(
name="__CAPTURE__", category=cat
)
Expand Down
Loading