From 8c44035b5cec98b0bad825d4a79718a3db8a44f5 Mon Sep 17 00:00:00 2001 From: lbittarello Date: Fri, 14 Jun 2024 14:59:36 +0100 Subject: [PATCH 01/13] Environment --- conda.recipe/meta.yaml | 3 ++- environment-win.yml | 5 ++++- environment.yml | 5 ++++- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml index 7f5d81c9..3653bfd0 100644 --- a/conda.recipe/meta.yaml +++ b/conda.recipe/meta.yaml @@ -36,9 +36,10 @@ requirements: run: - python - {{ pin_compatible('numpy') }} + - formulaic>=0.6 - pandas + - polars - scipy - - formulaic>=0.6 test: requires: diff --git a/environment-win.yml b/environment-win.yml index dd88ce10..712f0b46 100644 --- a/environment-win.yml +++ b/environment-win.yml @@ -4,8 +4,11 @@ channels: - nodefaults dependencies: - libblas>=0=*mkl - - pandas - formulaic>=0.6 + - numpy + - pandas + - polars + - scipy # development tools - click diff --git a/environment.yml b/environment.yml index 10d7d402..c8184764 100644 --- a/environment.yml +++ b/environment.yml @@ -3,8 +3,11 @@ channels: - conda-forge - nodefaults dependencies: - - pandas - formulaic>=0.6 + - numpy + - polars + - pandas + - scipy # development tools - click From 2532dc93ce140575fb8f5ed3ccb88bc62aeef3a7 Mon Sep 17 00:00:00 2001 From: lbittarello Date: Mon, 17 Jun 2024 09:54:00 +0100 Subject: [PATCH 02/13] Categorical matrix --- src/tabmat/categorical_matrix.py | 89 +++++++++++++++++++++----------- 1 file changed, 59 insertions(+), 30 deletions(-) diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py index 955fdf30..83e62b06 100644 --- a/src/tabmat/categorical_matrix.py +++ b/src/tabmat/categorical_matrix.py @@ -163,12 +163,15 @@ def matvec(mat, vec): """ import re +import warnings from typing import Optional, Union import numpy as np import pandas as pd +import polars as pl from scipy import sparse as sps +from .dense_matrix import DenseMatrix from .ext.categorical import ( matvec_complex, matvec_fast, 
@@ -255,7 +258,7 @@ class CategoricalMatrix(MatrixBase): def __init__( self, - cat_vec: Union[list, np.ndarray, pd.Categorical], + cat_vec: Union[list, np.ndarray, pd.Categorical, pd.Series, pl.Series], drop_first: bool = False, dtype: np.dtype = np.float64, column_name: Optional[str] = None, @@ -266,18 +269,31 @@ def __init__( ): if cat_missing_method not in ["fail", "zero", "convert"]: raise ValueError( - "cat_missing_method must be one of 'fail' 'zero' or 'convert', " - f" got {cat_missing_method}" + "cat_missing_method must be one of 'fail' 'zero' or 'convert'; " + f" got {cat_missing_method}." ) + self._missing_method = cat_missing_method self._missing_category = cat_missing_name + if not isinstance(cat_vec, (pd.Categorical, pl.Series, pd.Series)): + cat_vec = np.asanyarray(cat_vec) + if isinstance(cat_vec, pd.Categorical): - self.cat = cat_vec + self.categories = cat_vec.categories.to_numpy() + indices = cat_vec.codes + elif isinstance(cat_vec.dtype, pd.CategoricalDtype): + self.categories = cat_vec.cat.categories.to_numpy() + indices = cat_vec.cat.codes.to_numpy() + elif isinstance(cat_vec, pl.Series): + if not isinstance(cat_vec.dtype, (pl.Categorical, pl.Enum)): + cat_vec = cat_vec.cast(pl.Categorical) + self.categories = cat_vec.cat.get_categories().to_numpy() + indices = cat_vec.to_physical().fill_null(-1).to_numpy() else: - self.cat = pd.Categorical(cat_vec) + indices, self.categories = pd.factorize(cat_vec, sort=True) - if pd.isnull(self.cat).any(): + if np.any(indices == -1): if self._missing_method == "fail": raise ValueError( "Categorical data can't have missing values " @@ -285,14 +301,17 @@ def __init__( ) elif self._missing_method == "convert": - if self._missing_category in self.cat.categories: + if self._missing_category in self.categories: raise ValueError( f"Missing category {self._missing_category} already exists." 
) - self.cat = self.cat.add_categories([self._missing_category]) + self.categories = np.hstack( + [self.categories, self._missing_category], dtype="object" + ) + + indices = np.where(indices < 0, len(self.categories) - 1, indices) - self.cat[pd.isnull(self.cat)] = self._missing_category self._has_missings = False else: @@ -302,38 +321,50 @@ def __init__( self._has_missings = False self.drop_first = drop_first - self.shape = (len(self.cat), len(self.cat.categories) - int(drop_first)) - self.indices = self.cat.codes.astype(np.int32) - self.x_csc: Optional[tuple[Optional[np.ndarray], np.ndarray, np.ndarray]] = None + self.indices = indices.astype(np.int32) + self.shape = (len(self.indices), len(self.categories) - int(drop_first)) + self.x_csc = None self.dtype = np.dtype(dtype) self._colname = column_name + self._colname_format = column_name_format + if term_name is None: self._term = self._colname else: self._term = term_name - self._colname_format = column_name_format __array_ufunc__ = None + @property + def cat(self): + """Return a pandas array with same data as what was initially fed to __init__. + + This property is available for backward compatibility. + """ + warnings.warn( + "This property will be removed in the next major release.", + category=DeprecationWarning, + ) + return pd.Categorical.from_codes(self.indices, categories=self.categories) + def recover_orig(self) -> np.ndarray: """ Return 1d numpy array with same data as what was initially fed to __init__. 
Test: matrix/test_categorical_matrix::test_recover_orig """ - orig = self.cat.categories[self.cat.codes].to_numpy() + orig = self.categories[self.indices] if self._has_missings: orig = orig.view(np.ma.MaskedArray) - orig.mask = self.cat.codes == -1 + orig.mask = self.indices == -1 elif ( self._missing_method == "convert" - and self._missing_category in self.cat.categories + and self._missing_category in self.categories ): orig = orig.view(np.ma.MaskedArray) - missing_code = self.cat.categories.get_loc(self._missing_category) - orig.mask = self.cat.codes == missing_code + orig.mask = self.indices == len(self.categories) - 1 return orig @@ -529,8 +560,6 @@ def _cross_sandwich( R_cols: Optional[np.ndarray] = None, ) -> np.ndarray: """Perform a sandwich product: X.T @ diag(d) @ Y.""" - from .dense_matrix import DenseMatrix - if isinstance(other, DenseMatrix): return self._cross_dense(other._array, d, rows, L_cols, R_cols) if isinstance(other, SparseMatrix): @@ -576,8 +605,6 @@ def tocsr(self) -> sps.csr_matrix: def to_sparse_matrix(self): """Return a tabmat.SparseMatrix representation.""" - from .sparse_matrix import SparseMatrix - return SparseMatrix( self.tocsr(), column_names=self.column_names, @@ -594,7 +621,7 @@ def unpack(self): def astype(self, dtype, order="K", casting="unsafe", copy=True): """Return CategoricalMatrix cast to new type.""" - self.dtype = dtype + self.dtype = np.dtype(dtype) return self def _get_col_stds(self, weights: np.ndarray, col_means: np.ndarray) -> np.ndarray: @@ -613,7 +640,9 @@ def __getitem__(self, item): if isinstance(row, np.ndarray): row = row.ravel() return CategoricalMatrix( - self.cat[row], + pd.Categorical.from_codes( + self.indices[row], categories=self.categories + ), drop_first=self.drop_first, dtype=self.dtype, column_name=self._colname, @@ -745,7 +774,7 @@ def multiply(self, other) -> SparseMatrix: ) def __repr__(self): - return str(self.cat) + return f"{self.__class__.__name__}\nCategories: {self.categories}" def 
get_names( self, @@ -786,19 +815,19 @@ def get_names( raise ValueError(f"Type must be 'column' or 'term', got {type}") if indices is None: - indices = list(range(len(self.cat.categories) - self.drop_first)) + indices = list(range(len(self.categories) - self.drop_first)) if name is None and missing_prefix is None: - return [None] * (len(self.cat.categories) - self.drop_first) + return [None] * (len(self.categories) - self.drop_first) elif name is None: name = f"{missing_prefix}{indices[0]}-{indices[-1]}" if type == "column": return [ self._colname_format.format(name=name, category=cat) - for cat in self.cat.categories[self.drop_first :] + for cat in self.categories[self.drop_first :] ] else: - return [name] * (len(self.cat.categories) - self.drop_first) + return [name] * (len(self.categories) - self.drop_first) def set_names(self, names: Union[str, list[Optional[str]]], type: str = "column"): """Set column names. @@ -820,7 +849,7 @@ def set_names(self, names: Union[str, list[Optional[str]]], type: str = "column" if type == "column": # Try finding the column name base_names = [] - for name, cat in zip(names, self.cat.categories[self.drop_first :]): + for name, cat in zip(names, self.categories[self.drop_first :]): partial_name = self._colname_format.format( name="__CAPTURE__", category=cat ) From 207c548d8fdc44128777eabfcf3ae8192d166841 Mon Sep 17 00:00:00 2001 From: lbittarello Date: Mon, 17 Jun 2024 09:58:40 +0100 Subject: [PATCH 03/13] Constructor --- setup.py | 2 +- src/tabmat/__init__.py | 3 +- src/tabmat/constructor.py | 232 ++++++++++++++++++++++++++++++------- src/tabmat/dense_matrix.py | 4 +- 4 files changed, 197 insertions(+), 44 deletions(-) diff --git a/setup.py b/setup.py index 6baffa5b..3792c27a 100644 --- a/setup.py +++ b/setup.py @@ -54,7 +54,7 @@ print(f"Debug Build: {debug_build}") if sys.platform == "win32": - allocator_libs = [] + allocator_libs = [] # type: ignore extra_compile_args = ["/openmp", "/O2"] extra_link_args = ["/openmp"] # make sure 
we can find xsimd headers diff --git a/src/tabmat/__init__.py b/src/tabmat/__init__.py index bd7ce292..b88fa82f 100644 --- a/src/tabmat/__init__.py +++ b/src/tabmat/__init__.py @@ -1,7 +1,7 @@ import importlib.metadata from .categorical_matrix import CategoricalMatrix -from .constructor import from_csc, from_formula, from_pandas +from .constructor import from_csc, from_formula, from_pandas, from_polars from .dense_matrix import DenseMatrix from .matrix_base import MatrixBase from .sparse_matrix import SparseMatrix @@ -23,6 +23,7 @@ "from_csc", "from_formula", "from_pandas", + "from_polars", "as_tabmat", "hstack", ] diff --git a/src/tabmat/constructor.py b/src/tabmat/constructor.py index ae7fff5e..a337a67e 100644 --- a/src/tabmat/constructor.py +++ b/src/tabmat/constructor.py @@ -5,11 +5,12 @@ import numpy as np import pandas as pd +import polars as pl from formulaic import Formula, ModelSpec from formulaic.materializers.types import NAAction from formulaic.parser import DefaultFormulaParser from formulaic.utils.layered_mapping import LayeredMapping -from pandas.api.types import is_bool_dtype, is_numeric_dtype +from pandas.api.types import is_numeric_dtype from scipy import sparse as sps from .categorical_matrix import CategoricalMatrix @@ -78,16 +79,15 @@ def from_pandas( indices: list[list[int]] = [] is_cat: list[bool] = [] - dense_dfidx = [] # column index in original DataFrame - dense_mxidx = [] # index in the new SplitMatrix - sparse_dfcols = [] # sparse columns to join together - sparse_mxidx = [] # index in the new SplitMatrix + dense_columns = [] # column index in original DataFrame + dense_indices = [] # index in the new SplitMatrix + sparse_columns = [] # sparse columns to join together + sparse_indices = [] # index in the new SplitMatrix ignored_cols = [] mxcolidx = 0 - for dfcolidx, (colname, coldata) in enumerate(df.items()): - # categorical + for colname, coldata in df.items(): if object_as_cat and coldata.dtype == object: coldata = 
coldata.astype("category") if isinstance(coldata.dtype, pd.CategoricalDtype): @@ -101,12 +101,12 @@ def from_pandas( cat_missing_method=cat_missing_method, cat_missing_name=cat_missing_name, ) - if len(coldata.cat.categories) < cat_threshold: + if len(cat.categories) < cat_threshold: ( X_dense_F, X_sparse, - dense_indices, - sparse_indices, + dense_idx, + sparse_idx, ) = _split_sparse_and_dense_parts( sps.csc_matrix(cat.tocsr(), dtype=dtype), threshold=sparse_threshold, @@ -118,12 +118,12 @@ def from_pandas( matrices.append(X_sparse) is_cat.append(True) if cat_position == "expand": - indices.append(mxcolidx + dense_indices) - indices.append(mxcolidx + sparse_indices) - mxcolidx += len(dense_indices) + len(sparse_indices) + indices.append(mxcolidx + dense_idx) + indices.append(mxcolidx + sparse_idx) + mxcolidx += len(dense_idx) + len(sparse_idx) elif cat_position == "end": - indices.append(dense_indices) - indices.append(sparse_indices) + indices.append(dense_idx) + indices.append(sparse_idx) else: matrices.append(cat) @@ -133,53 +133,205 @@ def from_pandas( mxcolidx += cat.shape[1] elif cat_position == "end": indices.append(np.arange(cat.shape[1])) - # All other numerical dtypes (needs to be after pd.SparseDtype) elif is_numeric_dtype(coldata): - # check if we want to store as sparse if (coldata != 0).mean() <= sparse_threshold: - if not isinstance(coldata.dtype, pd.SparseDtype): - fill_value = False if is_bool_dtype(coldata) else 0 # type: ignore - sparse_dtype = pd.SparseDtype(coldata.dtype, fill_value=fill_value) - sparse_dfcols.append(coldata.astype(sparse_dtype)) - else: - sparse_dfcols.append(coldata) - sparse_mxidx.append(mxcolidx) + sparse_columns.append(colname) + sparse_indices.append(mxcolidx) mxcolidx += 1 else: - dense_dfidx.append(dfcolidx) - dense_mxidx.append(mxcolidx) + dense_columns.append(colname) + dense_indices.append(mxcolidx) mxcolidx += 1 - # dtype not handled yet else: - ignored_cols.append((dfcolidx, colname)) + 
ignored_cols.append(colname) if len(ignored_cols) > 0: warnings.warn( f"Columns {ignored_cols} were ignored. Make sure they have a valid dtype." ) - if len(dense_dfidx) > 0: + if dense_columns: matrices.append( DenseMatrix( - df.iloc[:, dense_dfidx].astype(dtype), - column_names=df.columns[dense_dfidx], - term_names=df.columns[dense_dfidx], + df[dense_columns].astype(dtype), + column_names=dense_columns, + term_names=dense_columns, ) ) - indices.append(dense_mxidx) + indices.append(dense_indices) is_cat.append(False) - if len(sparse_dfcols) > 0: - sparse_dict = {i: v for i, v in enumerate(sparse_dfcols)} - full_sparse = pd.DataFrame(sparse_dict).sparse.to_coo() + if sparse_columns: matrices.append( SparseMatrix( - full_sparse, + sps.coo_matrix(df[sparse_columns], dtype=dtype), dtype=dtype, - column_names=[col.name for col in sparse_dfcols], - term_names=[col.name for col in sparse_dfcols], + column_names=sparse_columns, + term_names=sparse_columns, ) ) - indices.append(sparse_mxidx) + indices.append(sparse_indices) + is_cat.append(False) + + if cat_position == "end": + new_indices = [] + for mat_indices, is_cat_ in zip(indices, is_cat): + if is_cat_: + new_indices.append(np.asarray(mat_indices) + mxcolidx) + mxcolidx += len(mat_indices) + else: + new_indices.append(mat_indices) + indices = new_indices + + if len(matrices) > 1: + return SplitMatrix(matrices, indices) + elif len(matrices) == 0: + raise ValueError("DataFrame contained no valid column") + else: + return matrices[0] + + +def from_polars( + df: pl.DataFrame, + dtype: np.dtype = np.float64, + sparse_threshold: float = 0.1, + cat_threshold: int = 4, + cat_position: str = "expand", + drop_first: bool = False, + categorical_format: str = "{name}[{category}]", + cat_missing_method: str = "fail", + cat_missing_name: str = "(MISSING)", +) -> MatrixBase: + """ + Transform a polars.DataFrame into an efficient SplitMatrix. For most users, this + will be the primary way to construct tabmat objects from their data. 
+ + Parameters + ---------- + df : pl.DataFrame + Polars DataFrame to be converted. + dtype : np.dtype, default np.float64 + dtype of all sub-matrices of the resulting SplitMatrix. + sparse_threshold : float, default 0.1 + Density threshold below which numerical columns will be stored in a sparse + format. + cat_threshold : int, default 4 + Number of levels of a categorical column under which the column will be stored + as sparse one-hot-encoded columns instead of a CategoricalMatrix. + cat_position : str {'end'|'expand'}, default 'expand' + Position of the categorical variable in the index. If "end", all the + categoricals (including the ones that did not satisfy cat_threshold) + will be placed at the end of the index list. If "expand", all the variables + will remain in the same order. + drop_first : bool, default False + If True, categorical variables will have their first category dropped. + This allows multiple categorical variables to be included in an + unregularized model. If False, all categories are included. + cat_missing_method: str {'fail'|'zero'|'convert'}, default 'fail' + How to handle missing values in categorical columns: + - if 'fail', raise an error if there are missing values. + - if 'zero', missing values will represent all-zero indicator columns. + - if 'convert', missing values will be converted to the '(MISSING)' category. + cat_missing_name: str, default '(MISSING)' + Name of the category to which missing values will be converted if + ``cat_missing_method='convert'``. 
+ + Returns + ------- + SplitMatrix + """ + matrices: list[Union[DenseMatrix, SparseMatrix, CategoricalMatrix]] = [] + indices: list[list[int]] = [] + is_cat: list[bool] = [] + + dense_columns = [] # column index in original DataFrame + dense_indices = [] # index in the new SplitMatrix + sparse_columns = [] # sparse columns to join together + sparse_indices = [] # index in the new SplitMatrix + ignored_cols = [] + + mxcolidx = 0 + + for coldata in df.iter_columns(): + if isinstance(coldata.dtype, (pl.Categorical, pl.Enum)): + cat = CategoricalMatrix( + coldata, + drop_first=drop_first, + dtype=dtype, + column_name=coldata.name, + term_name=coldata.name, + column_name_format=categorical_format, + cat_missing_method=cat_missing_method, + cat_missing_name=cat_missing_name, + ) + if len(cat.categories) < cat_threshold: + ( + X_dense_F, + X_sparse, + dense_idx, + sparse_idx, + ) = _split_sparse_and_dense_parts( + sps.csc_matrix(cat.tocsr(), dtype=dtype), + threshold=sparse_threshold, + column_names=cat.get_names("column"), + term_names=cat.get_names("term"), + ) + matrices.append(X_dense_F) + is_cat.append(True) + matrices.append(X_sparse) + is_cat.append(True) + if cat_position == "expand": + indices.append(mxcolidx + dense_idx) + indices.append(mxcolidx + sparse_idx) + mxcolidx += len(dense_idx) + len(sparse_idx) + elif cat_position == "end": + indices.append(dense_idx) + indices.append(sparse_idx) + + else: + matrices.append(cat) + is_cat.append(True) + if cat_position == "expand": + indices.append(mxcolidx + np.arange(cat.shape[1])) + mxcolidx += cat.shape[1] + elif cat_position == "end": + indices.append(np.arange(cat.shape[1])) + elif coldata.dtype.is_numeric(): + if (coldata != 0).mean() <= sparse_threshold: + sparse_columns.append(coldata.name) + sparse_indices.append(mxcolidx) + mxcolidx += 1 + else: + dense_columns.append(coldata.name) + dense_indices.append(mxcolidx) + mxcolidx += 1 + + else: + ignored_cols.append(coldata.name) + + if len(ignored_cols) > 0: + 
warnings.warn( + f"Columns {ignored_cols} were ignored. Make sure they have a valid dtype." + ) + if dense_columns: + matrices.append( + DenseMatrix( + df[dense_columns].to_numpy().astype(dtype), + column_names=dense_columns, + term_names=dense_columns, + ) + ) + indices.append(dense_indices) + is_cat.append(False) + if sparse_columns: + matrices.append( + SparseMatrix( + sps.coo_matrix(df[sparse_columns], dtype=dtype), + dtype=dtype, + column_names=sparse_columns, + term_names=sparse_columns, + ) + ) + indices.append(sparse_indices) is_cat.append(False) if cat_position == "end": diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py index 5dc6a156..0dd27845 100644 --- a/src/tabmat/dense_matrix.py +++ b/src/tabmat/dense_matrix.py @@ -42,7 +42,7 @@ def __init__(self, input_array, column_names=None, term_names=None): elif input_array.ndim > 2: raise ValueError("Input array must be 1- or 2-dimensional") - self._array = np.asarray(input_array) + self._array = input_array width = self._array.shape[1] if column_names is not None: @@ -102,7 +102,7 @@ def ndim(self): @property def dtype(self): - """Data-type of the array’s elements.""" # noqa: D401 + """Data type of the array's elements.""" # noqa: D401 return self._array.dtype def transpose(self): From 7be43d02dbe6c890e7d5b007fa1947c0c927d1fd Mon Sep 17 00:00:00 2001 From: lbittarello Date: Mon, 17 Jun 2024 09:59:36 +0100 Subject: [PATCH 04/13] Tests --- tests/test_categorical_matrix.py | 5 +- tests/test_constructor.py | 249 +++++++++++++++++++++++++++++++ tests/test_fast_sandwich.py | 4 +- tests/test_formula.py | 10 +- tests/test_matrices.py | 106 ------------- tests/test_split_matrix.py | 14 -- 6 files changed, 259 insertions(+), 129 deletions(-) create mode 100644 tests/test_constructor.py diff --git a/tests/test_categorical_matrix.py b/tests/test_categorical_matrix.py index 766576e1..5d74d676 100644 --- a/tests/test_categorical_matrix.py +++ b/tests/test_categorical_matrix.py @@ -18,11 +18,10 @@ def 
cat_vec(missing): return vec -@pytest.mark.parametrize("vec_dtype", [np.float64, np.float32, np.int64, np.int32]) @pytest.mark.parametrize("drop_first", [True, False], ids=["drop_first", "no_drop"]) @pytest.mark.parametrize("missing", [True, False], ids=["missing", "no_missing"]) @pytest.mark.parametrize("cat_missing_method", ["fail", "zero", "convert"]) -def test_recover_orig(cat_vec, vec_dtype, drop_first, missing, cat_missing_method): +def test_recover_orig(cat_vec, drop_first, missing, cat_missing_method): if missing and cat_missing_method == "fail": with pytest.raises( ValueError, match="Categorical data can't have missing values" @@ -173,7 +172,7 @@ def test_cat_missing_name(cat_missing_name): cat = CategoricalMatrix( vec, cat_missing_method="convert", cat_missing_name=cat_missing_name ) - assert set(cat.cat.categories) == set(vec) - {None} | {cat_missing_name} + assert set(cat.categories) == set(vec) - {None} | {cat_missing_name} @pytest.mark.parametrize("drop_first", [True, False], ids=["drop_first", "no_drop"]) diff --git a/tests/test_constructor.py b/tests/test_constructor.py new file mode 100644 index 00000000..0fc4b9fa --- /dev/null +++ b/tests/test_constructor.py @@ -0,0 +1,249 @@ +import numpy as np +import pandas as pd +import polars as pl +import pytest + +import tabmat as tm + + +def test_pandas_to_matrix(): + n_rows = 50 + dense_column = np.linspace(-10, 10, num=n_rows, dtype=np.float64) + dense_column_with_lots_of_zeros = dense_column.copy() + dense_column_with_lots_of_zeros[:44] = 0.0 + sparse_column = np.zeros(n_rows, dtype=np.float64) + sparse_column[0] = 1.0 + cat_column_lowdim = np.tile(["a", "b"], n_rows // 2) + cat_column_highdim = np.arange(n_rows) + + dense_ser = pd.Series(dense_column) + lowdense_ser = pd.Series(dense_column_with_lots_of_zeros) + sparse_ser = pd.Series(sparse_column, dtype=pd.SparseDtype("float", 0.0)) + cat_ser_lowdim = pd.Categorical(cat_column_lowdim) + cat_ser_highdim = pd.Categorical(cat_column_highdim) + + df = 
pd.DataFrame( + data={ + "d": dense_ser, + "ds": lowdense_ser, + "s": sparse_ser, + "cl_obj": cat_ser_lowdim.astype(object), + "ch": cat_ser_highdim, + } + ) + + mat = tm.from_pandas( + df, dtype=np.float64, sparse_threshold=0.3, cat_threshold=4, object_as_cat=True + ) + + assert mat.shape == (n_rows, n_rows + 5) + assert len(mat.matrices) == 3 + assert isinstance(mat, tm.SplitMatrix) + + nb_col_by_type = { + tm.DenseMatrix: 3, # includes low-dimension categorical + tm.SparseMatrix: 2, # sparse column + tm.CategoricalMatrix: n_rows, + } + for submat in mat.matrices: + assert submat.shape[1] == nb_col_by_type[type(submat)] + + # Prevent a regression where the column type of sparsified dense columns + # was being changed in place. + assert df["cl_obj"].dtype == object + assert df["ds"].dtype == np.float64 + + +@pytest.mark.parametrize("categorical_dtype", [pl.Categorical, pl.Enum(["a", "b"])]) +def test_polars_to_matrix(categorical_dtype): + n_rows = 50 + dense_column = np.linspace(-10, 10, num=n_rows, dtype=np.float64) + dense_column_with_lots_of_zeros = dense_column.copy() + dense_column_with_lots_of_zeros[:44] = 0.0 + sparse_column = np.zeros(n_rows, dtype=np.float64) + sparse_column[0] = 1.0 + cat_column_lowdim = np.tile(["a", "b"], n_rows // 2) + cat_column_highdim = np.arange(n_rows).astype("str") + + dense_ser = pl.Series(dense_column) + lowdense_ser = pl.Series(dense_column_with_lots_of_zeros) + sparse_ser = pl.Series(sparse_column) + cat_ser_lowdim = pl.Series(cat_column_lowdim, dtype=categorical_dtype) + cat_ser_highdim = pl.Series(cat_column_highdim, dtype=pl.Categorical) + + df = pl.DataFrame( + data={ + "d": dense_ser, + "ds": lowdense_ser, + "s": sparse_ser, + "cl": cat_ser_lowdim, + "ch": cat_ser_highdim, + } + ) + + mat = tm.from_polars(df, dtype=np.float64, sparse_threshold=0.3, cat_threshold=4) + + assert mat.shape == (n_rows, n_rows + 5) + assert len(mat.matrices) == 3 + assert isinstance(mat, tm.SplitMatrix) + + nb_col_by_type = { + 
tm.DenseMatrix: 3, # includes low-dimension categorical + tm.SparseMatrix: 2, # sparse column + tm.CategoricalMatrix: n_rows, + } + for submat in mat.matrices: + assert submat.shape[1] == nb_col_by_type[type(submat)] + + # Prevent a regression where the column type of sparsified dense columns + # was being changed in place. + assert df["cl"].dtype == categorical_dtype + assert df["ds"].dtype == pl.Float64 + + +@pytest.mark.parametrize("cat_missing_method", ["fail", "zero", "convert"]) +def test_from_pandas_missing(cat_missing_method): + df = pd.DataFrame({"cat": pd.Categorical([1, 2, pd.NA, 1, 2, pd.NA])}) + + if cat_missing_method == "fail": + with pytest.raises( + ValueError, match="Categorical data can't have missing values" + ): + tm.from_pandas(df, cat_missing_method=cat_missing_method) + elif cat_missing_method == "zero": + assert tm.from_pandas(df, cat_missing_method=cat_missing_method).shape == (6, 2) + elif cat_missing_method == "convert": + assert tm.from_pandas(df, cat_missing_method=cat_missing_method).shape == (6, 3) + + +@pytest.mark.parametrize("cat_missing_method", ["fail", "zero", "convert"]) +def test_from_polars_missing(cat_missing_method): + df = pl.DataFrame( + {"cat": pl.Series(["1", "2", None, "1", "2", None], dtype=pl.Categorical)} + ) + + if cat_missing_method == "fail": + with pytest.raises( + ValueError, match="Categorical data can't have missing values" + ): + tm.from_polars(df, cat_missing_method=cat_missing_method) + elif cat_missing_method == "zero": + assert tm.from_polars(df, cat_missing_method=cat_missing_method).shape == (6, 2) + elif cat_missing_method == "convert": + assert tm.from_polars(df, cat_missing_method=cat_missing_method).shape == (6, 3) + + +@pytest.mark.parametrize("prefix_sep", ["_", ": "]) +@pytest.mark.parametrize("drop_first", [True, False]) +def test_names_pandas(prefix_sep, drop_first): + n_rows = 50 + dense_column = np.linspace(-10, 10, num=n_rows, dtype=np.float64) + dense_column_with_lots_of_zeros = 
dense_column.copy() + dense_column_with_lots_of_zeros[:44] = 0.0 + sparse_column = np.zeros(n_rows, dtype=np.float64) + sparse_column[0] = 1.0 + cat_column_lowdim = np.tile(["a", "b"], n_rows // 2) + cat_column_highdim = np.arange(n_rows) + + dense_ser = pd.Series(dense_column) + lowdense_ser = pd.Series(dense_column_with_lots_of_zeros) + sparse_ser = pd.Series(sparse_column, dtype=pd.SparseDtype("float", 0.0)) + cat_ser_lowdim = pd.Categorical(cat_column_lowdim) + cat_ser_highdim = pd.Categorical(cat_column_highdim) + + df = pd.DataFrame( + data={ + "d": dense_ser, + "cl_obj": cat_ser_lowdim.astype(object), + "ch": cat_ser_highdim, + "ds": lowdense_ser, + "s": sparse_ser, + } + ) + + categorical_format = "{name}" + prefix_sep + "{category}" + mat_end = tm.from_pandas( + df, + dtype=np.float64, + sparse_threshold=0.3, + cat_threshold=4, + object_as_cat=True, + cat_position="end", + categorical_format=categorical_format, + drop_first=drop_first, + ) + + expanded_df = pd.get_dummies(df, prefix_sep=prefix_sep, drop_first=drop_first) + assert mat_end.column_names == expanded_df.columns.tolist() + + mat_expand = tm.from_pandas( + df, + dtype=np.float64, + sparse_threshold=0.3, + cat_threshold=4, + object_as_cat=True, + cat_position="expand", + categorical_format=categorical_format, + drop_first=drop_first, + ) + + unique_terms = list(dict.fromkeys(mat_expand.term_names)) + assert unique_terms == df.columns.tolist() + + +@pytest.mark.parametrize("prefix_sep", ["_", ": "]) +@pytest.mark.parametrize("drop_first", [True, False]) +def test_names_polars(prefix_sep, drop_first): + n_rows = 50 + dense_column = np.linspace(-10, 10, num=n_rows, dtype=np.float64) + dense_column_with_lots_of_zeros = dense_column.copy() + dense_column_with_lots_of_zeros[:44] = 0.0 + sparse_column = np.zeros(n_rows, dtype=np.float64) + sparse_column[0] = 1.0 + cat_column_lowdim = np.tile(["a", "b"], n_rows // 2) + cat_column_highdim = np.arange(n_rows).astype("str") + + dense_ser = 
pl.Series(dense_column) + lowdense_ser = pl.Series(dense_column_with_lots_of_zeros) + sparse_ser = pl.Series(sparse_column) + cat_ser_lowdim = pl.Series(cat_column_lowdim, dtype=pl.Categorical) + cat_ser_highdim = pl.Series(cat_column_highdim, dtype=pl.Categorical) + + df = pl.DataFrame( + data={ + "d": dense_ser, + "ds": lowdense_ser, + "s": sparse_ser, + "cl": cat_ser_lowdim, + "ch": cat_ser_highdim, + } + ) + + categorical_format = "{name}" + prefix_sep + "{category}" + mat_end = tm.from_polars( + df, + dtype=np.float64, + sparse_threshold=0.3, + cat_threshold=4, + cat_position="end", + categorical_format=categorical_format, + drop_first=drop_first, + ) + + expanded_df = pd.get_dummies( + df.to_pandas(), prefix_sep=prefix_sep, drop_first=drop_first + ) + assert mat_end.column_names == list(expanded_df.columns) + + mat_expand = tm.from_polars( + df, + dtype=np.float64, + sparse_threshold=0.3, + cat_threshold=4, + cat_position="expand", + categorical_format=categorical_format, + drop_first=drop_first, + ) + + unique_terms = list(dict.fromkeys(mat_expand.term_names)) + assert unique_terms == list(df.columns) diff --git a/tests/test_fast_sandwich.py b/tests/test_fast_sandwich.py index c22047ef..3b564cd4 100644 --- a/tests/test_fast_sandwich.py +++ b/tests/test_fast_sandwich.py @@ -30,7 +30,7 @@ def test_fast_sandwich_sparse(dtype): np.testing.assert_allclose(true, out, atol=np.sqrt(np.finfo(dtype).eps)) -@pytest.mark.high_memory +@pytest.mark.skip(reason="too heavy") def test_fast_sandwich_sparse_large(): # note that 50000 * 50000 > 2^31 - 1, so this will segfault when we index # with 32 bit integers (see GH #160) @@ -105,7 +105,7 @@ def simulate_matrix(nonzero_frac=0.05, shape=(100, 50), seed=0, dtype=np.float64 return A -@pytest.mark.high_memory +@pytest.mark.skip(reason="too heavy") @pytest.mark.parametrize("order", ["C", "F"]) def test_fast_sandwich_dense_large(order): # this will segfault when we index with 32 bit integers (see GH #270) diff --git 
a/tests/test_formula.py b/tests/test_formula.py index 6f558bec..5c5c5989 100644 --- a/tests/test_formula.py +++ b/tests/test_formula.py @@ -139,8 +139,8 @@ def test_retrieval(): "cat_1[b]:cat_3[1]", "cat_1[c]:cat_3[1]", "cat_1[a]:cat_3[2]", - "cat_1[c]:cat_3[2]", "cat_1[b]:cat_3[2]", + "cat_1[c]:cat_3[2]", ], ), drop_first=False, @@ -161,7 +161,8 @@ def test_matrix_against_expectation(df, formula, expected): if isinstance(res, (tm.DenseMatrix, tm.SparseMatrix)): np.testing.assert_array_equal(res.toarray(), res.toarray()) elif isinstance(res, tm.CategoricalMatrix): - assert (exp.cat == res.cat).all() + np.testing.assert_array_equal(exp.categories, res.categories) + np.testing.assert_array_equal(exp.indices, res.indices) assert exp.drop_first == res.drop_first @@ -241,8 +242,8 @@ def test_matrix_against_expectation(df, formula, expected): "cat_1__b__x__cat_3__1", "cat_1__c__x__cat_3__1", "cat_1__a__x__cat_3__2", - "cat_1__c__x__cat_3__2", "cat_1__b__x__cat_3__2", + "cat_1__c__x__cat_3__2", ], ), drop_first=False, @@ -270,7 +271,8 @@ def test_matrix_against_expectation_qcl(df, formula, expected): if isinstance(res, (tm.DenseMatrix, tm.SparseMatrix)): np.testing.assert_array_equal(res.toarray(), res.toarray()) elif isinstance(res, tm.CategoricalMatrix): - assert (exp.cat == res.cat).all() + np.testing.assert_array_equal(exp.categories, res.categories) + np.testing.assert_array_equal(exp.indices, res.indices) assert exp.drop_first == res.drop_first diff --git a/tests/test_matrices.py b/tests/test_matrices.py index 9d7c3686..4409ed1c 100644 --- a/tests/test_matrices.py +++ b/tests/test_matrices.py @@ -617,54 +617,6 @@ def test_indexing_ix_both(mat): np.testing.assert_array_equal(res, expected) -def test_pandas_to_matrix(): - n_rows = 50 - dense_column = np.linspace(-10, 10, num=n_rows, dtype=np.float64) - dense_column_with_lots_of_zeros = dense_column.copy() - dense_column_with_lots_of_zeros[:44] = 0.0 - sparse_column = np.zeros(n_rows, dtype=np.float64) - 
sparse_column[0] = 1.0 - cat_column_lowdim = np.tile(["a", "b"], n_rows // 2) - cat_column_highdim = np.arange(n_rows) - - dense_ser = pd.Series(dense_column) - lowdense_ser = pd.Series(dense_column_with_lots_of_zeros) - sparse_ser = pd.Series(sparse_column, dtype=pd.SparseDtype("float", 0.0)) - cat_ser_lowdim = pd.Categorical(cat_column_lowdim) - cat_ser_highdim = pd.Categorical(cat_column_highdim) - - df = pd.DataFrame( - data={ - "d": dense_ser, - "ds": lowdense_ser, - "s": sparse_ser, - "cl_obj": cat_ser_lowdim.astype(object), - "ch": cat_ser_highdim, - } - ) - - mat = tm.from_pandas( - df, dtype=np.float64, sparse_threshold=0.3, cat_threshold=4, object_as_cat=True - ) - - assert mat.shape == (n_rows, n_rows + 5) - assert len(mat.matrices) == 3 - assert isinstance(mat, tm.SplitMatrix) - - nb_col_by_type = { - tm.DenseMatrix: 3, # includes low-dimension categorical - tm.SparseMatrix: 2, # sparse column - tm.CategoricalMatrix: n_rows, - } - for submat in mat.matrices: - assert submat.shape[1] == nb_col_by_type[type(submat)] - - # Prevent a regression where the column type of sparsified dense columns - # was being changed in place. 
- assert df["cl_obj"].dtype == object - assert df["ds"].dtype == np.float64 - - @pytest.mark.parametrize("mat", get_all_matrix_base_subclass_mats()) def test_split_matrix_creation(mat): sm = tm.SplitMatrix(matrices=[mat, mat]) @@ -839,61 +791,3 @@ def test_combine_names(mat_1, mat_2): assert combined.column_names == mat_1.column_names + mat_2.column_names assert combined.term_names == mat_1.term_names + mat_2.term_names - - -@pytest.mark.parametrize("prefix_sep", ["_", ": "]) -@pytest.mark.parametrize("drop_first", [True, False]) -def test_names_pandas(prefix_sep, drop_first): - n_rows = 50 - dense_column = np.linspace(-10, 10, num=n_rows, dtype=np.float64) - dense_column_with_lots_of_zeros = dense_column.copy() - dense_column_with_lots_of_zeros[:44] = 0.0 - sparse_column = np.zeros(n_rows, dtype=np.float64) - sparse_column[0] = 1.0 - cat_column_lowdim = np.tile(["a", "b"], n_rows // 2) - cat_column_highdim = np.arange(n_rows) - - dense_ser = pd.Series(dense_column) - lowdense_ser = pd.Series(dense_column_with_lots_of_zeros) - sparse_ser = pd.Series(sparse_column, dtype=pd.SparseDtype("float", 0.0)) - cat_ser_lowdim = pd.Categorical(cat_column_lowdim) - cat_ser_highdim = pd.Categorical(cat_column_highdim) - - df = pd.DataFrame( - data={ - "d": dense_ser, - "cl_obj": cat_ser_lowdim.astype(object), - "ch": cat_ser_highdim, - "ds": lowdense_ser, - "s": sparse_ser, - } - ) - - categorical_format = "{name}" + prefix_sep + "{category}" - mat_end = tm.from_pandas( - df, - dtype=np.float64, - sparse_threshold=0.3, - cat_threshold=4, - object_as_cat=True, - cat_position="end", - categorical_format=categorical_format, - drop_first=drop_first, - ) - - expanded_df = pd.get_dummies(df, prefix_sep=prefix_sep, drop_first=drop_first) - assert mat_end.column_names == expanded_df.columns.tolist() - - mat_expand = tm.from_pandas( - df, - dtype=np.float64, - sparse_threshold=0.3, - cat_threshold=4, - object_as_cat=True, - cat_position="expand", - categorical_format=categorical_format, 
- drop_first=drop_first, - ) - - unique_terms = list(dict.fromkeys(mat_expand.term_names)) - assert unique_terms == df.columns.tolist() diff --git a/tests/test_split_matrix.py b/tests/test_split_matrix.py index 7b1b3ed2..da8d2452 100644 --- a/tests/test_split_matrix.py +++ b/tests/test_split_matrix.py @@ -306,17 +306,3 @@ def test_matvec(n_rows): ) mat = from_pandas(X, cat_threshold=0) np.testing.assert_allclose(mat.matvec(np.array(mat.shape[1] * [1])), n_cols) - - -@pytest.mark.parametrize("cat_missing_method", ["fail", "zero", "convert"]) -def test_from_pandas_missing(cat_missing_method): - df = pd.DataFrame({"cat": pd.Categorical([1, 2, pd.NA, 1, 2, pd.NA])}) - if cat_missing_method == "fail": - with pytest.raises( - ValueError, match="Categorical data can't have missing values" - ): - from_pandas(df, cat_missing_method=cat_missing_method) - elif cat_missing_method == "zero": - assert from_pandas(df, cat_missing_method=cat_missing_method).shape == (6, 2) - elif cat_missing_method == "convert": - assert from_pandas(df, cat_missing_method=cat_missing_method).shape == (6, 3) From f80a9770d9a8d24be94e70064ceb6e3d7e9300c0 Mon Sep 17 00:00:00 2001 From: lbittarello Date: Mon, 17 Jun 2024 10:03:39 +0100 Subject: [PATCH 05/13] Change log --- CHANGELOG.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index d8d463fc..c94cb673 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -10,6 +10,10 @@ Changelog 4.0.1 - 2024-06-25 ------------------ +**New features:** + +- Added a new function, :func:`tabmat.from_polars`, to convert a :class:`polars.DataFrame` into a :class:`tabmat.SplitMatrix`. + **Other changes:** - Removed reference to the ``.A`` attribute and replaced it with ``.toarray()``. 
From fd872c182f750f30dbf6a75df338fc896c510dae Mon Sep 17 00:00:00 2001 From: lbittarello Date: Mon, 17 Jun 2024 10:10:27 +0100 Subject: [PATCH 06/13] Patch --- CHANGELOG.rst | 2 +- environment-win.yml | 1 + environment.yml | 3 ++- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index c94cb673..6c41748d 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -10,7 +10,7 @@ Changelog 4.0.1 - 2024-06-25 ------------------ -**New features:** +**New feature:** - Added a new function, :func:`tabmat.from_polars`, to convert a :class:`polars.DataFrame` into a :class:`tabmat.SplitMatrix`. diff --git a/environment-win.yml b/environment-win.yml index 712f0b46..82b88bd1 100644 --- a/environment-win.yml +++ b/environment-win.yml @@ -8,6 +8,7 @@ dependencies: - numpy - pandas - polars + - pyarrow # exclusively for polars tests - scipy # development tools diff --git a/environment.yml b/environment.yml index c8184764..196c5460 100644 --- a/environment.yml +++ b/environment.yml @@ -5,8 +5,9 @@ channels: dependencies: - formulaic>=0.6 - numpy - - polars - pandas + - polars + - pyarrow # exclusively for polars tests - scipy # development tools From f1728f3c28cb8b4aa4f06e5c23fab0711569d689 Mon Sep 17 00:00:00 2001 From: lbittarello Date: Mon, 17 Jun 2024 13:56:37 +0100 Subject: [PATCH 07/13] Dependency --- CHANGELOG.rst | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 6c41748d..4befa7e5 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -17,7 +17,7 @@ Changelog **Other changes:** - Removed reference to the ``.A`` attribute and replaced it with ``.toarray()``. -- Add support between formulaic and pandas 3.0 +- Add support between formulaic and pandas 3.0. 
- Support pypi release for numpy 2.0 4.0.0 - 2024-04-23 diff --git a/setup.py b/setup.py index 3792c27a..35e34e0c 100644 --- a/setup.py +++ b/setup.py @@ -157,7 +157,7 @@ ], package_dir={"": "src"}, packages=find_packages(where="src"), - install_requires=["numpy", "pandas", "scipy", "formulaic>=0.6"], + install_requires=["numpy", "pandas", "polars", "scipy", "formulaic>=0.6"], python_requires=">=3.9", ext_modules=cythonize( ext_modules, From 530b519b1ef68877156227a485829125850a6c4d Mon Sep 17 00:00:00 2001 From: lbittarello Date: Tue, 18 Jun 2024 12:22:19 +0100 Subject: [PATCH 08/13] Helpers --- src/tabmat/constructor.py | 80 +++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 46 deletions(-) diff --git a/src/tabmat/constructor.py b/src/tabmat/constructor.py index a337a67e..cce46433 100644 --- a/src/tabmat/constructor.py +++ b/src/tabmat/constructor.py @@ -151,36 +151,16 @@ def from_pandas( f"Columns {ignored_cols} were ignored. Make sure they have a valid dtype." ) if dense_columns: - matrices.append( - DenseMatrix( - df[dense_columns].astype(dtype), - column_names=dense_columns, - term_names=dense_columns, - ) - ) + matrices.append(_dense_matrix(df, dense_columns, dtype)) indices.append(dense_indices) is_cat.append(False) if sparse_columns: - matrices.append( - SparseMatrix( - sps.coo_matrix(df[sparse_columns], dtype=dtype), - dtype=dtype, - column_names=sparse_columns, - term_names=sparse_columns, - ) - ) + matrices.append(_sparse_matrix(df, sparse_columns, dtype)) indices.append(sparse_indices) is_cat.append(False) if cat_position == "end": - new_indices = [] - for mat_indices, is_cat_ in zip(indices, is_cat): - if is_cat_: - new_indices.append(np.asarray(mat_indices) + mxcolidx) - mxcolidx += len(mat_indices) - else: - new_indices.append(mat_indices) - indices = new_indices + indices = _reindex_cat(indices, is_cat, mxcolidx) if len(matrices) > 1: return SplitMatrix(matrices, indices) @@ -313,36 +293,16 @@ def from_polars( f"Columns 
{ignored_cols} were ignored. Make sure they have a valid dtype." ) if dense_columns: - matrices.append( - DenseMatrix( - df[dense_columns].to_numpy().astype(dtype), - column_names=dense_columns, - term_names=dense_columns, - ) - ) + matrices.append(_dense_matrix(df, dense_columns, dtype)) indices.append(dense_indices) is_cat.append(False) if sparse_columns: - matrices.append( - SparseMatrix( - sps.coo_matrix(df[sparse_columns], dtype=dtype), - dtype=dtype, - column_names=sparse_columns, - term_names=sparse_columns, - ) - ) + matrices.append(_sparse_matrix(df, sparse_columns, dtype)) indices.append(sparse_indices) is_cat.append(False) if cat_position == "end": - new_indices = [] - for mat_indices, is_cat_ in zip(indices, is_cat): - if is_cat_: - new_indices.append(np.asarray(mat_indices) + mxcolidx) - mxcolidx += len(mat_indices) - else: - new_indices.append(mat_indices) - indices = new_indices + indices = _reindex_cat(indices, is_cat, mxcolidx) if len(matrices) > 1: return SplitMatrix(matrices, indices) @@ -352,6 +312,34 @@ def from_polars( return matrices[0] +def _dense_matrix(df, dense_columns, dtype): + return DenseMatrix( + df[dense_columns].to_numpy().astype(dtype), + column_names=dense_columns, + term_names=dense_columns, + ) + + +def _reindex_cat(indices, is_cat, mxcolidx): + new_indices = [] + for mat_indices, is_cat_ in zip(indices, is_cat): + if is_cat_: + new_indices.append(np.asarray(mat_indices) + mxcolidx) + mxcolidx = mxcolidx + len(mat_indices) + else: + new_indices.append(mat_indices) + return new_indices + + +def _sparse_matrix(df, sparse_columns, dtype): + return SparseMatrix( + sps.coo_matrix(df[sparse_columns], dtype=dtype), + dtype=dtype, + column_names=sparse_columns, + term_names=sparse_columns, + ) + + def from_csc(mat: sps.csc_matrix, threshold=0.1, column_names=None, term_names=None): """ Convert a CSC-format sparse matrix into a ``SplitMatrix``. 
From 90e43cbdcb7b6d2d27fd624191a27e827e83c1cf Mon Sep 17 00:00:00 2001 From: lbittarello Date: Tue, 18 Jun 2024 12:37:34 +0100 Subject: [PATCH 09/13] Simplify tests --- tests/test_constructor.py | 147 ++++++++++++-------------------------- 1 file changed, 46 insertions(+), 101 deletions(-) diff --git a/tests/test_constructor.py b/tests/test_constructor.py index 0fc4b9fa..ea5c6c4c 100644 --- a/tests/test_constructor.py +++ b/tests/test_constructor.py @@ -5,93 +5,84 @@ import tabmat as tm +N_ROWS = 50 -def test_pandas_to_matrix(): - n_rows = 50 - dense_column = np.linspace(-10, 10, num=n_rows, dtype=np.float64) + +def construct_data(backend): + dense_column = np.linspace(-10, 10, num=N_ROWS, dtype=np.float64) dense_column_with_lots_of_zeros = dense_column.copy() dense_column_with_lots_of_zeros[:44] = 0.0 - sparse_column = np.zeros(n_rows, dtype=np.float64) + sparse_column = np.zeros(N_ROWS, dtype=np.float64) sparse_column[0] = 1.0 - cat_column_lowdim = np.tile(["a", "b"], n_rows // 2) - cat_column_highdim = np.arange(n_rows) - - dense_ser = pd.Series(dense_column) - lowdense_ser = pd.Series(dense_column_with_lots_of_zeros) - sparse_ser = pd.Series(sparse_column, dtype=pd.SparseDtype("float", 0.0)) - cat_ser_lowdim = pd.Categorical(cat_column_lowdim) - cat_ser_highdim = pd.Categorical(cat_column_highdim) - - df = pd.DataFrame( - data={ - "d": dense_ser, - "ds": lowdense_ser, - "s": sparse_ser, - "cl_obj": cat_ser_lowdim.astype(object), - "ch": cat_ser_highdim, - } - ) + cat_column_lowdim = np.tile(["a", "b"], N_ROWS // 2) + cat_column_highdim = np.arange(N_ROWS) + + data = { + "d": dense_column, + "ds": dense_column_with_lots_of_zeros, + "s": sparse_column, + "cl": cat_column_lowdim, + "ch": cat_column_highdim, + } + + if backend == "pandas": + data["s"] = pd.Series(data["s"], dtype=pd.SparseDtype("float", 0.0)) + data["cl"] = cat_column_lowdim.astype("object") + data["ch"] = pd.Categorical(cat_column_highdim) + + return pd.DataFrame(data) + + if backend == "polars": 
+ data["cl"] = pl.Series(cat_column_lowdim, dtype=pl.Categorical) + data["ch"] = pl.Series(cat_column_highdim.astype("str"), dtype=pl.Categorical) + + return pl.DataFrame(data) + + raise ValueError + + +def test_pandas_to_matrix(): + df = construct_data("pandas") mat = tm.from_pandas( df, dtype=np.float64, sparse_threshold=0.3, cat_threshold=4, object_as_cat=True ) - assert mat.shape == (n_rows, n_rows + 5) + assert mat.shape == (N_ROWS, N_ROWS + 5) assert len(mat.matrices) == 3 assert isinstance(mat, tm.SplitMatrix) nb_col_by_type = { tm.DenseMatrix: 3, # includes low-dimension categorical tm.SparseMatrix: 2, # sparse column - tm.CategoricalMatrix: n_rows, + tm.CategoricalMatrix: N_ROWS, } + for submat in mat.matrices: assert submat.shape[1] == nb_col_by_type[type(submat)] # Prevent a regression where the column type of sparsified dense columns # was being changed in place. - assert df["cl_obj"].dtype == object + assert df["cl"].dtype == object assert df["ds"].dtype == np.float64 @pytest.mark.parametrize("categorical_dtype", [pl.Categorical, pl.Enum(["a", "b"])]) def test_polars_to_matrix(categorical_dtype): - n_rows = 50 - dense_column = np.linspace(-10, 10, num=n_rows, dtype=np.float64) - dense_column_with_lots_of_zeros = dense_column.copy() - dense_column_with_lots_of_zeros[:44] = 0.0 - sparse_column = np.zeros(n_rows, dtype=np.float64) - sparse_column[0] = 1.0 - cat_column_lowdim = np.tile(["a", "b"], n_rows // 2) - cat_column_highdim = np.arange(n_rows).astype("str") - - dense_ser = pl.Series(dense_column) - lowdense_ser = pl.Series(dense_column_with_lots_of_zeros) - sparse_ser = pl.Series(sparse_column) - cat_ser_lowdim = pl.Series(cat_column_lowdim, dtype=categorical_dtype) - cat_ser_highdim = pl.Series(cat_column_highdim, dtype=pl.Categorical) - - df = pl.DataFrame( - data={ - "d": dense_ser, - "ds": lowdense_ser, - "s": sparse_ser, - "cl": cat_ser_lowdim, - "ch": cat_ser_highdim, - } - ) + df = 
construct_data("polars").with_columns(cl=pl.col("cl").cast(categorical_dtype)) mat = tm.from_polars(df, dtype=np.float64, sparse_threshold=0.3, cat_threshold=4) - assert mat.shape == (n_rows, n_rows + 5) + assert mat.shape == (N_ROWS, N_ROWS + 5) assert len(mat.matrices) == 3 assert isinstance(mat, tm.SplitMatrix) nb_col_by_type = { tm.DenseMatrix: 3, # includes low-dimension categorical tm.SparseMatrix: 2, # sparse column - tm.CategoricalMatrix: n_rows, + tm.CategoricalMatrix: N_ROWS, } + for submat in mat.matrices: assert submat.shape[1] == nb_col_by_type[type(submat)] @@ -136,32 +127,9 @@ def test_from_polars_missing(cat_missing_method): @pytest.mark.parametrize("prefix_sep", ["_", ": "]) @pytest.mark.parametrize("drop_first", [True, False]) def test_names_pandas(prefix_sep, drop_first): - n_rows = 50 - dense_column = np.linspace(-10, 10, num=n_rows, dtype=np.float64) - dense_column_with_lots_of_zeros = dense_column.copy() - dense_column_with_lots_of_zeros[:44] = 0.0 - sparse_column = np.zeros(n_rows, dtype=np.float64) - sparse_column[0] = 1.0 - cat_column_lowdim = np.tile(["a", "b"], n_rows // 2) - cat_column_highdim = np.arange(n_rows) - - dense_ser = pd.Series(dense_column) - lowdense_ser = pd.Series(dense_column_with_lots_of_zeros) - sparse_ser = pd.Series(sparse_column, dtype=pd.SparseDtype("float", 0.0)) - cat_ser_lowdim = pd.Categorical(cat_column_lowdim) - cat_ser_highdim = pd.Categorical(cat_column_highdim) - - df = pd.DataFrame( - data={ - "d": dense_ser, - "cl_obj": cat_ser_lowdim.astype(object), - "ch": cat_ser_highdim, - "ds": lowdense_ser, - "s": sparse_ser, - } - ) - + df = construct_data("pandas") categorical_format = "{name}" + prefix_sep + "{category}" + mat_end = tm.from_pandas( df, dtype=np.float64, @@ -194,32 +162,9 @@ def test_names_pandas(prefix_sep, drop_first): @pytest.mark.parametrize("prefix_sep", ["_", ": "]) @pytest.mark.parametrize("drop_first", [True, False]) def test_names_polars(prefix_sep, drop_first): - n_rows = 50 - 
dense_column = np.linspace(-10, 10, num=n_rows, dtype=np.float64) - dense_column_with_lots_of_zeros = dense_column.copy() - dense_column_with_lots_of_zeros[:44] = 0.0 - sparse_column = np.zeros(n_rows, dtype=np.float64) - sparse_column[0] = 1.0 - cat_column_lowdim = np.tile(["a", "b"], n_rows // 2) - cat_column_highdim = np.arange(n_rows).astype("str") - - dense_ser = pl.Series(dense_column) - lowdense_ser = pl.Series(dense_column_with_lots_of_zeros) - sparse_ser = pl.Series(sparse_column) - cat_ser_lowdim = pl.Series(cat_column_lowdim, dtype=pl.Categorical) - cat_ser_highdim = pl.Series(cat_column_highdim, dtype=pl.Categorical) - - df = pl.DataFrame( - data={ - "d": dense_ser, - "ds": lowdense_ser, - "s": sparse_ser, - "cl": cat_ser_lowdim, - "ch": cat_ser_highdim, - } - ) - + df = construct_data("polars") categorical_format = "{name}" + prefix_sep + "{category}" + mat_end = tm.from_polars( df, dtype=np.float64, From 9100ddcea99263e77bf6444a205dc502a376fe9b Mon Sep 17 00:00:00 2001 From: lbittarello Date: Tue, 18 Jun 2024 14:58:21 +0100 Subject: [PATCH 10/13] It's all optional --- conda.recipe/meta.yaml | 2 -- setup.py | 2 +- src/tabmat/categorical_matrix.py | 56 ++++++++++++++++++++++++++------ src/tabmat/constructor.py | 15 +++++---- 4 files changed, 55 insertions(+), 20 deletions(-) diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml index 3653bfd0..7636b11b 100644 --- a/conda.recipe/meta.yaml +++ b/conda.recipe/meta.yaml @@ -37,8 +37,6 @@ requirements: - python - {{ pin_compatible('numpy') }} - formulaic>=0.6 - - pandas - - polars - scipy test: diff --git a/setup.py b/setup.py index 35e34e0c..324d5f70 100644 --- a/setup.py +++ b/setup.py @@ -157,7 +157,7 @@ ], package_dir={"": "src"}, packages=find_packages(where="src"), - install_requires=["numpy", "pandas", "polars", "scipy", "formulaic>=0.6"], + install_requires=["formulaic>=0.6", "numpy", "pandas", "polars", "scipy"], python_requires=">=3.9", ext_modules=cythonize( ext_modules, diff --git 
a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py index 83e62b06..cb149de7 100644 --- a/src/tabmat/categorical_matrix.py +++ b/src/tabmat/categorical_matrix.py @@ -162,6 +162,7 @@ def matvec(mat, vec): """ +import importlib.util import re import warnings from typing import Optional, Union @@ -194,6 +195,20 @@ def matvec(mat, vec): setup_restrictions, ) +if importlib.util.find_spec("pandas"): + import pandas as pd +if importlib.util.find_spec("polars"): + import polars as pl + + +class _Categorical: + """This class helps us avoid copies while subsetting.""" + + def __init__(self, indices, categories, input_type): + self.indices = indices + self.categories = categories + self.input_type = input_type + def _is_indexer_full_length(full_length: int, indexer: Union[slice, np.ndarray]): if isinstance(indexer, np.ndarray): @@ -206,6 +221,18 @@ def _is_indexer_full_length(full_length: int, indexer: Union[slice, np.ndarray]) return len(range(*indexer.indices(full_length))) == full_length +def _is_pandas(x) -> bool: + if importlib.util.find_spec("pandas"): + return isinstance(x, (pd.Categorical, pd.CategoricalDtype)) + return False + + +def _is_polars(x) -> bool: + if importlib.util.find_spec("polars"): + return isinstance(x, (pl.Series, pl.Categorical, pl.Enum)) + return False + + def _row_col_indexing( arr: np.ndarray, rows: Optional[np.ndarray], cols: Optional[np.ndarray] ) -> np.ndarray: @@ -258,7 +285,7 @@ class CategoricalMatrix(MatrixBase): def __init__( self, - cat_vec: Union[list, np.ndarray, pd.Categorical, pd.Series, pl.Series], + cat_vec, drop_first: bool = False, dtype: np.dtype = np.float64, column_name: Optional[str] = None, @@ -273,20 +300,25 @@ def __init__( f" got {cat_missing_method}." 
) + self._input_type = cat_vec.dtype self._missing_method = cat_missing_method self._missing_category = cat_missing_name if not isinstance(cat_vec, (pd.Categorical, pl.Series, pd.Series)): cat_vec = np.asanyarray(cat_vec) - if isinstance(cat_vec, pd.Categorical): + if isinstance(cat_vec, _Categorical): + indices = cat_vec.indices + self.categories = cat_vec.categories + self._input_type = cat_vec.input_type + elif _is_pandas(cat_vec): self.categories = cat_vec.categories.to_numpy() indices = cat_vec.codes - elif isinstance(cat_vec.dtype, pd.CategoricalDtype): + elif _is_pandas(cat_vec.dtype): self.categories = cat_vec.cat.categories.to_numpy() indices = cat_vec.cat.codes.to_numpy() - elif isinstance(cat_vec, pl.Series): - if not isinstance(cat_vec.dtype, (pl.Categorical, pl.Enum)): + elif _is_polars(cat_vec): + if not _is_polars(cat_vec.dtype): cat_vec = cat_vec.cast(pl.Categorical) self.categories = cat_vec.cat.get_categories().to_numpy() indices = cat_vec.to_physical().fill_null(-1).to_numpy() @@ -321,7 +353,7 @@ def __init__( self._has_missings = False self.drop_first = drop_first - self.indices = indices.astype(np.int32) + self.indices = indices.astype(np.int32, copy=False) self.shape = (len(self.indices), len(self.categories) - int(drop_first)) self.x_csc = None self.dtype = np.dtype(dtype) @@ -338,7 +370,7 @@ def __init__( @property def cat(self): - """Return a pandas array with same data as what was initially fed to __init__. + """Return a series with same data as what was initially fed to __init__. This property is available for backward compatibility. 
""" @@ -346,6 +378,12 @@ def cat(self): "This property will be removed in the next major release.", category=DeprecationWarning, ) + + if _is_polars(self._input_type): + out = self.categories[self.indices].astype("object", copy=False) + out = np.where(self.indices < 0, None, out) + return pl.Series(out, dtype=pl.Enum(self.categories)) + return pd.Categorical.from_codes(self.indices, categories=self.categories) def recover_orig(self) -> np.ndarray: @@ -640,9 +678,7 @@ def __getitem__(self, item): if isinstance(row, np.ndarray): row = row.ravel() return CategoricalMatrix( - pd.Categorical.from_codes( - self.indices[row], categories=self.categories - ), + _Categorical(self.indices[row], self.categories, self._input_type), drop_first=self.drop_first, dtype=self.dtype, column_name=self._colname, diff --git a/src/tabmat/constructor.py b/src/tabmat/constructor.py index cce46433..7011e181 100644 --- a/src/tabmat/constructor.py +++ b/src/tabmat/constructor.py @@ -4,13 +4,10 @@ from typing import Any, Optional, Union import numpy as np -import pandas as pd -import polars as pl from formulaic import Formula, ModelSpec from formulaic.materializers.types import NAAction from formulaic.parser import DefaultFormulaParser from formulaic.utils.layered_mapping import LayeredMapping -from pandas.api.types import is_numeric_dtype from scipy import sparse as sps from .categorical_matrix import CategoricalMatrix @@ -23,7 +20,7 @@ def from_pandas( - df: pd.DataFrame, + df, dtype: np.dtype = np.float64, sparse_threshold: float = 0.1, cat_threshold: int = 4, @@ -75,6 +72,8 @@ def from_pandas( ------- SplitMatrix """ + import pandas as pd + matrices: list[Union[DenseMatrix, SparseMatrix, CategoricalMatrix]] = [] indices: list[list[int]] = [] is_cat: list[bool] = [] @@ -133,7 +132,7 @@ def from_pandas( mxcolidx += cat.shape[1] elif cat_position == "end": indices.append(np.arange(cat.shape[1])) - elif is_numeric_dtype(coldata): + elif pd.api.types.is_numeric_dtype(coldata): if (coldata != 
0).mean() <= sparse_threshold: sparse_columns.append(colname) sparse_indices.append(mxcolidx) @@ -171,7 +170,7 @@ def from_pandas( def from_polars( - df: pl.DataFrame, + df, dtype: np.dtype = np.float64, sparse_threshold: float = 0.1, cat_threshold: int = 4, @@ -219,6 +218,8 @@ def from_polars( ------- SplitMatrix """ + import polars as pl + matrices: list[Union[DenseMatrix, SparseMatrix, CategoricalMatrix]] = [] indices: list[list[int]] = [] is_cat: list[bool] = [] @@ -353,7 +354,7 @@ def from_csc(mat: sps.csc_matrix, threshold=0.1, column_names=None, term_names=N def from_formula( formula: Union[str, Formula], - data: pd.DataFrame, + data, ensure_full_rank: bool = False, na_action: Union[str, NAAction] = NAAction.IGNORE, dtype: np.dtype = np.float64, From 7b2129603617e06844275d2575731b2d18d6efa2 Mon Sep 17 00:00:00 2001 From: lbittarello Date: Tue, 18 Jun 2024 15:07:43 +0100 Subject: [PATCH 11/13] Patch --- setup.py | 2 +- src/tabmat/categorical_matrix.py | 22 ++++++++++------------ 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/setup.py b/setup.py index 324d5f70..10a0a807 100644 --- a/setup.py +++ b/setup.py @@ -157,7 +157,7 @@ ], package_dir={"": "src"}, packages=find_packages(where="src"), - install_requires=["formulaic>=0.6", "numpy", "pandas", "polars", "scipy"], + install_requires=["formulaic>=0.6", "numpy", "scipy"], python_requires=">=3.9", ext_modules=cythonize( ext_modules, diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py index cb149de7..684704e9 100644 --- a/src/tabmat/categorical_matrix.py +++ b/src/tabmat/categorical_matrix.py @@ -168,8 +168,6 @@ def matvec(mat, vec): from typing import Optional, Union import numpy as np -import pandas as pd -import polars as pl from scipy import sparse as sps from .dense_matrix import DenseMatrix @@ -204,10 +202,10 @@ def matvec(mat, vec): class _Categorical: """This class helps us avoid copies while subsetting.""" - def __init__(self, indices, categories, 
input_type): + def __init__(self, indices, categories, dtype): self.indices = indices self.categories = categories - self.input_type = input_type + self.dtype = dtype def _is_indexer_full_length(full_length: int, indexer: Union[slice, np.ndarray]): @@ -294,23 +292,23 @@ def __init__( cat_missing_method: str = "fail", cat_missing_name: str = "(MISSING)", ): - if cat_missing_method not in ["fail", "zero", "convert"]: + if cat_missing_method not in {"fail", "zero", "convert"}: raise ValueError( "cat_missing_method must be one of 'fail' 'zero' or 'convert'; " f" got {cat_missing_method}." ) - self._input_type = cat_vec.dtype + if not hasattr(cat_vec, "dtype"): + cat_vec = np.array(cat_vec) # avoid errors in pd.factorize + + self._input_dtype = cat_vec.dtype self._missing_method = cat_missing_method self._missing_category = cat_missing_name - if not isinstance(cat_vec, (pd.Categorical, pl.Series, pd.Series)): - cat_vec = np.asanyarray(cat_vec) - if isinstance(cat_vec, _Categorical): indices = cat_vec.indices self.categories = cat_vec.categories - self._input_type = cat_vec.input_type + self._input_dtype = cat_vec.dtype elif _is_pandas(cat_vec): self.categories = cat_vec.categories.to_numpy() indices = cat_vec.codes @@ -379,7 +377,7 @@ def cat(self): category=DeprecationWarning, ) - if _is_polars(self._input_type): + if _is_polars(self._input_dtype): out = self.categories[self.indices].astype("object", copy=False) out = np.where(self.indices < 0, None, out) return pl.Series(out, dtype=pl.Enum(self.categories)) @@ -678,7 +676,7 @@ def __getitem__(self, item): if isinstance(row, np.ndarray): row = row.ravel() return CategoricalMatrix( - _Categorical(self.indices[row], self.categories, self._input_type), + _Categorical(self.indices[row], self.categories, self._input_dtype), drop_first=self.drop_first, dtype=self.dtype, column_name=self._colname, From e2d059e7966924af6a4f69de349385cd1033072d Mon Sep 17 00:00:00 2001 From: lbittarello Date: Wed, 19 Jun 2024 15:29:30 +0100 
Subject: [PATCH 12/13] Docstrings [skip ci] --- src/tabmat/constructor.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/tabmat/constructor.py b/src/tabmat/constructor.py index 7011e181..c7e82f0e 100644 --- a/src/tabmat/constructor.py +++ b/src/tabmat/constructor.py @@ -32,13 +32,12 @@ def from_pandas( cat_missing_name: str = "(MISSING)", ) -> MatrixBase: """ - Transform a pandas.DataFrame into an efficient SplitMatrix. For most users, this - will be the primary way to construct tabmat objects from their data. + Transform a pandas.DataFrame into an efficient SplitMatrix. Parameters ---------- df : pd.DataFrame - pandas DataFrame to be converted. + pandas DataFrame to convert. dtype : np.dtype, default np.float64 dtype of all sub-matrices of the resulting SplitMatrix. sparse_threshold : float, default 0.1 @@ -181,13 +180,12 @@ def from_polars( cat_missing_name: str = "(MISSING)", ) -> MatrixBase: """ - Transform a polars.DataFrame into an efficient SplitMatrix. For most users, this - will be the primary way to construct tabmat objects from their data. + Transform a polars.DataFrame into an efficient SplitMatrix. Parameters ---------- df : pl.DataFrame - Polars DataFrame to be converted. + Polars DataFrame to convert. dtype : np.dtype, default np.float64 dtype of all sub-matrices of the resulting SplitMatrix. 
sparse_threshold : float, default 0.1 From f83e918b60af44614933099da9b34ecf0b1c5bcc Mon Sep 17 00:00:00 2001 From: lbittarello Date: Tue, 25 Jun 2024 09:57:01 +0100 Subject: [PATCH 13/13] Helper function --- src/tabmat/categorical_matrix.py | 38 ++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py index 684704e9..33269439 100644 --- a/src/tabmat/categorical_matrix.py +++ b/src/tabmat/categorical_matrix.py @@ -231,6 +231,26 @@ def _is_polars(x) -> bool: return False +def _extract_codes_and_categories(cat_vec): + if isinstance(cat_vec, _Categorical): + categories = cat_vec.categories + indices = cat_vec.indices + elif _is_pandas(cat_vec): + categories = cat_vec.categories.to_numpy() + indices = cat_vec.codes + elif _is_pandas(cat_vec.dtype): + categories = cat_vec.cat.categories.to_numpy() + indices = cat_vec.cat.codes.to_numpy() + elif _is_polars(cat_vec): + if not _is_polars(cat_vec.dtype): + cat_vec = cat_vec.cast(pl.Categorical) + categories = cat_vec.cat.get_categories().to_numpy() + indices = cat_vec.to_physical().fill_null(-1).to_numpy() + else: + indices, categories = pd.factorize(cat_vec, sort=True) + return indices, categories + + def _row_col_indexing( arr: np.ndarray, rows: Optional[np.ndarray], cols: Optional[np.ndarray] ) -> np.ndarray: @@ -305,23 +325,7 @@ def __init__( self._missing_method = cat_missing_method self._missing_category = cat_missing_name - if isinstance(cat_vec, _Categorical): - indices = cat_vec.indices - self.categories = cat_vec.categories - self._input_dtype = cat_vec.dtype - elif _is_pandas(cat_vec): - self.categories = cat_vec.categories.to_numpy() - indices = cat_vec.codes - elif _is_pandas(cat_vec.dtype): - self.categories = cat_vec.cat.categories.to_numpy() - indices = cat_vec.cat.codes.to_numpy() - elif _is_polars(cat_vec): - if not _is_polars(cat_vec.dtype): - cat_vec = cat_vec.cast(pl.Categorical) - self.categories = 
cat_vec.cat.get_categories().to_numpy() - indices = cat_vec.to_physical().fill_null(-1).to_numpy() - else: - indices, self.categories = pd.factorize(cat_vec, sort=True) + indices, self.categories = _extract_codes_and_categories(cat_vec) if np.any(indices == -1): if self._missing_method == "fail":