From 8c44035b5cec98b0bad825d4a79718a3db8a44f5 Mon Sep 17 00:00:00 2001
From: lbittarello <luca.bittarello@gmail.com>
Date: Fri, 14 Jun 2024 14:59:36 +0100
Subject: [PATCH 01/13] Environment

---
 conda.recipe/meta.yaml | 3 ++-
 environment-win.yml    | 5 ++++-
 environment.yml        | 5 ++++-
 3 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml
index 7f5d81c9..3653bfd0 100644
--- a/conda.recipe/meta.yaml
+++ b/conda.recipe/meta.yaml
@@ -36,9 +36,10 @@ requirements:
   run:
     - python
     - {{ pin_compatible('numpy') }}
+    - formulaic>=0.6
     - pandas
+    - polars
     - scipy
-    - formulaic>=0.6
 
 test:
   requires:
diff --git a/environment-win.yml b/environment-win.yml
index dd88ce10..712f0b46 100644
--- a/environment-win.yml
+++ b/environment-win.yml
@@ -4,8 +4,11 @@ channels:
   - nodefaults
 dependencies:
   - libblas>=0=*mkl
-  - pandas
   - formulaic>=0.6
+  - numpy
+  - pandas
+  - polars
+  - scipy
 
   # development tools
   - click
diff --git a/environment.yml b/environment.yml
index 10d7d402..c8184764 100644
--- a/environment.yml
+++ b/environment.yml
@@ -3,8 +3,11 @@ channels:
   - conda-forge
   - nodefaults
 dependencies:
-  - pandas
   - formulaic>=0.6
+  - numpy
+  - polars
+  - pandas
+  - scipy
 
   # development tools
   - click

From 2532dc93ce140575fb8f5ed3ccb88bc62aeef3a7 Mon Sep 17 00:00:00 2001
From: lbittarello <luca.bittarello@gmail.com>
Date: Mon, 17 Jun 2024 09:54:00 +0100
Subject: [PATCH 02/13] Categorical matrix

---
 src/tabmat/categorical_matrix.py | 89 +++++++++++++++++++++-----------
 1 file changed, 59 insertions(+), 30 deletions(-)

diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py
index 955fdf30..83e62b06 100644
--- a/src/tabmat/categorical_matrix.py
+++ b/src/tabmat/categorical_matrix.py
@@ -163,12 +163,15 @@ def matvec(mat, vec):
 """
 
 import re
+import warnings
 from typing import Optional, Union
 
 import numpy as np
 import pandas as pd
+import polars as pl
 from scipy import sparse as sps
 
+from .dense_matrix import DenseMatrix
 from .ext.categorical import (
     matvec_complex,
     matvec_fast,
@@ -255,7 +258,7 @@ class CategoricalMatrix(MatrixBase):
 
     def __init__(
         self,
-        cat_vec: Union[list, np.ndarray, pd.Categorical],
+        cat_vec: Union[list, np.ndarray, pd.Categorical, pd.Series, pl.Series],
         drop_first: bool = False,
         dtype: np.dtype = np.float64,
         column_name: Optional[str] = None,
@@ -266,18 +269,31 @@ def __init__(
     ):
         if cat_missing_method not in ["fail", "zero", "convert"]:
             raise ValueError(
-                "cat_missing_method must be one of 'fail' 'zero' or 'convert', "
-                f" got {cat_missing_method}"
+                "cat_missing_method must be one of 'fail' 'zero' or 'convert'; "
+                f" got {cat_missing_method}."
             )
+
         self._missing_method = cat_missing_method
         self._missing_category = cat_missing_name
 
+        if not isinstance(cat_vec, (pd.Categorical, pl.Series, pd.Series)):
+            cat_vec = np.asanyarray(cat_vec)
+
         if isinstance(cat_vec, pd.Categorical):
-            self.cat = cat_vec
+            self.categories = cat_vec.categories.to_numpy()
+            indices = cat_vec.codes
+        elif isinstance(cat_vec.dtype, pd.CategoricalDtype):
+            self.categories = cat_vec.cat.categories.to_numpy()
+            indices = cat_vec.cat.codes.to_numpy()
+        elif isinstance(cat_vec, pl.Series):
+            if not isinstance(cat_vec.dtype, (pl.Categorical, pl.Enum)):
+                cat_vec = cat_vec.cast(pl.Categorical)
+            self.categories = cat_vec.cat.get_categories().to_numpy()
+            indices = cat_vec.to_physical().fill_null(-1).to_numpy()
         else:
-            self.cat = pd.Categorical(cat_vec)
+            indices, self.categories = pd.factorize(cat_vec, sort=True)
 
-        if pd.isnull(self.cat).any():
+        if np.any(indices == -1):
             if self._missing_method == "fail":
                 raise ValueError(
                     "Categorical data can't have missing values "
@@ -285,14 +301,17 @@ def __init__(
                 )
 
             elif self._missing_method == "convert":
-                if self._missing_category in self.cat.categories:
+                if self._missing_category in self.categories:
                     raise ValueError(
                         f"Missing category {self._missing_category} already exists."
                     )
 
-                self.cat = self.cat.add_categories([self._missing_category])
+                self.categories = np.hstack(
+                    [self.categories, self._missing_category], dtype="object"
+                )
+
+                indices = np.where(indices < 0, len(self.categories) - 1, indices)
 
-                self.cat[pd.isnull(self.cat)] = self._missing_category
                 self._has_missings = False
 
             else:
@@ -302,38 +321,50 @@ def __init__(
             self._has_missings = False
 
         self.drop_first = drop_first
-        self.shape = (len(self.cat), len(self.cat.categories) - int(drop_first))
-        self.indices = self.cat.codes.astype(np.int32)
-        self.x_csc: Optional[tuple[Optional[np.ndarray], np.ndarray, np.ndarray]] = None
+        self.indices = indices.astype(np.int32)
+        self.shape = (len(self.indices), len(self.categories) - int(drop_first))
+        self.x_csc = None
         self.dtype = np.dtype(dtype)
 
         self._colname = column_name
+        self._colname_format = column_name_format
+
         if term_name is None:
             self._term = self._colname
         else:
             self._term = term_name
-        self._colname_format = column_name_format
 
     __array_ufunc__ = None
 
+    @property
+    def cat(self):
+        """Return a pandas array with same data as what was initially fed to __init__.
+
+        This property is available for backward compatibility.
+        """
+        warnings.warn(
+            "This property will be removed in the next major release.",
+            category=DeprecationWarning,
+        )
+        return pd.Categorical.from_codes(self.indices, categories=self.categories)
+
     def recover_orig(self) -> np.ndarray:
         """
         Return 1d numpy array with same data as what was initially fed to __init__.
 
         Test: matrix/test_categorical_matrix::test_recover_orig
         """
-        orig = self.cat.categories[self.cat.codes].to_numpy()
+        orig = self.categories[self.indices]
 
         if self._has_missings:
             orig = orig.view(np.ma.MaskedArray)
-            orig.mask = self.cat.codes == -1
+            orig.mask = self.indices == -1
         elif (
             self._missing_method == "convert"
-            and self._missing_category in self.cat.categories
+            and self._missing_category in self.categories
         ):
             orig = orig.view(np.ma.MaskedArray)
-            missing_code = self.cat.categories.get_loc(self._missing_category)
-            orig.mask = self.cat.codes == missing_code
+            orig.mask = self.indices == len(self.categories) - 1
 
         return orig
 
@@ -529,8 +560,6 @@ def _cross_sandwich(
         R_cols: Optional[np.ndarray] = None,
     ) -> np.ndarray:
         """Perform a sandwich product: X.T @ diag(d) @ Y."""
-        from .dense_matrix import DenseMatrix
-
         if isinstance(other, DenseMatrix):
             return self._cross_dense(other._array, d, rows, L_cols, R_cols)
         if isinstance(other, SparseMatrix):
@@ -576,8 +605,6 @@ def tocsr(self) -> sps.csr_matrix:
 
     def to_sparse_matrix(self):
         """Return a tabmat.SparseMatrix representation."""
-        from .sparse_matrix import SparseMatrix
-
         return SparseMatrix(
             self.tocsr(),
             column_names=self.column_names,
@@ -594,7 +621,7 @@ def unpack(self):
 
     def astype(self, dtype, order="K", casting="unsafe", copy=True):
         """Return CategoricalMatrix cast to new type."""
-        self.dtype = dtype
+        self.dtype = np.dtype(dtype)
         return self
 
     def _get_col_stds(self, weights: np.ndarray, col_means: np.ndarray) -> np.ndarray:
@@ -613,7 +640,9 @@ def __getitem__(self, item):
             if isinstance(row, np.ndarray):
                 row = row.ravel()
             return CategoricalMatrix(
-                self.cat[row],
+                pd.Categorical.from_codes(
+                    self.indices[row], categories=self.categories
+                ),
                 drop_first=self.drop_first,
                 dtype=self.dtype,
                 column_name=self._colname,
@@ -745,7 +774,7 @@ def multiply(self, other) -> SparseMatrix:
         )
 
     def __repr__(self):
-        return str(self.cat)
+        return f"{self.__class__.__name__}\nCategories: {self.categories}"
 
     def get_names(
         self,
@@ -786,19 +815,19 @@ def get_names(
             raise ValueError(f"Type must be 'column' or 'term', got {type}")
 
         if indices is None:
-            indices = list(range(len(self.cat.categories) - self.drop_first))
+            indices = list(range(len(self.categories) - self.drop_first))
         if name is None and missing_prefix is None:
-            return [None] * (len(self.cat.categories) - self.drop_first)
+            return [None] * (len(self.categories) - self.drop_first)
         elif name is None:
             name = f"{missing_prefix}{indices[0]}-{indices[-1]}"
 
         if type == "column":
             return [
                 self._colname_format.format(name=name, category=cat)
-                for cat in self.cat.categories[self.drop_first :]
+                for cat in self.categories[self.drop_first :]
             ]
         else:
-            return [name] * (len(self.cat.categories) - self.drop_first)
+            return [name] * (len(self.categories) - self.drop_first)
 
     def set_names(self, names: Union[str, list[Optional[str]]], type: str = "column"):
         """Set column names.
@@ -820,7 +849,7 @@ def set_names(self, names: Union[str, list[Optional[str]]], type: str = "column"
             if type == "column":
                 # Try finding the column name
                 base_names = []
-                for name, cat in zip(names, self.cat.categories[self.drop_first :]):
+                for name, cat in zip(names, self.categories[self.drop_first :]):
                     partial_name = self._colname_format.format(
                         name="__CAPTURE__", category=cat
                     )

From 207c548d8fdc44128777eabfcf3ae8192d166841 Mon Sep 17 00:00:00 2001
From: lbittarello <luca.bittarello@gmail.com>
Date: Mon, 17 Jun 2024 09:58:40 +0100
Subject: [PATCH 03/13] Constructor

---
 setup.py                   |   2 +-
 src/tabmat/__init__.py     |   3 +-
 src/tabmat/constructor.py  | 232 ++++++++++++++++++++++++++++++-------
 src/tabmat/dense_matrix.py |   4 +-
 4 files changed, 197 insertions(+), 44 deletions(-)

diff --git a/setup.py b/setup.py
index 6baffa5b..3792c27a 100644
--- a/setup.py
+++ b/setup.py
@@ -54,7 +54,7 @@
 print(f"Debug Build: {debug_build}")
 
 if sys.platform == "win32":
-    allocator_libs = []
+    allocator_libs = []  # type: ignore
     extra_compile_args = ["/openmp", "/O2"]
     extra_link_args = ["/openmp"]
     # make sure we can find xsimd headers
diff --git a/src/tabmat/__init__.py b/src/tabmat/__init__.py
index bd7ce292..b88fa82f 100644
--- a/src/tabmat/__init__.py
+++ b/src/tabmat/__init__.py
@@ -1,7 +1,7 @@
 import importlib.metadata
 
 from .categorical_matrix import CategoricalMatrix
-from .constructor import from_csc, from_formula, from_pandas
+from .constructor import from_csc, from_formula, from_pandas, from_polars
 from .dense_matrix import DenseMatrix
 from .matrix_base import MatrixBase
 from .sparse_matrix import SparseMatrix
@@ -23,6 +23,7 @@
     "from_csc",
     "from_formula",
     "from_pandas",
+    "from_polars",
     "as_tabmat",
     "hstack",
 ]
diff --git a/src/tabmat/constructor.py b/src/tabmat/constructor.py
index ae7fff5e..a337a67e 100644
--- a/src/tabmat/constructor.py
+++ b/src/tabmat/constructor.py
@@ -5,11 +5,12 @@
 
 import numpy as np
 import pandas as pd
+import polars as pl
 from formulaic import Formula, ModelSpec
 from formulaic.materializers.types import NAAction
 from formulaic.parser import DefaultFormulaParser
 from formulaic.utils.layered_mapping import LayeredMapping
-from pandas.api.types import is_bool_dtype, is_numeric_dtype
+from pandas.api.types import is_numeric_dtype
 from scipy import sparse as sps
 
 from .categorical_matrix import CategoricalMatrix
@@ -78,16 +79,15 @@ def from_pandas(
     indices: list[list[int]] = []
     is_cat: list[bool] = []
 
-    dense_dfidx = []  # column index in original DataFrame
-    dense_mxidx = []  # index in the new SplitMatrix
-    sparse_dfcols = []  # sparse columns to join together
-    sparse_mxidx = []  # index in the new SplitMatrix
+    dense_columns = []  # names of dense columns in the original DataFrame
+    dense_indices = []  # index in the new SplitMatrix
+    sparse_columns = []  # sparse columns to join together
+    sparse_indices = []  # index in the new SplitMatrix
     ignored_cols = []
 
     mxcolidx = 0
 
-    for dfcolidx, (colname, coldata) in enumerate(df.items()):
-        # categorical
+    for colname, coldata in df.items():
         if object_as_cat and coldata.dtype == object:
             coldata = coldata.astype("category")
         if isinstance(coldata.dtype, pd.CategoricalDtype):
@@ -101,12 +101,12 @@ def from_pandas(
                 cat_missing_method=cat_missing_method,
                 cat_missing_name=cat_missing_name,
             )
-            if len(coldata.cat.categories) < cat_threshold:
+            if len(cat.categories) < cat_threshold:
                 (
                     X_dense_F,
                     X_sparse,
-                    dense_indices,
-                    sparse_indices,
+                    dense_idx,
+                    sparse_idx,
                 ) = _split_sparse_and_dense_parts(
                     sps.csc_matrix(cat.tocsr(), dtype=dtype),
                     threshold=sparse_threshold,
@@ -118,12 +118,12 @@ def from_pandas(
                 matrices.append(X_sparse)
                 is_cat.append(True)
                 if cat_position == "expand":
-                    indices.append(mxcolidx + dense_indices)
-                    indices.append(mxcolidx + sparse_indices)
-                    mxcolidx += len(dense_indices) + len(sparse_indices)
+                    indices.append(mxcolidx + dense_idx)
+                    indices.append(mxcolidx + sparse_idx)
+                    mxcolidx += len(dense_idx) + len(sparse_idx)
                 elif cat_position == "end":
-                    indices.append(dense_indices)
-                    indices.append(sparse_indices)
+                    indices.append(dense_idx)
+                    indices.append(sparse_idx)
 
             else:
                 matrices.append(cat)
@@ -133,53 +133,205 @@ def from_pandas(
                     mxcolidx += cat.shape[1]
                 elif cat_position == "end":
                     indices.append(np.arange(cat.shape[1]))
-        # All other numerical dtypes (needs to be after pd.SparseDtype)
         elif is_numeric_dtype(coldata):
-            # check if we want to store as sparse
             if (coldata != 0).mean() <= sparse_threshold:
-                if not isinstance(coldata.dtype, pd.SparseDtype):
-                    fill_value = False if is_bool_dtype(coldata) else 0  # type: ignore
-                    sparse_dtype = pd.SparseDtype(coldata.dtype, fill_value=fill_value)
-                    sparse_dfcols.append(coldata.astype(sparse_dtype))
-                else:
-                    sparse_dfcols.append(coldata)
-                sparse_mxidx.append(mxcolidx)
+                sparse_columns.append(colname)
+                sparse_indices.append(mxcolidx)
                 mxcolidx += 1
             else:
-                dense_dfidx.append(dfcolidx)
-                dense_mxidx.append(mxcolidx)
+                dense_columns.append(colname)
+                dense_indices.append(mxcolidx)
                 mxcolidx += 1
 
-        # dtype not handled yet
         else:
-            ignored_cols.append((dfcolidx, colname))
+            ignored_cols.append(colname)
 
     if len(ignored_cols) > 0:
         warnings.warn(
             f"Columns {ignored_cols} were ignored. Make sure they have a valid dtype."
         )
-    if len(dense_dfidx) > 0:
+    if dense_columns:
         matrices.append(
             DenseMatrix(
-                df.iloc[:, dense_dfidx].astype(dtype),
-                column_names=df.columns[dense_dfidx],
-                term_names=df.columns[dense_dfidx],
+                df[dense_columns].astype(dtype),
+                column_names=dense_columns,
+                term_names=dense_columns,
             )
         )
-        indices.append(dense_mxidx)
+        indices.append(dense_indices)
         is_cat.append(False)
-    if len(sparse_dfcols) > 0:
-        sparse_dict = {i: v for i, v in enumerate(sparse_dfcols)}
-        full_sparse = pd.DataFrame(sparse_dict).sparse.to_coo()
+    if sparse_columns:
         matrices.append(
             SparseMatrix(
-                full_sparse,
+                sps.coo_matrix(df[sparse_columns], dtype=dtype),
                 dtype=dtype,
-                column_names=[col.name for col in sparse_dfcols],
-                term_names=[col.name for col in sparse_dfcols],
+                column_names=sparse_columns,
+                term_names=sparse_columns,
             )
         )
-        indices.append(sparse_mxidx)
+        indices.append(sparse_indices)
+        is_cat.append(False)
+
+    if cat_position == "end":
+        new_indices = []
+        for mat_indices, is_cat_ in zip(indices, is_cat):
+            if is_cat_:
+                new_indices.append(np.asarray(mat_indices) + mxcolidx)
+                mxcolidx += len(mat_indices)
+            else:
+                new_indices.append(mat_indices)
+        indices = new_indices
+
+    if len(matrices) > 1:
+        return SplitMatrix(matrices, indices)
+    elif len(matrices) == 0:
+        raise ValueError("DataFrame contained no valid column")
+    else:
+        return matrices[0]
+
+
+def from_polars(
+    df: pl.DataFrame,
+    dtype: np.dtype = np.float64,
+    sparse_threshold: float = 0.1,
+    cat_threshold: int = 4,
+    cat_position: str = "expand",
+    drop_first: bool = False,
+    categorical_format: str = "{name}[{category}]",
+    cat_missing_method: str = "fail",
+    cat_missing_name: str = "(MISSING)",
+) -> MatrixBase:
+    """
+    Transform a polars.DataFrame into an efficient SplitMatrix. For most users, this
+    will be the primary way to construct tabmat objects from their data.
+
+    Parameters
+    ----------
+    df : pl.DataFrame
+        Polars DataFrame to be converted.
+    dtype : np.dtype, default np.float64
+        dtype of all sub-matrices of the resulting SplitMatrix.
+    sparse_threshold : float, default 0.1
+        Density threshold below which numerical columns will be stored in a sparse
+        format.
+    cat_threshold : int, default 4
+        Number of levels of a categorical column under which the column will be stored
+        as sparse one-hot-encoded columns instead of a CategoricalMatrix.
+    cat_position : str {'end'|'expand'}, default 'expand'
+        Position of the categorical variable in the index. If "end", all the
+        categoricals (including the ones that did not satisfy cat_threshold)
+        will be placed at the end of the index list. If "expand", all the variables
+        will remain in the same order.
+    drop_first : bool, default False
+        If True, categorical variables will have their first category dropped.
+        This allows multiple categorical variables to be included in an
+        unregularized model. If False, all categories are included.
+    cat_missing_method: str {'fail'|'zero'|'convert'}, default 'fail'
+        How to handle missing values in categorical columns:
+        - if 'fail', raise an error if there are missing values.
+        - if 'zero', missing values will represent all-zero indicator columns.
+        - if 'convert', missing values are mapped to the ``cat_missing_name`` category.
+    cat_missing_name: str, default '(MISSING)'
+        Name of the category to which missing values will be converted if
+        ``cat_missing_method='convert'``.
+
+    Returns
+    -------
+    SplitMatrix
+    """
+    matrices: list[Union[DenseMatrix, SparseMatrix, CategoricalMatrix]] = []
+    indices: list[list[int]] = []
+    is_cat: list[bool] = []
+
+    dense_columns = []  # names of dense columns in the original DataFrame
+    dense_indices = []  # index in the new SplitMatrix
+    sparse_columns = []  # sparse columns to join together
+    sparse_indices = []  # index in the new SplitMatrix
+    ignored_cols = []
+
+    mxcolidx = 0
+
+    for coldata in df.iter_columns():
+        if isinstance(coldata.dtype, (pl.Categorical, pl.Enum)):
+            cat = CategoricalMatrix(
+                coldata,
+                drop_first=drop_first,
+                dtype=dtype,
+                column_name=coldata.name,
+                term_name=coldata.name,
+                column_name_format=categorical_format,
+                cat_missing_method=cat_missing_method,
+                cat_missing_name=cat_missing_name,
+            )
+            if len(cat.categories) < cat_threshold:
+                (
+                    X_dense_F,
+                    X_sparse,
+                    dense_idx,
+                    sparse_idx,
+                ) = _split_sparse_and_dense_parts(
+                    sps.csc_matrix(cat.tocsr(), dtype=dtype),
+                    threshold=sparse_threshold,
+                    column_names=cat.get_names("column"),
+                    term_names=cat.get_names("term"),
+                )
+                matrices.append(X_dense_F)
+                is_cat.append(True)
+                matrices.append(X_sparse)
+                is_cat.append(True)
+                if cat_position == "expand":
+                    indices.append(mxcolidx + dense_idx)
+                    indices.append(mxcolidx + sparse_idx)
+                    mxcolidx += len(dense_idx) + len(sparse_idx)
+                elif cat_position == "end":
+                    indices.append(dense_idx)
+                    indices.append(sparse_idx)
+
+            else:
+                matrices.append(cat)
+                is_cat.append(True)
+                if cat_position == "expand":
+                    indices.append(mxcolidx + np.arange(cat.shape[1]))
+                    mxcolidx += cat.shape[1]
+                elif cat_position == "end":
+                    indices.append(np.arange(cat.shape[1]))
+        elif coldata.dtype.is_numeric():
+            if (coldata != 0).mean() <= sparse_threshold:
+                sparse_columns.append(coldata.name)
+                sparse_indices.append(mxcolidx)
+                mxcolidx += 1
+            else:
+                dense_columns.append(coldata.name)
+                dense_indices.append(mxcolidx)
+                mxcolidx += 1
+
+        else:
+            ignored_cols.append(coldata.name)
+
+    if len(ignored_cols) > 0:
+        warnings.warn(
+            f"Columns {ignored_cols} were ignored. Make sure they have a valid dtype."
+        )
+    if dense_columns:
+        matrices.append(
+            DenseMatrix(
+                df[dense_columns].to_numpy().astype(dtype),
+                column_names=dense_columns,
+                term_names=dense_columns,
+            )
+        )
+        indices.append(dense_indices)
+        is_cat.append(False)
+    if sparse_columns:
+        matrices.append(
+            SparseMatrix(
+                sps.coo_matrix(df[sparse_columns], dtype=dtype),
+                dtype=dtype,
+                column_names=sparse_columns,
+                term_names=sparse_columns,
+            )
+        )
+        indices.append(sparse_indices)
         is_cat.append(False)
 
     if cat_position == "end":
diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py
index 5dc6a156..0dd27845 100644
--- a/src/tabmat/dense_matrix.py
+++ b/src/tabmat/dense_matrix.py
@@ -42,7 +42,7 @@ def __init__(self, input_array, column_names=None, term_names=None):
         elif input_array.ndim > 2:
             raise ValueError("Input array must be 1- or 2-dimensional")
 
-        self._array = np.asarray(input_array)
+        self._array = input_array
         width = self._array.shape[1]
 
         if column_names is not None:
@@ -102,7 +102,7 @@ def ndim(self):
 
     @property
     def dtype(self):
-        """Data-type of the array’s elements."""  # noqa: D401
+        """Data type of the array's elements."""  # noqa: D401
         return self._array.dtype
 
     def transpose(self):

From 7be43d02dbe6c890e7d5b007fa1947c0c927d1fd Mon Sep 17 00:00:00 2001
From: lbittarello <luca.bittarello@gmail.com>
Date: Mon, 17 Jun 2024 09:59:36 +0100
Subject: [PATCH 04/13] Tests

---
 tests/test_categorical_matrix.py |   5 +-
 tests/test_constructor.py        | 249 +++++++++++++++++++++++++++++++
 tests/test_fast_sandwich.py      |   4 +-
 tests/test_formula.py            |  10 +-
 tests/test_matrices.py           | 106 -------------
 tests/test_split_matrix.py       |  14 --
 6 files changed, 259 insertions(+), 129 deletions(-)
 create mode 100644 tests/test_constructor.py

diff --git a/tests/test_categorical_matrix.py b/tests/test_categorical_matrix.py
index 766576e1..5d74d676 100644
--- a/tests/test_categorical_matrix.py
+++ b/tests/test_categorical_matrix.py
@@ -18,11 +18,10 @@ def cat_vec(missing):
     return vec
 
 
-@pytest.mark.parametrize("vec_dtype", [np.float64, np.float32, np.int64, np.int32])
 @pytest.mark.parametrize("drop_first", [True, False], ids=["drop_first", "no_drop"])
 @pytest.mark.parametrize("missing", [True, False], ids=["missing", "no_missing"])
 @pytest.mark.parametrize("cat_missing_method", ["fail", "zero", "convert"])
-def test_recover_orig(cat_vec, vec_dtype, drop_first, missing, cat_missing_method):
+def test_recover_orig(cat_vec, drop_first, missing, cat_missing_method):
     if missing and cat_missing_method == "fail":
         with pytest.raises(
             ValueError, match="Categorical data can't have missing values"
@@ -173,7 +172,7 @@ def test_cat_missing_name(cat_missing_name):
         cat = CategoricalMatrix(
             vec, cat_missing_method="convert", cat_missing_name=cat_missing_name
         )
-        assert set(cat.cat.categories) == set(vec) - {None} | {cat_missing_name}
+        assert set(cat.categories) == set(vec) - {None} | {cat_missing_name}
 
 
 @pytest.mark.parametrize("drop_first", [True, False], ids=["drop_first", "no_drop"])
diff --git a/tests/test_constructor.py b/tests/test_constructor.py
new file mode 100644
index 00000000..0fc4b9fa
--- /dev/null
+++ b/tests/test_constructor.py
@@ -0,0 +1,249 @@
+import numpy as np
+import pandas as pd
+import polars as pl
+import pytest
+
+import tabmat as tm
+
+
+def test_pandas_to_matrix():
+    n_rows = 50
+    dense_column = np.linspace(-10, 10, num=n_rows, dtype=np.float64)
+    dense_column_with_lots_of_zeros = dense_column.copy()
+    dense_column_with_lots_of_zeros[:44] = 0.0
+    sparse_column = np.zeros(n_rows, dtype=np.float64)
+    sparse_column[0] = 1.0
+    cat_column_lowdim = np.tile(["a", "b"], n_rows // 2)
+    cat_column_highdim = np.arange(n_rows)
+
+    dense_ser = pd.Series(dense_column)
+    lowdense_ser = pd.Series(dense_column_with_lots_of_zeros)
+    sparse_ser = pd.Series(sparse_column, dtype=pd.SparseDtype("float", 0.0))
+    cat_ser_lowdim = pd.Categorical(cat_column_lowdim)
+    cat_ser_highdim = pd.Categorical(cat_column_highdim)
+
+    df = pd.DataFrame(
+        data={
+            "d": dense_ser,
+            "ds": lowdense_ser,
+            "s": sparse_ser,
+            "cl_obj": cat_ser_lowdim.astype(object),
+            "ch": cat_ser_highdim,
+        }
+    )
+
+    mat = tm.from_pandas(
+        df, dtype=np.float64, sparse_threshold=0.3, cat_threshold=4, object_as_cat=True
+    )
+
+    assert mat.shape == (n_rows, n_rows + 5)
+    assert len(mat.matrices) == 3
+    assert isinstance(mat, tm.SplitMatrix)
+
+    nb_col_by_type = {
+        tm.DenseMatrix: 3,  # includes low-dimension categorical
+        tm.SparseMatrix: 2,  # sparse column
+        tm.CategoricalMatrix: n_rows,
+    }
+    for submat in mat.matrices:
+        assert submat.shape[1] == nb_col_by_type[type(submat)]
+
+    # Prevent a regression where the column type of sparsified dense columns
+    # was being changed in place.
+    assert df["cl_obj"].dtype == object
+    assert df["ds"].dtype == np.float64
+
+
+@pytest.mark.parametrize("categorical_dtype", [pl.Categorical, pl.Enum(["a", "b"])])
+def test_polars_to_matrix(categorical_dtype):
+    n_rows = 50
+    dense_column = np.linspace(-10, 10, num=n_rows, dtype=np.float64)
+    dense_column_with_lots_of_zeros = dense_column.copy()
+    dense_column_with_lots_of_zeros[:44] = 0.0
+    sparse_column = np.zeros(n_rows, dtype=np.float64)
+    sparse_column[0] = 1.0
+    cat_column_lowdim = np.tile(["a", "b"], n_rows // 2)
+    cat_column_highdim = np.arange(n_rows).astype("str")
+
+    dense_ser = pl.Series(dense_column)
+    lowdense_ser = pl.Series(dense_column_with_lots_of_zeros)
+    sparse_ser = pl.Series(sparse_column)
+    cat_ser_lowdim = pl.Series(cat_column_lowdim, dtype=categorical_dtype)
+    cat_ser_highdim = pl.Series(cat_column_highdim, dtype=pl.Categorical)
+
+    df = pl.DataFrame(
+        data={
+            "d": dense_ser,
+            "ds": lowdense_ser,
+            "s": sparse_ser,
+            "cl": cat_ser_lowdim,
+            "ch": cat_ser_highdim,
+        }
+    )
+
+    mat = tm.from_polars(df, dtype=np.float64, sparse_threshold=0.3, cat_threshold=4)
+
+    assert mat.shape == (n_rows, n_rows + 5)
+    assert len(mat.matrices) == 3
+    assert isinstance(mat, tm.SplitMatrix)
+
+    nb_col_by_type = {
+        tm.DenseMatrix: 3,  # includes low-dimension categorical
+        tm.SparseMatrix: 2,  # sparse column
+        tm.CategoricalMatrix: n_rows,
+    }
+    for submat in mat.matrices:
+        assert submat.shape[1] == nb_col_by_type[type(submat)]
+
+    # Prevent a regression where the column type of sparsified dense columns
+    # was being changed in place.
+    assert df["cl"].dtype == categorical_dtype
+    assert df["ds"].dtype == pl.Float64
+
+
+@pytest.mark.parametrize("cat_missing_method", ["fail", "zero", "convert"])
+def test_from_pandas_missing(cat_missing_method):
+    df = pd.DataFrame({"cat": pd.Categorical([1, 2, pd.NA, 1, 2, pd.NA])})
+
+    if cat_missing_method == "fail":
+        with pytest.raises(
+            ValueError, match="Categorical data can't have missing values"
+        ):
+            tm.from_pandas(df, cat_missing_method=cat_missing_method)
+    elif cat_missing_method == "zero":
+        assert tm.from_pandas(df, cat_missing_method=cat_missing_method).shape == (6, 2)
+    elif cat_missing_method == "convert":
+        assert tm.from_pandas(df, cat_missing_method=cat_missing_method).shape == (6, 3)
+
+
+@pytest.mark.parametrize("cat_missing_method", ["fail", "zero", "convert"])
+def test_from_polars_missing(cat_missing_method):
+    df = pl.DataFrame(
+        {"cat": pl.Series(["1", "2", None, "1", "2", None], dtype=pl.Categorical)}
+    )
+
+    if cat_missing_method == "fail":
+        with pytest.raises(
+            ValueError, match="Categorical data can't have missing values"
+        ):
+            tm.from_polars(df, cat_missing_method=cat_missing_method)
+    elif cat_missing_method == "zero":
+        assert tm.from_polars(df, cat_missing_method=cat_missing_method).shape == (6, 2)
+    elif cat_missing_method == "convert":
+        assert tm.from_polars(df, cat_missing_method=cat_missing_method).shape == (6, 3)
+
+
+@pytest.mark.parametrize("prefix_sep", ["_", ": "])
+@pytest.mark.parametrize("drop_first", [True, False])
+def test_names_pandas(prefix_sep, drop_first):
+    n_rows = 50
+    dense_column = np.linspace(-10, 10, num=n_rows, dtype=np.float64)
+    dense_column_with_lots_of_zeros = dense_column.copy()
+    dense_column_with_lots_of_zeros[:44] = 0.0
+    sparse_column = np.zeros(n_rows, dtype=np.float64)
+    sparse_column[0] = 1.0
+    cat_column_lowdim = np.tile(["a", "b"], n_rows // 2)
+    cat_column_highdim = np.arange(n_rows)
+
+    dense_ser = pd.Series(dense_column)
+    lowdense_ser = pd.Series(dense_column_with_lots_of_zeros)
+    sparse_ser = pd.Series(sparse_column, dtype=pd.SparseDtype("float", 0.0))
+    cat_ser_lowdim = pd.Categorical(cat_column_lowdim)
+    cat_ser_highdim = pd.Categorical(cat_column_highdim)
+
+    df = pd.DataFrame(
+        data={
+            "d": dense_ser,
+            "cl_obj": cat_ser_lowdim.astype(object),
+            "ch": cat_ser_highdim,
+            "ds": lowdense_ser,
+            "s": sparse_ser,
+        }
+    )
+
+    categorical_format = "{name}" + prefix_sep + "{category}"
+    mat_end = tm.from_pandas(
+        df,
+        dtype=np.float64,
+        sparse_threshold=0.3,
+        cat_threshold=4,
+        object_as_cat=True,
+        cat_position="end",
+        categorical_format=categorical_format,
+        drop_first=drop_first,
+    )
+
+    expanded_df = pd.get_dummies(df, prefix_sep=prefix_sep, drop_first=drop_first)
+    assert mat_end.column_names == expanded_df.columns.tolist()
+
+    mat_expand = tm.from_pandas(
+        df,
+        dtype=np.float64,
+        sparse_threshold=0.3,
+        cat_threshold=4,
+        object_as_cat=True,
+        cat_position="expand",
+        categorical_format=categorical_format,
+        drop_first=drop_first,
+    )
+
+    unique_terms = list(dict.fromkeys(mat_expand.term_names))
+    assert unique_terms == df.columns.tolist()
+
+
+@pytest.mark.parametrize("prefix_sep", ["_", ": "])
+@pytest.mark.parametrize("drop_first", [True, False])
+def test_names_polars(prefix_sep, drop_first):
+    n_rows = 50
+    dense_column = np.linspace(-10, 10, num=n_rows, dtype=np.float64)
+    dense_column_with_lots_of_zeros = dense_column.copy()
+    dense_column_with_lots_of_zeros[:44] = 0.0
+    sparse_column = np.zeros(n_rows, dtype=np.float64)
+    sparse_column[0] = 1.0
+    cat_column_lowdim = np.tile(["a", "b"], n_rows // 2)
+    cat_column_highdim = np.arange(n_rows).astype("str")
+
+    dense_ser = pl.Series(dense_column)
+    lowdense_ser = pl.Series(dense_column_with_lots_of_zeros)
+    sparse_ser = pl.Series(sparse_column)
+    cat_ser_lowdim = pl.Series(cat_column_lowdim, dtype=pl.Categorical)
+    cat_ser_highdim = pl.Series(cat_column_highdim, dtype=pl.Categorical)
+
+    df = pl.DataFrame(
+        data={
+            "d": dense_ser,
+            "ds": lowdense_ser,
+            "s": sparse_ser,
+            "cl": cat_ser_lowdim,
+            "ch": cat_ser_highdim,
+        }
+    )
+
+    categorical_format = "{name}" + prefix_sep + "{category}"
+    mat_end = tm.from_polars(
+        df,
+        dtype=np.float64,
+        sparse_threshold=0.3,
+        cat_threshold=4,
+        cat_position="end",
+        categorical_format=categorical_format,
+        drop_first=drop_first,
+    )
+
+    expanded_df = pd.get_dummies(
+        df.to_pandas(), prefix_sep=prefix_sep, drop_first=drop_first
+    )
+    assert mat_end.column_names == list(expanded_df.columns)
+
+    mat_expand = tm.from_polars(
+        df,
+        dtype=np.float64,
+        sparse_threshold=0.3,
+        cat_threshold=4,
+        cat_position="expand",
+        categorical_format=categorical_format,
+        drop_first=drop_first,
+    )
+
+    unique_terms = list(dict.fromkeys(mat_expand.term_names))
+    assert unique_terms == list(df.columns)
diff --git a/tests/test_fast_sandwich.py b/tests/test_fast_sandwich.py
index c22047ef..3b564cd4 100644
--- a/tests/test_fast_sandwich.py
+++ b/tests/test_fast_sandwich.py
@@ -30,7 +30,7 @@ def test_fast_sandwich_sparse(dtype):
         np.testing.assert_allclose(true, out, atol=np.sqrt(np.finfo(dtype).eps))
 
 
-@pytest.mark.high_memory
+@pytest.mark.skip(reason="too heavy")
 def test_fast_sandwich_sparse_large():
     # note that 50000 * 50000 > 2^31 - 1, so this will segfault when we index
     # with 32 bit integers (see GH #160)
@@ -105,7 +105,7 @@ def simulate_matrix(nonzero_frac=0.05, shape=(100, 50), seed=0, dtype=np.float64
     return A
 
 
-@pytest.mark.high_memory
+@pytest.mark.skip(reason="too heavy")
 @pytest.mark.parametrize("order", ["C", "F"])
 def test_fast_sandwich_dense_large(order):
     # this will segfault when we index with 32 bit integers (see GH #270)
diff --git a/tests/test_formula.py b/tests/test_formula.py
index 6f558bec..5c5c5989 100644
--- a/tests/test_formula.py
+++ b/tests/test_formula.py
@@ -139,8 +139,8 @@ def test_retrieval():
                                 "cat_1[b]:cat_3[1]",
                                 "cat_1[c]:cat_3[1]",
                                 "cat_1[a]:cat_3[2]",
-                                "cat_1[c]:cat_3[2]",
                                 "cat_1[b]:cat_3[2]",
+                                "cat_1[c]:cat_3[2]",
                             ],
                         ),
                         drop_first=False,
@@ -161,7 +161,8 @@ def test_matrix_against_expectation(df, formula, expected):
         if isinstance(res, (tm.DenseMatrix, tm.SparseMatrix)):
             np.testing.assert_array_equal(res.toarray(), res.toarray())
         elif isinstance(res, tm.CategoricalMatrix):
-            assert (exp.cat == res.cat).all()
+            np.testing.assert_array_equal(exp.categories, res.categories)
+            np.testing.assert_array_equal(exp.indices, res.indices)
             assert exp.drop_first == res.drop_first
 
 
@@ -241,8 +242,8 @@ def test_matrix_against_expectation(df, formula, expected):
                                 "cat_1__b__x__cat_3__1",
                                 "cat_1__c__x__cat_3__1",
                                 "cat_1__a__x__cat_3__2",
-                                "cat_1__c__x__cat_3__2",
                                 "cat_1__b__x__cat_3__2",
+                                "cat_1__c__x__cat_3__2",
                             ],
                         ),
                         drop_first=False,
@@ -270,7 +271,8 @@ def test_matrix_against_expectation_qcl(df, formula, expected):
         if isinstance(res, (tm.DenseMatrix, tm.SparseMatrix)):
             np.testing.assert_array_equal(res.toarray(), res.toarray())
         elif isinstance(res, tm.CategoricalMatrix):
-            assert (exp.cat == res.cat).all()
+            np.testing.assert_array_equal(exp.categories, res.categories)
+            np.testing.assert_array_equal(exp.indices, res.indices)
             assert exp.drop_first == res.drop_first
 
 
diff --git a/tests/test_matrices.py b/tests/test_matrices.py
index 9d7c3686..4409ed1c 100644
--- a/tests/test_matrices.py
+++ b/tests/test_matrices.py
@@ -617,54 +617,6 @@ def test_indexing_ix_both(mat):
     np.testing.assert_array_equal(res, expected)
 
 
-def test_pandas_to_matrix():
-    n_rows = 50
-    dense_column = np.linspace(-10, 10, num=n_rows, dtype=np.float64)
-    dense_column_with_lots_of_zeros = dense_column.copy()
-    dense_column_with_lots_of_zeros[:44] = 0.0
-    sparse_column = np.zeros(n_rows, dtype=np.float64)
-    sparse_column[0] = 1.0
-    cat_column_lowdim = np.tile(["a", "b"], n_rows // 2)
-    cat_column_highdim = np.arange(n_rows)
-
-    dense_ser = pd.Series(dense_column)
-    lowdense_ser = pd.Series(dense_column_with_lots_of_zeros)
-    sparse_ser = pd.Series(sparse_column, dtype=pd.SparseDtype("float", 0.0))
-    cat_ser_lowdim = pd.Categorical(cat_column_lowdim)
-    cat_ser_highdim = pd.Categorical(cat_column_highdim)
-
-    df = pd.DataFrame(
-        data={
-            "d": dense_ser,
-            "ds": lowdense_ser,
-            "s": sparse_ser,
-            "cl_obj": cat_ser_lowdim.astype(object),
-            "ch": cat_ser_highdim,
-        }
-    )
-
-    mat = tm.from_pandas(
-        df, dtype=np.float64, sparse_threshold=0.3, cat_threshold=4, object_as_cat=True
-    )
-
-    assert mat.shape == (n_rows, n_rows + 5)
-    assert len(mat.matrices) == 3
-    assert isinstance(mat, tm.SplitMatrix)
-
-    nb_col_by_type = {
-        tm.DenseMatrix: 3,  # includes low-dimension categorical
-        tm.SparseMatrix: 2,  # sparse column
-        tm.CategoricalMatrix: n_rows,
-    }
-    for submat in mat.matrices:
-        assert submat.shape[1] == nb_col_by_type[type(submat)]
-
-    # Prevent a regression where the column type of sparsified dense columns
-    # was being changed in place.
-    assert df["cl_obj"].dtype == object
-    assert df["ds"].dtype == np.float64
-
-
 @pytest.mark.parametrize("mat", get_all_matrix_base_subclass_mats())
 def test_split_matrix_creation(mat):
     sm = tm.SplitMatrix(matrices=[mat, mat])
@@ -839,61 +791,3 @@ def test_combine_names(mat_1, mat_2):
 
     assert combined.column_names == mat_1.column_names + mat_2.column_names
     assert combined.term_names == mat_1.term_names + mat_2.term_names
-
-
-@pytest.mark.parametrize("prefix_sep", ["_", ": "])
-@pytest.mark.parametrize("drop_first", [True, False])
-def test_names_pandas(prefix_sep, drop_first):
-    n_rows = 50
-    dense_column = np.linspace(-10, 10, num=n_rows, dtype=np.float64)
-    dense_column_with_lots_of_zeros = dense_column.copy()
-    dense_column_with_lots_of_zeros[:44] = 0.0
-    sparse_column = np.zeros(n_rows, dtype=np.float64)
-    sparse_column[0] = 1.0
-    cat_column_lowdim = np.tile(["a", "b"], n_rows // 2)
-    cat_column_highdim = np.arange(n_rows)
-
-    dense_ser = pd.Series(dense_column)
-    lowdense_ser = pd.Series(dense_column_with_lots_of_zeros)
-    sparse_ser = pd.Series(sparse_column, dtype=pd.SparseDtype("float", 0.0))
-    cat_ser_lowdim = pd.Categorical(cat_column_lowdim)
-    cat_ser_highdim = pd.Categorical(cat_column_highdim)
-
-    df = pd.DataFrame(
-        data={
-            "d": dense_ser,
-            "cl_obj": cat_ser_lowdim.astype(object),
-            "ch": cat_ser_highdim,
-            "ds": lowdense_ser,
-            "s": sparse_ser,
-        }
-    )
-
-    categorical_format = "{name}" + prefix_sep + "{category}"
-    mat_end = tm.from_pandas(
-        df,
-        dtype=np.float64,
-        sparse_threshold=0.3,
-        cat_threshold=4,
-        object_as_cat=True,
-        cat_position="end",
-        categorical_format=categorical_format,
-        drop_first=drop_first,
-    )
-
-    expanded_df = pd.get_dummies(df, prefix_sep=prefix_sep, drop_first=drop_first)
-    assert mat_end.column_names == expanded_df.columns.tolist()
-
-    mat_expand = tm.from_pandas(
-        df,
-        dtype=np.float64,
-        sparse_threshold=0.3,
-        cat_threshold=4,
-        object_as_cat=True,
-        cat_position="expand",
-        categorical_format=categorical_format,
-        drop_first=drop_first,
-    )
-
-    unique_terms = list(dict.fromkeys(mat_expand.term_names))
-    assert unique_terms == df.columns.tolist()
diff --git a/tests/test_split_matrix.py b/tests/test_split_matrix.py
index 7b1b3ed2..da8d2452 100644
--- a/tests/test_split_matrix.py
+++ b/tests/test_split_matrix.py
@@ -306,17 +306,3 @@ def test_matvec(n_rows):
     )
     mat = from_pandas(X, cat_threshold=0)
     np.testing.assert_allclose(mat.matvec(np.array(mat.shape[1] * [1])), n_cols)
-
-
-@pytest.mark.parametrize("cat_missing_method", ["fail", "zero", "convert"])
-def test_from_pandas_missing(cat_missing_method):
-    df = pd.DataFrame({"cat": pd.Categorical([1, 2, pd.NA, 1, 2, pd.NA])})
-    if cat_missing_method == "fail":
-        with pytest.raises(
-            ValueError, match="Categorical data can't have missing values"
-        ):
-            from_pandas(df, cat_missing_method=cat_missing_method)
-    elif cat_missing_method == "zero":
-        assert from_pandas(df, cat_missing_method=cat_missing_method).shape == (6, 2)
-    elif cat_missing_method == "convert":
-        assert from_pandas(df, cat_missing_method=cat_missing_method).shape == (6, 3)

From f80a9770d9a8d24be94e70064ceb6e3d7e9300c0 Mon Sep 17 00:00:00 2001
From: lbittarello <luca.bittarello@gmail.com>
Date: Mon, 17 Jun 2024 10:03:39 +0100
Subject: [PATCH 05/13] Change log

---
 CHANGELOG.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index d8d463fc..c94cb673 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -10,6 +10,10 @@ Changelog
 4.0.1 - 2024-06-25
 ------------------
 
+**New features:**
+
+- Added a new function, :func:`tabmat.from_polars`, to convert a :class:`polars.DataFrame` into a :class:`tabmat.SplitMatrix`.
+
 **Other changes:**
 
 - Removed reference to the ``.A`` attribute and replaced it with ``.toarray()``.

From fd872c182f750f30dbf6a75df338fc896c510dae Mon Sep 17 00:00:00 2001
From: lbittarello <luca.bittarello@gmail.com>
Date: Mon, 17 Jun 2024 10:10:27 +0100
Subject: [PATCH 06/13] Fix changelog heading and add pyarrow to environments

---
 CHANGELOG.rst       | 2 +-
 environment-win.yml | 1 +
 environment.yml     | 3 ++-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index c94cb673..6c41748d 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -10,7 +10,7 @@ Changelog
 4.0.1 - 2024-06-25
 ------------------
 
-**New features:**
+**New feature:**
 
 - Added a new function, :func:`tabmat.from_polars`, to convert a :class:`polars.DataFrame` into a :class:`tabmat.SplitMatrix`.
 
diff --git a/environment-win.yml b/environment-win.yml
index 712f0b46..82b88bd1 100644
--- a/environment-win.yml
+++ b/environment-win.yml
@@ -8,6 +8,7 @@ dependencies:
   - numpy
   - pandas
   - polars
+  - pyarrow  # exclusively for polars tests
   - scipy
 
   # development tools
diff --git a/environment.yml b/environment.yml
index c8184764..196c5460 100644
--- a/environment.yml
+++ b/environment.yml
@@ -5,8 +5,9 @@ channels:
 dependencies:
   - formulaic>=0.6
   - numpy
-  - polars
   - pandas
+  - polars
+  - pyarrow  # exclusively for polars tests
   - scipy
 
   # development tools

From f1728f3c28cb8b4aa4f06e5c23fab0711569d689 Mon Sep 17 00:00:00 2001
From: lbittarello <luca.bittarello@gmail.com>
Date: Mon, 17 Jun 2024 13:56:37 +0100
Subject: [PATCH 07/13] Add polars to install_requires

---
 CHANGELOG.rst | 2 +-
 setup.py      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 6c41748d..4befa7e5 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -17,7 +17,7 @@ Changelog
 **Other changes:**
 
 - Removed reference to the ``.A`` attribute and replaced it with ``.toarray()``.
-- Add support between formulaic and pandas 3.0
+- Add support between formulaic and pandas 3.0.
 - Support pypi release for numpy 2.0
 
 4.0.0 - 2024-04-23
diff --git a/setup.py b/setup.py
index 3792c27a..35e34e0c 100644
--- a/setup.py
+++ b/setup.py
@@ -157,7 +157,7 @@
     ],
     package_dir={"": "src"},
     packages=find_packages(where="src"),
-    install_requires=["numpy", "pandas", "scipy", "formulaic>=0.6"],
+    install_requires=["numpy", "pandas", "polars", "scipy", "formulaic>=0.6"],
     python_requires=">=3.9",
     ext_modules=cythonize(
         ext_modules,

From 530b519b1ef68877156227a485829125850a6c4d Mon Sep 17 00:00:00 2001
From: lbittarello <luca.bittarello@gmail.com>
Date: Tue, 18 Jun 2024 12:22:19 +0100
Subject: [PATCH 08/13] Extract shared constructor helpers

---
 src/tabmat/constructor.py | 80 +++++++++++++++++----------------------
 1 file changed, 34 insertions(+), 46 deletions(-)

diff --git a/src/tabmat/constructor.py b/src/tabmat/constructor.py
index a337a67e..cce46433 100644
--- a/src/tabmat/constructor.py
+++ b/src/tabmat/constructor.py
@@ -151,36 +151,16 @@ def from_pandas(
             f"Columns {ignored_cols} were ignored. Make sure they have a valid dtype."
         )
     if dense_columns:
-        matrices.append(
-            DenseMatrix(
-                df[dense_columns].astype(dtype),
-                column_names=dense_columns,
-                term_names=dense_columns,
-            )
-        )
+        matrices.append(_dense_matrix(df, dense_columns, dtype))
         indices.append(dense_indices)
         is_cat.append(False)
     if sparse_columns:
-        matrices.append(
-            SparseMatrix(
-                sps.coo_matrix(df[sparse_columns], dtype=dtype),
-                dtype=dtype,
-                column_names=sparse_columns,
-                term_names=sparse_columns,
-            )
-        )
+        matrices.append(_sparse_matrix(df, sparse_columns, dtype))
         indices.append(sparse_indices)
         is_cat.append(False)
 
     if cat_position == "end":
-        new_indices = []
-        for mat_indices, is_cat_ in zip(indices, is_cat):
-            if is_cat_:
-                new_indices.append(np.asarray(mat_indices) + mxcolidx)
-                mxcolidx += len(mat_indices)
-            else:
-                new_indices.append(mat_indices)
-        indices = new_indices
+        indices = _reindex_cat(indices, is_cat, mxcolidx)
 
     if len(matrices) > 1:
         return SplitMatrix(matrices, indices)
@@ -313,36 +293,16 @@ def from_polars(
             f"Columns {ignored_cols} were ignored. Make sure they have a valid dtype."
         )
     if dense_columns:
-        matrices.append(
-            DenseMatrix(
-                df[dense_columns].to_numpy().astype(dtype),
-                column_names=dense_columns,
-                term_names=dense_columns,
-            )
-        )
+        matrices.append(_dense_matrix(df, dense_columns, dtype))
         indices.append(dense_indices)
         is_cat.append(False)
     if sparse_columns:
-        matrices.append(
-            SparseMatrix(
-                sps.coo_matrix(df[sparse_columns], dtype=dtype),
-                dtype=dtype,
-                column_names=sparse_columns,
-                term_names=sparse_columns,
-            )
-        )
+        matrices.append(_sparse_matrix(df, sparse_columns, dtype))
         indices.append(sparse_indices)
         is_cat.append(False)
 
     if cat_position == "end":
-        new_indices = []
-        for mat_indices, is_cat_ in zip(indices, is_cat):
-            if is_cat_:
-                new_indices.append(np.asarray(mat_indices) + mxcolidx)
-                mxcolidx += len(mat_indices)
-            else:
-                new_indices.append(mat_indices)
-        indices = new_indices
+        indices = _reindex_cat(indices, is_cat, mxcolidx)
 
     if len(matrices) > 1:
         return SplitMatrix(matrices, indices)
@@ -352,6 +312,34 @@ def from_polars(
         return matrices[0]
 
 
+def _dense_matrix(df, dense_columns, dtype):
+    return DenseMatrix(
+        df[dense_columns].to_numpy().astype(dtype),
+        column_names=dense_columns,
+        term_names=dense_columns,
+    )
+
+
+def _reindex_cat(indices, is_cat, mxcolidx):
+    new_indices = []
+    for mat_indices, is_cat_ in zip(indices, is_cat):
+        if is_cat_:
+            new_indices.append(np.asarray(mat_indices) + mxcolidx)
+            mxcolidx = mxcolidx + len(mat_indices)
+        else:
+            new_indices.append(mat_indices)
+    return new_indices
+
+
+def _sparse_matrix(df, sparse_columns, dtype):
+    return SparseMatrix(
+        sps.coo_matrix(df[sparse_columns], dtype=dtype),
+        dtype=dtype,
+        column_names=sparse_columns,
+        term_names=sparse_columns,
+    )
+
+
 def from_csc(mat: sps.csc_matrix, threshold=0.1, column_names=None, term_names=None):
     """
     Convert a CSC-format sparse matrix into a ``SplitMatrix``.

From 90e43cbdcb7b6d2d27fd624191a27e827e83c1cf Mon Sep 17 00:00:00 2001
From: lbittarello <luca.bittarello@gmail.com>
Date: Tue, 18 Jun 2024 12:37:34 +0100
Subject: [PATCH 09/13] Simplify tests

---
 tests/test_constructor.py | 147 ++++++++++++--------------------------
 1 file changed, 46 insertions(+), 101 deletions(-)

diff --git a/tests/test_constructor.py b/tests/test_constructor.py
index 0fc4b9fa..ea5c6c4c 100644
--- a/tests/test_constructor.py
+++ b/tests/test_constructor.py
@@ -5,93 +5,84 @@
 
 import tabmat as tm
 
+N_ROWS = 50
 
-def test_pandas_to_matrix():
-    n_rows = 50
-    dense_column = np.linspace(-10, 10, num=n_rows, dtype=np.float64)
+
+def construct_data(backend):
+    dense_column = np.linspace(-10, 10, num=N_ROWS, dtype=np.float64)
     dense_column_with_lots_of_zeros = dense_column.copy()
     dense_column_with_lots_of_zeros[:44] = 0.0
-    sparse_column = np.zeros(n_rows, dtype=np.float64)
+    sparse_column = np.zeros(N_ROWS, dtype=np.float64)
     sparse_column[0] = 1.0
-    cat_column_lowdim = np.tile(["a", "b"], n_rows // 2)
-    cat_column_highdim = np.arange(n_rows)
-
-    dense_ser = pd.Series(dense_column)
-    lowdense_ser = pd.Series(dense_column_with_lots_of_zeros)
-    sparse_ser = pd.Series(sparse_column, dtype=pd.SparseDtype("float", 0.0))
-    cat_ser_lowdim = pd.Categorical(cat_column_lowdim)
-    cat_ser_highdim = pd.Categorical(cat_column_highdim)
-
-    df = pd.DataFrame(
-        data={
-            "d": dense_ser,
-            "ds": lowdense_ser,
-            "s": sparse_ser,
-            "cl_obj": cat_ser_lowdim.astype(object),
-            "ch": cat_ser_highdim,
-        }
-    )
+    cat_column_lowdim = np.tile(["a", "b"], N_ROWS // 2)
+    cat_column_highdim = np.arange(N_ROWS)
+
+    data = {
+        "d": dense_column,
+        "ds": dense_column_with_lots_of_zeros,
+        "s": sparse_column,
+        "cl": cat_column_lowdim,
+        "ch": cat_column_highdim,
+    }
+
+    if backend == "pandas":
+        data["s"] = pd.Series(data["s"], dtype=pd.SparseDtype("float", 0.0))
+        data["cl"] = cat_column_lowdim.astype("object")
+        data["ch"] = pd.Categorical(cat_column_highdim)
+
+        return pd.DataFrame(data)
+
+    if backend == "polars":
+        data["cl"] = pl.Series(cat_column_lowdim, dtype=pl.Categorical)
+        data["ch"] = pl.Series(cat_column_highdim.astype("str"), dtype=pl.Categorical)
+
+        return pl.DataFrame(data)
+
+    raise ValueError
+
+
+def test_pandas_to_matrix():
+    df = construct_data("pandas")
 
     mat = tm.from_pandas(
         df, dtype=np.float64, sparse_threshold=0.3, cat_threshold=4, object_as_cat=True
     )
 
-    assert mat.shape == (n_rows, n_rows + 5)
+    assert mat.shape == (N_ROWS, N_ROWS + 5)
     assert len(mat.matrices) == 3
     assert isinstance(mat, tm.SplitMatrix)
 
     nb_col_by_type = {
         tm.DenseMatrix: 3,  # includes low-dimension categorical
         tm.SparseMatrix: 2,  # sparse column
-        tm.CategoricalMatrix: n_rows,
+        tm.CategoricalMatrix: N_ROWS,
     }
+
     for submat in mat.matrices:
         assert submat.shape[1] == nb_col_by_type[type(submat)]
 
     # Prevent a regression where the column type of sparsified dense columns
     # was being changed in place.
-    assert df["cl_obj"].dtype == object
+    assert df["cl"].dtype == object
     assert df["ds"].dtype == np.float64
 
 
 @pytest.mark.parametrize("categorical_dtype", [pl.Categorical, pl.Enum(["a", "b"])])
 def test_polars_to_matrix(categorical_dtype):
-    n_rows = 50
-    dense_column = np.linspace(-10, 10, num=n_rows, dtype=np.float64)
-    dense_column_with_lots_of_zeros = dense_column.copy()
-    dense_column_with_lots_of_zeros[:44] = 0.0
-    sparse_column = np.zeros(n_rows, dtype=np.float64)
-    sparse_column[0] = 1.0
-    cat_column_lowdim = np.tile(["a", "b"], n_rows // 2)
-    cat_column_highdim = np.arange(n_rows).astype("str")
-
-    dense_ser = pl.Series(dense_column)
-    lowdense_ser = pl.Series(dense_column_with_lots_of_zeros)
-    sparse_ser = pl.Series(sparse_column)
-    cat_ser_lowdim = pl.Series(cat_column_lowdim, dtype=categorical_dtype)
-    cat_ser_highdim = pl.Series(cat_column_highdim, dtype=pl.Categorical)
-
-    df = pl.DataFrame(
-        data={
-            "d": dense_ser,
-            "ds": lowdense_ser,
-            "s": sparse_ser,
-            "cl": cat_ser_lowdim,
-            "ch": cat_ser_highdim,
-        }
-    )
+    df = construct_data("polars").with_columns(cl=pl.col("cl").cast(categorical_dtype))
 
     mat = tm.from_polars(df, dtype=np.float64, sparse_threshold=0.3, cat_threshold=4)
 
-    assert mat.shape == (n_rows, n_rows + 5)
+    assert mat.shape == (N_ROWS, N_ROWS + 5)
     assert len(mat.matrices) == 3
     assert isinstance(mat, tm.SplitMatrix)
 
     nb_col_by_type = {
         tm.DenseMatrix: 3,  # includes low-dimension categorical
         tm.SparseMatrix: 2,  # sparse column
-        tm.CategoricalMatrix: n_rows,
+        tm.CategoricalMatrix: N_ROWS,
     }
+
     for submat in mat.matrices:
         assert submat.shape[1] == nb_col_by_type[type(submat)]
 
@@ -136,32 +127,9 @@ def test_from_polars_missing(cat_missing_method):
 @pytest.mark.parametrize("prefix_sep", ["_", ": "])
 @pytest.mark.parametrize("drop_first", [True, False])
 def test_names_pandas(prefix_sep, drop_first):
-    n_rows = 50
-    dense_column = np.linspace(-10, 10, num=n_rows, dtype=np.float64)
-    dense_column_with_lots_of_zeros = dense_column.copy()
-    dense_column_with_lots_of_zeros[:44] = 0.0
-    sparse_column = np.zeros(n_rows, dtype=np.float64)
-    sparse_column[0] = 1.0
-    cat_column_lowdim = np.tile(["a", "b"], n_rows // 2)
-    cat_column_highdim = np.arange(n_rows)
-
-    dense_ser = pd.Series(dense_column)
-    lowdense_ser = pd.Series(dense_column_with_lots_of_zeros)
-    sparse_ser = pd.Series(sparse_column, dtype=pd.SparseDtype("float", 0.0))
-    cat_ser_lowdim = pd.Categorical(cat_column_lowdim)
-    cat_ser_highdim = pd.Categorical(cat_column_highdim)
-
-    df = pd.DataFrame(
-        data={
-            "d": dense_ser,
-            "cl_obj": cat_ser_lowdim.astype(object),
-            "ch": cat_ser_highdim,
-            "ds": lowdense_ser,
-            "s": sparse_ser,
-        }
-    )
-
+    df = construct_data("pandas")
     categorical_format = "{name}" + prefix_sep + "{category}"
+
     mat_end = tm.from_pandas(
         df,
         dtype=np.float64,
@@ -194,32 +162,9 @@ def test_names_pandas(prefix_sep, drop_first):
 @pytest.mark.parametrize("prefix_sep", ["_", ": "])
 @pytest.mark.parametrize("drop_first", [True, False])
 def test_names_polars(prefix_sep, drop_first):
-    n_rows = 50
-    dense_column = np.linspace(-10, 10, num=n_rows, dtype=np.float64)
-    dense_column_with_lots_of_zeros = dense_column.copy()
-    dense_column_with_lots_of_zeros[:44] = 0.0
-    sparse_column = np.zeros(n_rows, dtype=np.float64)
-    sparse_column[0] = 1.0
-    cat_column_lowdim = np.tile(["a", "b"], n_rows // 2)
-    cat_column_highdim = np.arange(n_rows).astype("str")
-
-    dense_ser = pl.Series(dense_column)
-    lowdense_ser = pl.Series(dense_column_with_lots_of_zeros)
-    sparse_ser = pl.Series(sparse_column)
-    cat_ser_lowdim = pl.Series(cat_column_lowdim, dtype=pl.Categorical)
-    cat_ser_highdim = pl.Series(cat_column_highdim, dtype=pl.Categorical)
-
-    df = pl.DataFrame(
-        data={
-            "d": dense_ser,
-            "ds": lowdense_ser,
-            "s": sparse_ser,
-            "cl": cat_ser_lowdim,
-            "ch": cat_ser_highdim,
-        }
-    )
-
+    df = construct_data("polars")
     categorical_format = "{name}" + prefix_sep + "{category}"
+
     mat_end = tm.from_polars(
         df,
         dtype=np.float64,

From 9100ddcea99263e77bf6444a205dc502a376fe9b Mon Sep 17 00:00:00 2001
From: lbittarello <luca.bittarello@gmail.com>
Date: Tue, 18 Jun 2024 14:58:21 +0100
Subject: [PATCH 10/13] Make pandas and polars imports optional

---
 conda.recipe/meta.yaml           |  2 --
 setup.py                         |  2 +-
 src/tabmat/categorical_matrix.py | 56 ++++++++++++++++++++++++++------
 src/tabmat/constructor.py        | 15 +++++----
 4 files changed, 55 insertions(+), 20 deletions(-)

diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml
index 3653bfd0..7636b11b 100644
--- a/conda.recipe/meta.yaml
+++ b/conda.recipe/meta.yaml
@@ -37,8 +37,6 @@ requirements:
     - python
     - {{ pin_compatible('numpy') }}
     - formulaic>=0.6
-    - pandas
-    - polars
     - scipy
 
 test:
diff --git a/setup.py b/setup.py
index 35e34e0c..324d5f70 100644
--- a/setup.py
+++ b/setup.py
@@ -157,7 +157,7 @@
     ],
     package_dir={"": "src"},
     packages=find_packages(where="src"),
-    install_requires=["numpy", "pandas", "polars", "scipy", "formulaic>=0.6"],
+    install_requires=["formulaic>=0.6", "numpy", "pandas", "polars", "scipy"],
     python_requires=">=3.9",
     ext_modules=cythonize(
         ext_modules,
diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py
index 83e62b06..cb149de7 100644
--- a/src/tabmat/categorical_matrix.py
+++ b/src/tabmat/categorical_matrix.py
@@ -162,6 +162,7 @@ def matvec(mat, vec):
 
 """
 
+import importlib.util
 import re
 import warnings
 from typing import Optional, Union
@@ -194,6 +195,20 @@ def matvec(mat, vec):
     setup_restrictions,
 )
 
+if importlib.util.find_spec("pandas"):
+    import pandas as pd
+if importlib.util.find_spec("polars"):
+    import polars as pl
+
+
+class _Categorical:
+    """This class helps us avoid copies while subsetting."""
+
+    def __init__(self, indices, categories, input_type):
+        self.indices = indices
+        self.categories = categories
+        self.input_type = input_type
+
 
 def _is_indexer_full_length(full_length: int, indexer: Union[slice, np.ndarray]):
     if isinstance(indexer, np.ndarray):
@@ -206,6 +221,18 @@ def _is_indexer_full_length(full_length: int, indexer: Union[slice, np.ndarray])
         return len(range(*indexer.indices(full_length))) == full_length
 
 
+def _is_pandas(x) -> bool:
+    if importlib.util.find_spec("pandas"):
+        return isinstance(x, (pd.Categorical, pd.CategoricalDtype))
+    return False
+
+
+def _is_polars(x) -> bool:
+    if importlib.util.find_spec("polars"):
+        return isinstance(x, (pl.Series, pl.Categorical, pl.Enum))
+    return False
+
+
 def _row_col_indexing(
     arr: np.ndarray, rows: Optional[np.ndarray], cols: Optional[np.ndarray]
 ) -> np.ndarray:
@@ -258,7 +285,7 @@ class CategoricalMatrix(MatrixBase):
 
     def __init__(
         self,
-        cat_vec: Union[list, np.ndarray, pd.Categorical, pd.Series, pl.Series],
+        cat_vec,
         drop_first: bool = False,
         dtype: np.dtype = np.float64,
         column_name: Optional[str] = None,
@@ -273,20 +300,25 @@ def __init__(
                 f" got {cat_missing_method}."
             )
 
+        self._input_type = cat_vec.dtype
         self._missing_method = cat_missing_method
         self._missing_category = cat_missing_name
 
         if not isinstance(cat_vec, (pd.Categorical, pl.Series, pd.Series)):
             cat_vec = np.asanyarray(cat_vec)
 
-        if isinstance(cat_vec, pd.Categorical):
+        if isinstance(cat_vec, _Categorical):
+            indices = cat_vec.indices
+            self.categories = cat_vec.categories
+            self._input_type = cat_vec.input_type
+        elif _is_pandas(cat_vec):
             self.categories = cat_vec.categories.to_numpy()
             indices = cat_vec.codes
-        elif isinstance(cat_vec.dtype, pd.CategoricalDtype):
+        elif _is_pandas(cat_vec.dtype):
             self.categories = cat_vec.cat.categories.to_numpy()
             indices = cat_vec.cat.codes.to_numpy()
-        elif isinstance(cat_vec, pl.Series):
-            if not isinstance(cat_vec.dtype, (pl.Categorical, pl.Enum)):
+        elif _is_polars(cat_vec):
+            if not _is_polars(cat_vec.dtype):
                 cat_vec = cat_vec.cast(pl.Categorical)
             self.categories = cat_vec.cat.get_categories().to_numpy()
             indices = cat_vec.to_physical().fill_null(-1).to_numpy()
@@ -321,7 +353,7 @@ def __init__(
             self._has_missings = False
 
         self.drop_first = drop_first
-        self.indices = indices.astype(np.int32)
+        self.indices = indices.astype(np.int32, copy=False)
         self.shape = (len(self.indices), len(self.categories) - int(drop_first))
         self.x_csc = None
         self.dtype = np.dtype(dtype)
@@ -338,7 +370,7 @@ def __init__(
 
     @property
     def cat(self):
-        """Return a pandas array with same data as what was initially fed to __init__.
+        """Return a pandas Categorical or polars Series with the data fed to __init__.
 
         This property is available for backward compatibility.
         """
@@ -346,6 +378,12 @@ def cat(self):
             "This property will be removed in the next major release.",
             category=DeprecationWarning,
         )
+
+        if _is_polars(self._input_type):
+            out = self.categories[self.indices].astype("object", copy=False)
+            out = np.where(self.indices < 0, None, out)
+            return pl.Series(out, dtype=pl.Enum(self.categories))
+
         return pd.Categorical.from_codes(self.indices, categories=self.categories)
 
     def recover_orig(self) -> np.ndarray:
@@ -640,9 +678,7 @@ def __getitem__(self, item):
             if isinstance(row, np.ndarray):
                 row = row.ravel()
             return CategoricalMatrix(
-                pd.Categorical.from_codes(
-                    self.indices[row], categories=self.categories
-                ),
+                _Categorical(self.indices[row], self.categories, self._input_type),
                 drop_first=self.drop_first,
                 dtype=self.dtype,
                 column_name=self._colname,
diff --git a/src/tabmat/constructor.py b/src/tabmat/constructor.py
index cce46433..7011e181 100644
--- a/src/tabmat/constructor.py
+++ b/src/tabmat/constructor.py
@@ -4,13 +4,10 @@
 from typing import Any, Optional, Union
 
 import numpy as np
-import pandas as pd
-import polars as pl
 from formulaic import Formula, ModelSpec
 from formulaic.materializers.types import NAAction
 from formulaic.parser import DefaultFormulaParser
 from formulaic.utils.layered_mapping import LayeredMapping
-from pandas.api.types import is_numeric_dtype
 from scipy import sparse as sps
 
 from .categorical_matrix import CategoricalMatrix
@@ -23,7 +20,7 @@
 
 
 def from_pandas(
-    df: pd.DataFrame,
+    df,
     dtype: np.dtype = np.float64,
     sparse_threshold: float = 0.1,
     cat_threshold: int = 4,
@@ -75,6 +72,8 @@ def from_pandas(
     -------
     SplitMatrix
     """
+    import pandas as pd
+
     matrices: list[Union[DenseMatrix, SparseMatrix, CategoricalMatrix]] = []
     indices: list[list[int]] = []
     is_cat: list[bool] = []
@@ -133,7 +132,7 @@ def from_pandas(
                     mxcolidx += cat.shape[1]
                 elif cat_position == "end":
                     indices.append(np.arange(cat.shape[1]))
-        elif is_numeric_dtype(coldata):
+        elif pd.api.types.is_numeric_dtype(coldata):
             if (coldata != 0).mean() <= sparse_threshold:
                 sparse_columns.append(colname)
                 sparse_indices.append(mxcolidx)
@@ -171,7 +170,7 @@ def from_pandas(
 
 
 def from_polars(
-    df: pl.DataFrame,
+    df,
     dtype: np.dtype = np.float64,
     sparse_threshold: float = 0.1,
     cat_threshold: int = 4,
@@ -219,6 +218,8 @@ def from_polars(
     -------
     SplitMatrix
     """
+    import polars as pl
+
     matrices: list[Union[DenseMatrix, SparseMatrix, CategoricalMatrix]] = []
     indices: list[list[int]] = []
     is_cat: list[bool] = []
@@ -353,7 +354,7 @@ def from_csc(mat: sps.csc_matrix, threshold=0.1, column_names=None, term_names=N
 
 def from_formula(
     formula: Union[str, Formula],
-    data: pd.DataFrame,
+    data,
     ensure_full_rank: bool = False,
     na_action: Union[str, NAAction] = NAAction.IGNORE,
     dtype: np.dtype = np.float64,

From 7b2129603617e06844275d2575731b2d18d6efa2 Mon Sep 17 00:00:00 2001
From: lbittarello <luca.bittarello@gmail.com>
Date: Tue, 18 Jun 2024 15:07:43 +0100
Subject: [PATCH 11/13] Patch

---
 setup.py                         |  2 +-
 src/tabmat/categorical_matrix.py | 22 ++++++++++------------
 2 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/setup.py b/setup.py
index 324d5f70..10a0a807 100644
--- a/setup.py
+++ b/setup.py
@@ -157,7 +157,7 @@
     ],
     package_dir={"": "src"},
     packages=find_packages(where="src"),
-    install_requires=["formulaic>=0.6", "numpy", "pandas", "polars", "scipy"],
+    install_requires=["formulaic>=0.6", "numpy", "scipy"],
     python_requires=">=3.9",
     ext_modules=cythonize(
         ext_modules,
diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py
index cb149de7..684704e9 100644
--- a/src/tabmat/categorical_matrix.py
+++ b/src/tabmat/categorical_matrix.py
@@ -168,8 +168,6 @@ def matvec(mat, vec):
 from typing import Optional, Union
 
 import numpy as np
-import pandas as pd
-import polars as pl
 from scipy import sparse as sps
 
 from .dense_matrix import DenseMatrix
@@ -204,10 +202,10 @@ def matvec(mat, vec):
 class _Categorical:
     """This class helps us avoid copies while subsetting."""
 
-    def __init__(self, indices, categories, input_type):
+    def __init__(self, indices, categories, dtype):
         self.indices = indices
         self.categories = categories
-        self.input_type = input_type
+        self.dtype = dtype
 
 
 def _is_indexer_full_length(full_length: int, indexer: Union[slice, np.ndarray]):
@@ -294,23 +292,23 @@ def __init__(
         cat_missing_method: str = "fail",
         cat_missing_name: str = "(MISSING)",
     ):
-        if cat_missing_method not in ["fail", "zero", "convert"]:
+        if cat_missing_method not in {"fail", "zero", "convert"}:
             raise ValueError(
                 "cat_missing_method must be one of 'fail' 'zero' or 'convert'; "
                 f" got {cat_missing_method}."
             )
 
-        self._input_type = cat_vec.dtype
+        if not hasattr(cat_vec, "dtype"):
+            cat_vec = np.array(cat_vec)  # avoid errors in pd.factorize
+
+        self._input_dtype = cat_vec.dtype
         self._missing_method = cat_missing_method
         self._missing_category = cat_missing_name
 
-        if not isinstance(cat_vec, (pd.Categorical, pl.Series, pd.Series)):
-            cat_vec = np.asanyarray(cat_vec)
-
         if isinstance(cat_vec, _Categorical):
             indices = cat_vec.indices
             self.categories = cat_vec.categories
-            self._input_type = cat_vec.input_type
+            self._input_dtype = cat_vec.dtype
         elif _is_pandas(cat_vec):
             self.categories = cat_vec.categories.to_numpy()
             indices = cat_vec.codes
@@ -379,7 +377,7 @@ def cat(self):
             category=DeprecationWarning,
         )
 
-        if _is_polars(self._input_type):
+        if _is_polars(self._input_dtype):
             out = self.categories[self.indices].astype("object", copy=False)
             out = np.where(self.indices < 0, None, out)
             return pl.Series(out, dtype=pl.Enum(self.categories))
@@ -678,7 +676,7 @@ def __getitem__(self, item):
             if isinstance(row, np.ndarray):
                 row = row.ravel()
             return CategoricalMatrix(
-                _Categorical(self.indices[row], self.categories, self._input_type),
+                _Categorical(self.indices[row], self.categories, self._input_dtype),
                 drop_first=self.drop_first,
                 dtype=self.dtype,
                 column_name=self._colname,

From e2d059e7966924af6a4f69de349385cd1033072d Mon Sep 17 00:00:00 2001
From: lbittarello <luca.bittarello@gmail.com>
Date: Wed, 19 Jun 2024 15:29:30 +0100
Subject: [PATCH 12/13] Docstrings [skip ci]

---
 src/tabmat/constructor.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/tabmat/constructor.py b/src/tabmat/constructor.py
index 7011e181..c7e82f0e 100644
--- a/src/tabmat/constructor.py
+++ b/src/tabmat/constructor.py
@@ -32,13 +32,12 @@ def from_pandas(
     cat_missing_name: str = "(MISSING)",
 ) -> MatrixBase:
     """
-    Transform a pandas.DataFrame into an efficient SplitMatrix. For most users, this
-    will be the primary way to construct tabmat objects from their data.
+    Transform a pandas.DataFrame into an efficient SplitMatrix.
 
     Parameters
     ----------
     df : pd.DataFrame
-        pandas DataFrame to be converted.
+        pandas DataFrame to convert.
     dtype : np.dtype, default np.float64
         dtype of all sub-matrices of the resulting SplitMatrix.
     sparse_threshold : float, default 0.1
@@ -181,13 +180,12 @@ def from_polars(
     cat_missing_name: str = "(MISSING)",
 ) -> MatrixBase:
     """
-    Transform a polars.DataFrame into an efficient SplitMatrix. For most users, this
-    will be the primary way to construct tabmat objects from their data.
+    Transform a polars.DataFrame into an efficient SplitMatrix.
 
     Parameters
     ----------
     df : pl.DataFrame
-        Polars DataFrame to be converted.
+        Polars DataFrame to convert.
     dtype : np.dtype, default np.float64
         dtype of all sub-matrices of the resulting SplitMatrix.
     sparse_threshold : float, default 0.1

From f83e918b60af44614933099da9b34ecf0b1c5bcc Mon Sep 17 00:00:00 2001
From: lbittarello <luca.bittarello@gmail.com>
Date: Tue, 25 Jun 2024 09:57:01 +0100
Subject: [PATCH 13/13] Helper function

---
 src/tabmat/categorical_matrix.py | 38 ++++++++++++++++++--------------
 1 file changed, 21 insertions(+), 17 deletions(-)

diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py
index 684704e9..33269439 100644
--- a/src/tabmat/categorical_matrix.py
+++ b/src/tabmat/categorical_matrix.py
@@ -231,6 +231,26 @@ def _is_polars(x) -> bool:
     return False
 
 
+def _extract_codes_and_categories(cat_vec):  # returns (indices, categories)
+    if isinstance(cat_vec, _Categorical):  # internal wrapper: reuse its arrays as-is
+        categories = cat_vec.categories
+        indices = cat_vec.indices
+    elif _is_pandas(cat_vec):  # cat_vec is itself a pandas categorical object
+        categories = cat_vec.categories.to_numpy()
+        indices = cat_vec.codes
+    elif _is_pandas(cat_vec.dtype):  # object whose dtype is pandas categorical
+        categories = cat_vec.cat.categories.to_numpy()
+        indices = cat_vec.cat.codes.to_numpy()
+    elif _is_polars(cat_vec):  # polars input, as recognised by _is_polars
+        if not _is_polars(cat_vec.dtype):  # dtype not Categorical/Enum: cast first
+            cat_vec = cat_vec.cast(pl.Categorical)
+        categories = cat_vec.cat.get_categories().to_numpy()
+        indices = cat_vec.to_physical().fill_null(-1).to_numpy()  # nulls -> -1
+    else:  # anything else is factorized, with categories sorted
+        indices, categories = pd.factorize(cat_vec, sort=True)
+    return indices, categories
+
+
 def _row_col_indexing(
     arr: np.ndarray, rows: Optional[np.ndarray], cols: Optional[np.ndarray]
 ) -> np.ndarray:
@@ -305,23 +325,7 @@ def __init__(
         self._missing_method = cat_missing_method
         self._missing_category = cat_missing_name
 
-        if isinstance(cat_vec, _Categorical):
-            indices = cat_vec.indices
-            self.categories = cat_vec.categories
-            self._input_dtype = cat_vec.dtype
-        elif _is_pandas(cat_vec):
-            self.categories = cat_vec.categories.to_numpy()
-            indices = cat_vec.codes
-        elif _is_pandas(cat_vec.dtype):
-            self.categories = cat_vec.cat.categories.to_numpy()
-            indices = cat_vec.cat.codes.to_numpy()
-        elif _is_polars(cat_vec):
-            if not _is_polars(cat_vec.dtype):
-                cat_vec = cat_vec.cast(pl.Categorical)
-            self.categories = cat_vec.cat.get_categories().to_numpy()
-            indices = cat_vec.to_physical().fill_null(-1).to_numpy()
-        else:
-            indices, self.categories = pd.factorize(cat_vec, sort=True)
+        indices, self.categories = _extract_codes_and_categories(cat_vec)
 
         if np.any(indices == -1):
             if self._missing_method == "fail":