Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,14 @@ Changelog
4.0.1 - 2024-06-25
------------------

**New feature:**

- Added a new function, :func:`tabmat.from_polars`, to convert a :class:`polars.DataFrame` into a :class:`tabmat.SplitMatrix`.

**Other changes:**

- Removed reference to the ``.A`` attribute and replaced it with ``.toarray()``.
- Add support between formulaic and pandas 3.0
- Add support between formulaic and pandas 3.0.
- Support PyPI release for numpy 2.0.

4.0.0 - 2024-04-23
Expand Down
3 changes: 1 addition & 2 deletions conda.recipe/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,8 @@ requirements:
run:
- python
- {{ pin_compatible('numpy') }}
- pandas
- scipy
- formulaic>=0.6
- scipy

test:
requires:
Expand Down
6 changes: 5 additions & 1 deletion environment-win.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,12 @@ channels:
- nodefaults
dependencies:
- libblas>=0=*mkl
- pandas
- formulaic>=0.6
- numpy
- pandas
- polars
- pyarrow # exclusively for polars tests
- scipy

# development tools
- click
Expand Down
6 changes: 5 additions & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,12 @@ channels:
- conda-forge
- nodefaults
dependencies:
- pandas
- formulaic>=0.6
- numpy
- pandas
- polars
- pyarrow # exclusively for polars tests
- scipy

# development tools
- click
Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@
print(f"Debug Build: {debug_build}")

if sys.platform == "win32":
allocator_libs = []
allocator_libs = [] # type: ignore
extra_compile_args = ["/openmp", "/O2"]
extra_link_args = ["/openmp"]
# make sure we can find xsimd headers
Expand Down Expand Up @@ -157,7 +157,7 @@
],
package_dir={"": "src"},
packages=find_packages(where="src"),
install_requires=["numpy", "pandas", "scipy", "formulaic>=0.6"],
install_requires=["formulaic>=0.6", "numpy", "scipy"],
python_requires=">=3.9",
ext_modules=cythonize(
ext_modules,
Expand Down
3 changes: 2 additions & 1 deletion src/tabmat/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import importlib.metadata

from .categorical_matrix import CategoricalMatrix
from .constructor import from_csc, from_formula, from_pandas
from .constructor import from_csc, from_formula, from_pandas, from_polars
from .dense_matrix import DenseMatrix
from .matrix_base import MatrixBase
from .sparse_matrix import SparseMatrix
Expand All @@ -23,6 +23,7 @@
"from_csc",
"from_formula",
"from_pandas",
"from_polars",
"as_tabmat",
"hstack",
]
135 changes: 101 additions & 34 deletions src/tabmat/categorical_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,13 +162,15 @@ def matvec(mat, vec):

"""

import importlib.util
import re
import warnings
from typing import Optional, Union

import numpy as np
import pandas as pd
from scipy import sparse as sps

from .dense_matrix import DenseMatrix
from .ext.categorical import (
matvec_complex,
matvec_fast,
Expand All @@ -191,6 +193,20 @@ def matvec(mat, vec):
setup_restrictions,
)

# pandas and polars are optional dependencies: import each one only when it
# is installed, so the rest of this module can degrade gracefully (helpers
# like _is_pandas/_is_polars fall back to returning False).
if importlib.util.find_spec("pandas"):
    import pandas as pd
if importlib.util.find_spec("polars"):
    import polars as pl


class _Categorical:
"""This class helps us avoid copies while subsetting."""

def __init__(self, indices, categories, dtype):
self.indices = indices
self.categories = categories
self.dtype = dtype


def _is_indexer_full_length(full_length: int, indexer: Union[slice, np.ndarray]):
if isinstance(indexer, np.ndarray):
Expand All @@ -203,6 +219,38 @@ def _is_indexer_full_length(full_length: int, indexer: Union[slice, np.ndarray])
return len(range(*indexer.indices(full_length))) == full_length


def _is_pandas(x) -> bool:
if importlib.util.find_spec("pandas"):
return isinstance(x, (pd.Categorical, pd.CategoricalDtype))
return False


def _is_polars(x) -> bool:
if importlib.util.find_spec("polars"):
return isinstance(x, (pl.Series, pl.Categorical, pl.Enum))
return False


def _extract_codes_and_categories(cat_vec):
    """Return ``(indices, categories)`` for any supported categorical input.

    Parameters
    ----------
    cat_vec:
        One of: an internal ``_Categorical`` holder, a pandas
        ``Categorical``, a pandas Series with categorical dtype, a polars
        ``Series``, or any array-like (with a ``dtype`` attribute) that
        pandas can factorize.

    Returns
    -------
    indices:
        Integer codes into ``categories``; ``-1`` marks missing values.
    categories:
        Array of the distinct category labels.

    Raises
    ------
    ModuleNotFoundError
        If a general array-like is passed but pandas (needed for
        ``factorize``) is not installed.
    """
    if isinstance(cat_vec, _Categorical):
        # Already factorized; reuse without copying.
        categories = cat_vec.categories
        indices = cat_vec.indices
    elif _is_pandas(cat_vec):
        # pandas.Categorical
        categories = cat_vec.categories.to_numpy()
        indices = cat_vec.codes
    elif _is_pandas(cat_vec.dtype):
        # pandas Series of categorical dtype
        categories = cat_vec.cat.categories.to_numpy()
        indices = cat_vec.cat.codes.to_numpy()
    elif _is_polars(cat_vec):
        if not _is_polars(cat_vec.dtype):
            # Plain polars Series: cast so categories/codes are available.
            cat_vec = cat_vec.cast(pl.Categorical)
        categories = cat_vec.cat.get_categories().to_numpy()
        # The physical representation holds the codes; nulls become -1.
        indices = cat_vec.to_physical().fill_null(-1).to_numpy()
    else:
        if importlib.util.find_spec("pandas") is None:
            # pandas is an optional dependency; without this guard the call
            # below would die with an unhelpful NameError on `pd`.
            raise ModuleNotFoundError(
                "Converting a general array-like to categorical requires "
                "pandas, which is not installed. Install pandas or pass an "
                "already-categorical input."
            )
        indices, categories = pd.factorize(cat_vec, sort=True)
    return indices, categories


def _row_col_indexing(
arr: np.ndarray, rows: Optional[np.ndarray], cols: Optional[np.ndarray]
) -> np.ndarray:
Expand Down Expand Up @@ -255,7 +303,7 @@ class CategoricalMatrix(MatrixBase):

def __init__(
self,
cat_vec: Union[list, np.ndarray, pd.Categorical],
cat_vec,
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mypy might actually be useful here. It is quite good with union types and narrowing them in if-statements these days.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mypy doesn't play nice with optional imports. I tried conditioning imports on type hinting to no avail.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using TYPE_CHECKING didn't work? I think it is reasonable to assume both Pandas and Polars are installed in the dev environment, isn't it?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using TYPE_CHECKING didn't work?

No. :\ I may have done it wrong though.

drop_first: bool = False,
dtype: np.dtype = np.float64,
column_name: Optional[str] = None,
Expand All @@ -264,35 +312,40 @@ def __init__(
cat_missing_method: str = "fail",
cat_missing_name: str = "(MISSING)",
):
if cat_missing_method not in ["fail", "zero", "convert"]:
if cat_missing_method not in {"fail", "zero", "convert"}:
raise ValueError(
"cat_missing_method must be one of 'fail' 'zero' or 'convert', "
f" got {cat_missing_method}"
"cat_missing_method must be one of 'fail' 'zero' or 'convert'; "
f" got {cat_missing_method}."
)

if not hasattr(cat_vec, "dtype"):
cat_vec = np.array(cat_vec) # avoid errors in pd.factorize

self._input_dtype = cat_vec.dtype
self._missing_method = cat_missing_method
self._missing_category = cat_missing_name

if isinstance(cat_vec, pd.Categorical):
self.cat = cat_vec
else:
self.cat = pd.Categorical(cat_vec)
indices, self.categories = _extract_codes_and_categories(cat_vec)

if pd.isnull(self.cat).any():
if np.any(indices == -1):
if self._missing_method == "fail":
raise ValueError(
"Categorical data can't have missing values "
"if cat_missing_method='fail'."
)

elif self._missing_method == "convert":
if self._missing_category in self.cat.categories:
if self._missing_category in self.categories:
raise ValueError(
f"Missing category {self._missing_category} already exists."
)

self.cat = self.cat.add_categories([self._missing_category])
self.categories = np.hstack(
[self.categories, self._missing_category], dtype="object"
)

indices = np.where(indices < 0, len(self.categories) - 1, indices)

self.cat[pd.isnull(self.cat)] = self._missing_category
self._has_missings = False

else:
Expand All @@ -302,38 +355,56 @@ def __init__(
self._has_missings = False

self.drop_first = drop_first
self.shape = (len(self.cat), len(self.cat.categories) - int(drop_first))
self.indices = self.cat.codes.astype(np.int32)
self.x_csc: Optional[tuple[Optional[np.ndarray], np.ndarray, np.ndarray]] = None
self.indices = indices.astype(np.int32, copy=False)
self.shape = (len(self.indices), len(self.categories) - int(drop_first))
self.x_csc = None
self.dtype = np.dtype(dtype)

self._colname = column_name
self._colname_format = column_name_format

if term_name is None:
self._term = self._colname
else:
self._term = term_name
self._colname_format = column_name_format

__array_ufunc__ = None

@property
def cat(self):
    """Return a series with same data as what was initially fed to __init__.

    This property is available for backward compatibility.

    Returns a ``polars.Series`` when the original input had a polars dtype,
    otherwise a ``pandas.Categorical``.
    """
    warnings.warn(
        "This property will be removed in the next major release.",
        category=DeprecationWarning,
    )

    if _is_polars(self._input_dtype):
        # Rebuild the original values; negative codes become nulls.
        out = self.categories[self.indices].astype("object", copy=False)
        out = np.where(self.indices < 0, None, out)
        return pl.Series(out, dtype=pl.Enum(self.categories))

    return pd.Categorical.from_codes(self.indices, categories=self.categories)

def recover_orig(self) -> np.ndarray:
    """Return a 1d numpy array with the same data as was fed to __init__.

    Entries that were missing in the input are reported through a masked
    array view.

    Test: matrix/test_categorical_matrix::test_recover_orig
    """
    values = self.categories[self.indices]

    missing_was_converted = (
        self._missing_method == "convert"
        and self._missing_category in self.categories
    )

    if self._has_missings:
        # Codes of -1 mark entries that were missing in the original input.
        masked = values.view(np.ma.MaskedArray)
        masked.mask = self.indices == -1
        return masked
    if missing_was_converted:
        # The synthetic missing category was appended as the last category,
        # so its code is len(categories) - 1.
        masked = values.view(np.ma.MaskedArray)
        masked.mask = self.indices == len(self.categories) - 1
        return masked
    return values

Expand Down Expand Up @@ -529,8 +600,6 @@ def _cross_sandwich(
R_cols: Optional[np.ndarray] = None,
) -> np.ndarray:
"""Perform a sandwich product: X.T @ diag(d) @ Y."""
from .dense_matrix import DenseMatrix

if isinstance(other, DenseMatrix):
return self._cross_dense(other._array, d, rows, L_cols, R_cols)
if isinstance(other, SparseMatrix):
Expand Down Expand Up @@ -576,8 +645,6 @@ def tocsr(self) -> sps.csr_matrix:

def to_sparse_matrix(self):
"""Return a tabmat.SparseMatrix representation."""
from .sparse_matrix import SparseMatrix

return SparseMatrix(
self.tocsr(),
column_names=self.column_names,
Expand All @@ -594,7 +661,7 @@ def unpack(self):

def astype(self, dtype, order="K", casting="unsafe", copy=True):
    """Return CategoricalMatrix cast to new type.

    NOTE(review): unlike ``numpy.ndarray.astype``, this mutates ``self``
    and returns it rather than producing a copy; the ``order``,
    ``casting`` and ``copy`` arguments are accepted for signature
    compatibility but are ignored — confirm no caller relies on
    receiving an independent copy.
    """
    self.dtype = np.dtype(dtype)
    return self

def _get_col_stds(self, weights: np.ndarray, col_means: np.ndarray) -> np.ndarray:
Expand All @@ -613,7 +680,7 @@ def __getitem__(self, item):
if isinstance(row, np.ndarray):
row = row.ravel()
return CategoricalMatrix(
self.cat[row],
_Categorical(self.indices[row], self.categories, self._input_dtype),
drop_first=self.drop_first,
dtype=self.dtype,
column_name=self._colname,
Expand Down Expand Up @@ -745,7 +812,7 @@ def multiply(self, other) -> SparseMatrix:
)

def __repr__(self):
    """Summarize the matrix by its class name and category labels."""
    class_name = type(self).__name__
    return f"{class_name}\nCategories: {self.categories}"

def get_names(
self,
Expand Down Expand Up @@ -786,19 +853,19 @@ def get_names(
raise ValueError(f"Type must be 'column' or 'term', got {type}")

if indices is None:
indices = list(range(len(self.cat.categories) - self.drop_first))
indices = list(range(len(self.categories) - self.drop_first))
if name is None and missing_prefix is None:
return [None] * (len(self.cat.categories) - self.drop_first)
return [None] * (len(self.categories) - self.drop_first)
elif name is None:
name = f"{missing_prefix}{indices[0]}-{indices[-1]}"

if type == "column":
return [
self._colname_format.format(name=name, category=cat)
for cat in self.cat.categories[self.drop_first :]
for cat in self.categories[self.drop_first :]
]
else:
return [name] * (len(self.cat.categories) - self.drop_first)
return [name] * (len(self.categories) - self.drop_first)

def set_names(self, names: Union[str, list[Optional[str]]], type: str = "column"):
"""Set column names.
Expand All @@ -820,7 +887,7 @@ def set_names(self, names: Union[str, list[Optional[str]]], type: str = "column"
if type == "column":
# Try finding the column name
base_names = []
for name, cat in zip(names, self.cat.categories[self.drop_first :]):
for name, cat in zip(names, self.categories[self.drop_first :]):
partial_name = self._colname_format.format(
name="__CAPTURE__", category=cat
)
Expand Down
Loading