Create SplitMatrix from polars data frame · Issue #329 · Quantco/tabmat

Currently, we have a handy from_pandas constructor

Lines 16 to 156 in 52e8583

    
           def from_pandas(
               df: pd.DataFrame,
               dtype: np.dtype = np.float64,
               sparse_threshold: float = 0.1,
               cat_threshold: int = 4,
               object_as_cat: bool = False,
               cat_position: str = "expand",
               drop_first: bool = False,
           ) -> MatrixBase:
               """
               Transform a pandas.DataFrame into an efficient SplitMatrix.

               For most users, this will be the primary way to construct tabmat
               objects from their data.

               Parameters
               ----------
               df : pd.DataFrame
                   pandas DataFrame to be converted.
               dtype : np.dtype, default np.float64
                   dtype of all sub-matrices of the resulting SplitMatrix.
               sparse_threshold : float, default 0.1
                   Density threshold. Numerical columns whose share of non-zero
                   entries is at or below it are stored in a sparse format.
               cat_threshold : int, default 4
                   Number of levels of a categorical column under which the column
                   will be stored as one-hot-encoded columns (split into a dense and
                   a sparse part using ``sparse_threshold``) instead of a
                   CategoricalMatrix.
               object_as_cat : bool, default False
                   If True, DataFrame columns stored as python objects will be
                   treated as categorical columns.
               cat_position : str {'end'|'expand'}, default 'expand'
                   Position of the categorical variables in the index. If "end", all
                   the categoricals (including the ones that did not satisfy
                   cat_threshold) will be placed at the end of the index list. If
                   "expand", all the variables will remain in the same order.
               drop_first : bool, default False
                   If True, categorical variables will have their first category
                   dropped. This allows multiple categorical variables to be
                   included in an unregularized model. If False, all categories are
                   included.

               Returns
               -------
               SplitMatrix
                   When only a single sub-matrix was built, that sub-matrix is
                   returned directly instead of being wrapped in a SplitMatrix.

               Raises
               ------
               ValueError
                   If ``cat_position`` is neither "expand" nor "end", or if the
                   DataFrame contains no column with a supported dtype.
               """
               if cat_position not in ("expand", "end"):
                   # Any other value would append categorical matrices without their
                   # indices and leave ``matrices`` and ``indices`` out of sync.
                   raise ValueError(
                       f"cat_position must be 'expand' or 'end', got {cat_position!r}."
                   )

               matrices: list[Union[DenseMatrix, SparseMatrix, CategoricalMatrix]] = []
               indices: list[list[int]] = []
               is_cat: list[bool] = []

               dense_dfidx = []  # column index in original DataFrame
               dense_mxidx = []  # index in the new SplitMatrix
               sparse_dfcols = []  # sparse columns to join together
               sparse_mxidx = []  # index in the new SplitMatrix
               ignored_cols = []

               mxcolidx = 0  # next free column of the resulting matrix
               catcolidx = 0  # next free column among the categoricals ("end" only)

               for dfcolidx, (colname, coldata) in enumerate(df.items()):
                   # categorical
                   if object_as_cat and coldata.dtype == object:
                       coldata = coldata.astype("category")
                   if isinstance(coldata.dtype, pd.CategoricalDtype):
                       cat = CategoricalMatrix(coldata, drop_first=drop_first, dtype=dtype)
                       if len(coldata.cat.categories) < cat_threshold:
                           # few levels: one-hot encode and split by density
                           (
                               X_dense_F,
                               X_sparse,
                               dense_indices,
                               sparse_indices,
                           ) = _split_sparse_and_dense_parts(
                               sps.csc_matrix(cat.tocsr(), dtype=dtype),
                               threshold=sparse_threshold,
                           )
                           matrices.append(X_dense_F)
                           is_cat.append(True)
                           matrices.append(X_sparse)
                           is_cat.append(True)
                           n_onehot_cols = len(dense_indices) + len(sparse_indices)
                           if cat_position == "expand":
                               indices.append(mxcolidx + dense_indices)
                               indices.append(mxcolidx + sparse_indices)
                               mxcolidx += n_onehot_cols
                           else:
                               # Both parts index into the same one-hot block, so
                               # they must share a single offset (as in "expand").
                               indices.append(catcolidx + dense_indices)
                               indices.append(catcolidx + sparse_indices)
                               catcolidx += n_onehot_cols
                       else:
                           matrices.append(cat)
                           is_cat.append(True)
                           if cat_position == "expand":
                               indices.append(mxcolidx + np.arange(cat.shape[1]))
                               mxcolidx += cat.shape[1]
                           else:
                               indices.append(catcolidx + np.arange(cat.shape[1]))
                               catcolidx += cat.shape[1]
                   # numerical dtypes (including columns already using pd.SparseDtype)
                   elif is_numeric_dtype(coldata):
                       # check if we want to store as sparse
                       if (coldata != 0).mean() <= sparse_threshold:
                           if not isinstance(coldata.dtype, pd.SparseDtype):
                               sparse_dtype = pd.SparseDtype(coldata.dtype, fill_value=0)
                               sparse_dfcols.append(coldata.astype(sparse_dtype))
                           else:
                               sparse_dfcols.append(coldata)
                           sparse_mxidx.append(mxcolidx)
                           mxcolidx += 1
                       else:
                           dense_dfidx.append(dfcolidx)
                           dense_mxidx.append(mxcolidx)
                           mxcolidx += 1
                   # dtype not handled yet
                   else:
                       ignored_cols.append((dfcolidx, colname))

               if ignored_cols:
                   warnings.warn(
                       f"Columns {ignored_cols} were ignored. Make sure they have a valid dtype."
                   )
               if dense_dfidx:
                   matrices.append(DenseMatrix(df.iloc[:, dense_dfidx].astype(dtype)))
                   indices.append(dense_mxidx)
                   is_cat.append(False)
               if sparse_dfcols:
                   full_sparse = pd.DataFrame(dict(enumerate(sparse_dfcols))).sparse.to_coo()
                   matrices.append(SparseMatrix(full_sparse, dtype=dtype))
                   indices.append(sparse_mxidx)
                   is_cat.append(False)

               if cat_position == "end":
                   # mxcolidx now equals the number of non-categorical columns, so
                   # shifting by it places every categorical block behind them.
                   indices = [
                       np.asarray(mat_indices) + mxcolidx if is_cat_ else mat_indices
                       for mat_indices, is_cat_ in zip(indices, is_cat)
                   ]

               if not matrices:
                   raise ValueError("DataFrame contained no valid column")
               if len(matrices) == 1:
                   return matrices[0]
               return SplitMatrix(matrices, indices)

that constructs a SplitMatrix from a pandas data frame.

It would be great to also support polars (or arrow tables more generally, given that it's easy to move from polars to arrow). This would open the door to having glum support polars data frames as input.

As a reference, Oliver Borchert (@borchero) just added arrow support to LightGBM: lightgbm-org/LightGBM#6034

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Create SplitMatrix from polars data frame #329

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

	def from_pandas(
	    df: pd.DataFrame,
	    dtype: np.dtype = np.float64,
	    sparse_threshold: float = 0.1,
	    cat_threshold: int = 4,
	    object_as_cat: bool = False,
	    cat_position: str = "expand",
	    drop_first: bool = False,
	) -> MatrixBase:
	    """
	    Transform a pandas.DataFrame into an efficient SplitMatrix.

	    For most users, this will be the primary way to construct tabmat
	    objects from their data.

	    Parameters
	    ----------
	    df : pd.DataFrame
	        pandas DataFrame to be converted.
	    dtype : np.dtype, default np.float64
	        dtype of all sub-matrices of the resulting SplitMatrix.
	    sparse_threshold : float, default 0.1
	        Density threshold. Numerical columns whose share of non-zero
	        entries is at or below it are stored in a sparse format.
	    cat_threshold : int, default 4
	        Number of levels of a categorical column under which the column
	        will be stored as one-hot-encoded columns (split into a dense and
	        a sparse part using ``sparse_threshold``) instead of a
	        CategoricalMatrix.
	    object_as_cat : bool, default False
	        If True, DataFrame columns stored as python objects will be
	        treated as categorical columns.
	    cat_position : str {'end'|'expand'}, default 'expand'
	        Position of the categorical variables in the index. If "end", all
	        the categoricals (including the ones that did not satisfy
	        cat_threshold) will be placed at the end of the index list. If
	        "expand", all the variables will remain in the same order.
	    drop_first : bool, default False
	        If True, categorical variables will have their first category
	        dropped. This allows multiple categorical variables to be
	        included in an unregularized model. If False, all categories are
	        included.

	    Returns
	    -------
	    SplitMatrix
	        When only a single sub-matrix was built, that sub-matrix is
	        returned directly instead of being wrapped in a SplitMatrix.

	    Raises
	    ------
	    ValueError
	        If ``cat_position`` is neither "expand" nor "end", or if the
	        DataFrame contains no column with a supported dtype.
	    """
	    if cat_position not in ("expand", "end"):
	        # Any other value would append categorical matrices without their
	        # indices and leave ``matrices`` and ``indices`` out of sync.
	        raise ValueError(
	            f"cat_position must be 'expand' or 'end', got {cat_position!r}."
	        )

	    matrices: list[Union[DenseMatrix, SparseMatrix, CategoricalMatrix]] = []
	    indices: list[list[int]] = []
	    is_cat: list[bool] = []

	    dense_dfidx = []  # column index in original DataFrame
	    dense_mxidx = []  # index in the new SplitMatrix
	    sparse_dfcols = []  # sparse columns to join together
	    sparse_mxidx = []  # index in the new SplitMatrix
	    ignored_cols = []

	    mxcolidx = 0  # next free column of the resulting matrix
	    catcolidx = 0  # next free column among the categoricals ("end" only)

	    for dfcolidx, (colname, coldata) in enumerate(df.items()):
	        # categorical
	        if object_as_cat and coldata.dtype == object:
	            coldata = coldata.astype("category")
	        if isinstance(coldata.dtype, pd.CategoricalDtype):
	            cat = CategoricalMatrix(coldata, drop_first=drop_first, dtype=dtype)
	            if len(coldata.cat.categories) < cat_threshold:
	                # few levels: one-hot encode and split by density
	                (
	                    X_dense_F,
	                    X_sparse,
	                    dense_indices,
	                    sparse_indices,
	                ) = _split_sparse_and_dense_parts(
	                    sps.csc_matrix(cat.tocsr(), dtype=dtype),
	                    threshold=sparse_threshold,
	                )
	                matrices.append(X_dense_F)
	                is_cat.append(True)
	                matrices.append(X_sparse)
	                is_cat.append(True)
	                n_onehot_cols = len(dense_indices) + len(sparse_indices)
	                if cat_position == "expand":
	                    indices.append(mxcolidx + dense_indices)
	                    indices.append(mxcolidx + sparse_indices)
	                    mxcolidx += n_onehot_cols
	                else:
	                    # Both parts index into the same one-hot block, so
	                    # they must share a single offset (as in "expand").
	                    indices.append(catcolidx + dense_indices)
	                    indices.append(catcolidx + sparse_indices)
	                    catcolidx += n_onehot_cols
	            else:
	                matrices.append(cat)
	                is_cat.append(True)
	                if cat_position == "expand":
	                    indices.append(mxcolidx + np.arange(cat.shape[1]))
	                    mxcolidx += cat.shape[1]
	                else:
	                    indices.append(catcolidx + np.arange(cat.shape[1]))
	                    catcolidx += cat.shape[1]
	        # numerical dtypes (including columns already using pd.SparseDtype)
	        elif is_numeric_dtype(coldata):
	            # check if we want to store as sparse
	            if (coldata != 0).mean() <= sparse_threshold:
	                if not isinstance(coldata.dtype, pd.SparseDtype):
	                    sparse_dtype = pd.SparseDtype(coldata.dtype, fill_value=0)
	                    sparse_dfcols.append(coldata.astype(sparse_dtype))
	                else:
	                    sparse_dfcols.append(coldata)
	                sparse_mxidx.append(mxcolidx)
	                mxcolidx += 1
	            else:
	                dense_dfidx.append(dfcolidx)
	                dense_mxidx.append(mxcolidx)
	                mxcolidx += 1
	        # dtype not handled yet
	        else:
	            ignored_cols.append((dfcolidx, colname))

	    if ignored_cols:
	        warnings.warn(
	            f"Columns {ignored_cols} were ignored. Make sure they have a valid dtype."
	        )
	    if dense_dfidx:
	        matrices.append(DenseMatrix(df.iloc[:, dense_dfidx].astype(dtype)))
	        indices.append(dense_mxidx)
	        is_cat.append(False)
	    if sparse_dfcols:
	        full_sparse = pd.DataFrame(dict(enumerate(sparse_dfcols))).sparse.to_coo()
	        matrices.append(SparseMatrix(full_sparse, dtype=dtype))
	        indices.append(sparse_mxidx)
	        is_cat.append(False)

	    if cat_position == "end":
	        # mxcolidx now equals the number of non-categorical columns, so
	        # shifting by it places every categorical block behind them.
	        indices = [
	            np.asarray(mat_indices) + mxcolidx if is_cat_ else mat_indices
	            for mat_indices, is_cat_ in zip(indices, is_cat)
	        ]

	    if not matrices:
	        raise ValueError("DataFrame contained no valid column")
	    if len(matrices) == 1:
	        return matrices[0]
	    return SplitMatrix(matrices, indices)

Create SplitMatrix from polars data frame #329

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions