1111import numpy as np
1212
1313from pandas .types .missing import isnull , notnull
14- from pandas .types .cast import _maybe_upcast
14+ from pandas .types .cast import _maybe_upcast , _find_common_type
1515from pandas .types .common import _ensure_platform_int
1616
1717from pandas .core .common import _try_sort
2525 create_block_manager_from_arrays )
2626import pandas .core .generic as generic
2727from pandas .sparse .series import SparseSeries , SparseArray
28+ from pandas ._sparse import BlockIndex , get_blocks
2829from pandas .util .decorators import Appender
2930import pandas .core .ops as ops
3031
32+ try :
33+ from scipy .sparse import spmatrix # noqa
34+ except ImportError :
35+ spmatrix = type ('mock spmatrix' , (), {})
3136
3237_shared_doc_kwargs = dict (klass = 'SparseDataFrame' )
3338
@@ -39,7 +44,7 @@ class SparseDataFrame(DataFrame):
3944
4045 Parameters
4146 ----------
42- data : same types as can be passed to DataFrame
47+ data : same types as can be passed to DataFrame or scipy.sparse.spmatrix
4348 index : array-like, optional
4449 column : array-like, optional
4550 default_kind : {'block', 'integer'}, default 'block'
@@ -85,24 +90,20 @@ def __init__(self, data=None, index=None, columns=None, default_kind=None,
8590 self ._default_fill_value = default_fill_value
8691
8792 if isinstance (data , dict ):
88- mgr = self ._init_dict (data , index , columns )
89- if dtype is not None :
90- mgr = mgr .astype (dtype )
93+ mgr = self ._init_dict (data , index , columns , dtype = dtype )
9194 elif isinstance (data , (np .ndarray , list )):
92- mgr = self ._init_matrix (data , index , columns )
93- if dtype is not None :
94- mgr = mgr .astype (dtype )
95+ mgr = self ._init_matrix (data , index , columns , dtype = dtype )
9596 elif isinstance (data , SparseDataFrame ):
9697 mgr = self ._init_mgr (data ._data ,
9798 dict (index = index , columns = columns ),
9899 dtype = dtype , copy = copy )
99100 elif isinstance (data , DataFrame ):
100- mgr = self ._init_dict (data , data .index , data .columns )
101- if dtype is not None :
102- mgr = mgr .astype (dtype )
101+ mgr = self ._init_dict (data , data .index , data .columns , dtype = dtype )
103102 elif isinstance (data , BlockManager ):
104103 mgr = self ._init_mgr (data , axes = dict (index = index , columns = columns ),
105104 dtype = dtype , copy = copy )
105+ elif isinstance (data , spmatrix ):
106+ mgr = self ._init_spmatrix (data , index , columns , dtype = dtype )
106107 elif data is None :
107108 data = DataFrame ()
108109
@@ -175,6 +176,33 @@ def _init_dict(self, data, index, columns, dtype=None):
175176
def _init_matrix(self, data, index, columns, dtype=None):
    """
    Init self from a 2-D array-like by splitting it into one entry per
    column label and delegating to ``_init_dict``.

    Parameters
    ----------
    data : ndarray or list
        Passed through ``_prep_ndarray`` (without copying) before use.
    index, columns : array-like or None
        Resolved against the prepared array by ``_prep_index``.
    dtype : dtype, optional
        Forwarded unchanged to ``_init_dict``.

    Returns
    -------
    The result of ``_init_dict`` for the per-column mapping.
    """
    values = _prep_ndarray(data, copy=False)
    index, columns = self._prep_index(values, index, columns)
    # Map each column label to the matching column slice of the array.
    by_label = {label: values[:, pos] for pos, label in enumerate(columns)}
    return self._init_dict(by_label, index, columns, dtype)
182+
def _init_spmatrix(self, data, index, columns, dtype=None):
    """
    Init self from a scipy.sparse matrix.

    The matrix is converted to COO format and its explicitly stored
    entries are grouped by column; each group becomes one SparseSeries
    backed by a BlockIndex. Columns without any stored entry get an
    empty SparseSeries.

    Parameters
    ----------
    data : scipy.sparse.spmatrix
        Source matrix; converted with ``tocoo(copy=False)``.
    index, columns : array-like or None
        Resolved against ``data`` by ``_prep_index``.
    dtype : dtype, optional
        Forwarded unchanged to ``_init_dict``.

    Returns
    -------
    The result of ``_init_dict`` for the per-column SparseSeries.
    """
    index, columns = self._prep_index(data, index, columns)
    data = data.tocoo(copy=False)
    N = len(index)

    # Construct a dict of SparseSeries, one per column that has entries
    sdict = {}
    values = Series(data.data, index=data.row)
    for col, rowvals in values.groupby(data.col):
        # COO format does not guarantee that entries are ordered by row,
        # while get_blocks expects int32 row indices in ascending order;
        # sort first so values and block locations stay aligned.
        # NOTE(review): duplicate (row, col) entries in the input are not
        # summed here -- confirm whether callers can pass such matrices.
        rowvals = rowvals.sort_index()
        rows = rowvals.index.values.astype(np.int32)
        blocs, blens = get_blocks(rows)
        sdict[columns[col]] = SparseSeries(
            rowvals.values, index=index,
            sparse_index=BlockIndex(N, blocs, blens))

    # Add any columns that were empty and thus not grouped on above
    sdict.update({column: SparseSeries(index=index,
                                       sparse_index=BlockIndex(N, [], []))
                  for column in columns
                  if column not in sdict})

    return self._init_dict(sdict, index, columns, dtype)
204+
205+ def _prep_index (self , data , index , columns ):
178206 N , K = data .shape
179207 if index is None :
180208 index = _default_index (N )
@@ -187,9 +215,84 @@ def _init_matrix(self, data, index, columns, dtype=None):
187215 if len (index ) != N :
188216 raise ValueError ('Index length mismatch: %d vs. %d' %
189217 (len (index ), N ))
218+ return index , columns
190219
191- data = dict ([(idx , data [:, i ]) for i , idx in enumerate (columns )])
192- return self ._init_dict (data , index , columns , dtype )
def as_matrix(self, columns=None, sparse=False):
    """
    Convert the frame to its Numpy-array or SciPy sparse COO matrix
    representation.

    Parameters
    ----------
    columns : list, optional, default=None
        If None, return all columns. Otherwise, returns specified columns.
    sparse : bool, optional, default=False
        If True, return an instance of scipy.sparse.coo_matrix instead
        of ndarray. If False, the result values array will be DENSE.

    Returns
    -------
    values : ndarray or scipy.sparse.spmatrix
        If the caller is heterogeneous and contains booleans or objects,
        the result will be of dtype=object. See Notes.

    Notes
    -----
    The dtype will be the lowest-common-denominator type (implicit
    upcasting); that is to say if the dtypes (even of numeric types)
    are mixed, the one that accommodates all will be chosen.

    e.g. If the dtypes are float16 and float32, dtype will be upcast to
    float32. By numpy.find_common_type convention, mixing int64 and
    uint64 will result in a float64 dtype.

    See Also
    --------
    pandas.SparseDataFrame.to_coo
    """
    if sparse:
        # Restrict to the requested columns first, then let to_coo build
        # the sparse matrix from that (sub)frame.
        subdf = self if columns is None else self[columns]
        return subdf.to_coo()

    # Dense path: defer to the parent class implementation.
    return super(SparseDataFrame, self).as_matrix(columns=columns)
258+
def to_coo(self):
    """
    Convert the frame to its SciPy sparse COO matrix representation.

    Only the values explicitly stored in each column (``sp_values``, at
    the positions given by ``sp_index``) are written to the result.

    Returns
    -------
    coo_matrix : scipy.sparse.spmatrix
        If the caller is heterogeneous and contains booleans or objects,
        the result will be of dtype=object. See Notes.

    Raises
    ------
    ImportError
        If scipy is not installed.

    Notes
    -----
    The dtype will be the lowest-common-denominator type (implicit
    upcasting); that is to say if the dtypes (even of numeric types)
    are mixed, the one that accommodates all will be chosen.

    e.g. If the dtypes are float16 and float32, dtype will be upcast to
    float32. By numpy.find_common_type convention, mixing int64 and
    uint64 will result in a float64 dtype.

    A frame without any columns yields an empty matrix of the frame's
    shape.
    """
    try:
        from scipy.sparse import coo_matrix
    except ImportError:
        raise ImportError('Scipy is not installed')

    cols, rows, datas = [], [], []
    for col, name in enumerate(self):
        s = self[name]
        # Integer positions of the stored values within this column
        row = s.sp_index.to_int_index().indices
        cols.append(np.repeat(col, len(row)))
        rows.append(row)
        datas.append(s.sp_values)

    if not cols:
        # No columns at all: np.hstack raises ValueError on an empty
        # sequence, so build the empty matrix directly from the shape.
        return coo_matrix(self.shape)

    cols = np.hstack(cols)
    rows = np.hstack(rows)
    datas = np.hstack(datas).astype(_find_common_type(self.dtypes))
    return coo_matrix((datas, (rows, cols)), shape=self.shape)
193296
194297 def __array_wrap__ (self , result ):
195298 return self ._constructor (
0 commit comments