Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
15 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -584,7 +584,7 @@ Reshaping
Sparse
^^^^^^
- Bug in :class:`SparseDtype` equality comparison when the fill value is NA. (:issue:`54770`)
-
- Bug in :meth:`DataFrame.sparse.from_spmatrix` where an invalid ``fill_value`` was hard-coded for certain subtypes. (:issue:`59063`)

ExtensionArray
^^^^^^^^^^^^^^
Expand Down
12 changes: 5 additions & 7 deletions pandas/core/arrays/sparse/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,12 +291,12 @@ def from_spmatrix(cls, data, index=None, columns=None) -> DataFrame:
Examples
--------
>>> import scipy.sparse
>>> mat = scipy.sparse.eye(3, dtype=float)
>>> mat = scipy.sparse.eye(3, dtype=int)
>>> pd.DataFrame.sparse.from_spmatrix(mat)
0 1 2
0 1.0 0 0
1 0 1.0 0
2 0 0 1.0
0 1 0 0
1 0 1 0
2 0 0 1
"""
from pandas._libs.sparse import IntIndex

Expand All @@ -313,7 +313,7 @@ def from_spmatrix(cls, data, index=None, columns=None) -> DataFrame:
indices = data.indices
indptr = data.indptr
array_data = data.data
dtype = SparseDtype(array_data.dtype, 0)
dtype = SparseDtype(array_data.dtype)
arrays = []
for i in range(n_columns):
sl = slice(indptr[i], indptr[i + 1])
Expand Down Expand Up @@ -393,8 +393,6 @@ def to_coo(self) -> spmatrix:
cols, rows, data = [], [], []
for col, (_, ser) in enumerate(self._parent.items()):
sp_arr = ser.array
if sp_arr.fill_value != 0:
raise ValueError("fill value must be 0 when converting to COO matrix")

row = sp_arr.sp_index.indices
cols.append(np.repeat(col, len(row)))
Expand Down
7 changes: 4 additions & 3 deletions pandas/core/dtypes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -1666,7 +1666,7 @@ class SparseDtype(ExtensionDtype):
"""
Dtype for data stored in :class:`SparseArray`.

`SparseDtype` is used as the data type for :class:`SparseArray`, enabling
``SparseDtype`` is used as the data type for :class:`SparseArray`, enabling
more efficient storage of data that contains a significant number of
repetitive values typically represented by a fill value. It supports any
scalar dtype as the underlying data type of the non-fill values.
Expand All @@ -1677,19 +1677,20 @@ class SparseDtype(ExtensionDtype):
The dtype of the underlying array storing the non-fill values.
fill_value : scalar, optional
The scalar value not stored in the SparseArray. By default, this
depends on `dtype`.
depends on ``dtype``.

=========== ==========
dtype na_value
=========== ==========
float ``np.nan``
complex ``np.nan``
int ``0``
bool ``False``
datetime64 ``pd.NaT``
timedelta64 ``pd.NaT``
=========== ==========

The default value may be overridden by specifying a `fill_value`.
The default value may be overridden by specifying a ``fill_value``.

Attributes
----------
Expand Down
4 changes: 3 additions & 1 deletion pandas/core/dtypes/missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -618,6 +618,8 @@ def na_value_for_dtype(dtype: DtypeObj, compat: bool = True):
nan
>>> na_value_for_dtype(np.dtype("float64"))
nan
>>> na_value_for_dtype(np.dtype("complex128"))
nan
>>> na_value_for_dtype(np.dtype("bool"))
False
>>> na_value_for_dtype(np.dtype("datetime64[ns]"))
Expand All @@ -629,7 +631,7 @@ def na_value_for_dtype(dtype: DtypeObj, compat: bool = True):
elif dtype.kind in "mM":
unit = np.datetime_data(dtype)[0]
return dtype.type("NaT", unit)
elif dtype.kind == "f":
elif dtype.kind in "fc":
return np.nan
elif dtype.kind in "iu":
if compat:
Expand Down
77 changes: 39 additions & 38 deletions pandas/tests/arrays/sparse/test_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,28 +105,36 @@ def test_accessor_raises(self):

@pytest.mark.parametrize("format", ["csc", "csr", "coo"])
@pytest.mark.parametrize("labels", [None, list(string.ascii_letters[:10])])
@pytest.mark.parametrize("dtype", ["float64", "int64"])
@pytest.mark.parametrize("dtype", [np.complex128, np.float64, np.int64, bool])
def test_from_spmatrix(self, format, labels, dtype):
sp_sparse = pytest.importorskip("scipy.sparse")

sp_dtype = SparseDtype(dtype, np.array(0, dtype=dtype).item())
sp_dtype = SparseDtype(dtype)

mat = sp_sparse.eye(10, format=format, dtype=dtype)
result = pd.DataFrame.sparse.from_spmatrix(mat, index=labels, columns=labels)
sp_mat = sp_sparse.eye(10, format=format, dtype=dtype)
result = pd.DataFrame.sparse.from_spmatrix(sp_mat, index=labels, columns=labels)
mat = np.eye(10, dtype=dtype)
expected = pd.DataFrame(
np.eye(10, dtype=dtype), index=labels, columns=labels
np.ma.array(mat, mask=(mat == 0)).filled(sp_dtype.fill_value),
index=labels,
columns=labels,
).astype(sp_dtype)
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize("format", ["csc", "csr", "coo"])
def test_from_spmatrix_including_explicit_zero(self, format):
@pytest.mark.parametrize("dtype", [np.int64, bool])
def test_from_spmatrix_including_explicit_zero(self, format, dtype):
sp_sparse = pytest.importorskip("scipy.sparse")

mat = sp_sparse.random(10, 2, density=0.5, format=format)
mat.data[0] = 0
result = pd.DataFrame.sparse.from_spmatrix(mat)
dtype = SparseDtype("float64", 0.0)
expected = pd.DataFrame(mat.todense()).astype(dtype)
sp_dtype = SparseDtype(dtype)

sp_mat = sp_sparse.random(10, 2, density=0.5, format=format, dtype=dtype)
sp_mat.data[0] = 0
result = pd.DataFrame.sparse.from_spmatrix(sp_mat)
mat = sp_mat.toarray()
expected = pd.DataFrame(
np.ma.array(mat, mask=(mat == 0)).filled(sp_dtype.fill_value)
).astype(sp_dtype)
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize(
Expand All @@ -136,41 +144,34 @@ def test_from_spmatrix_including_explicit_zero(self, format):
def test_from_spmatrix_columns(self, columns):
sp_sparse = pytest.importorskip("scipy.sparse")

dtype = SparseDtype("float64", 0.0)
sp_dtype = SparseDtype(np.float64)

mat = sp_sparse.random(10, 2, density=0.5)
result = pd.DataFrame.sparse.from_spmatrix(mat, columns=columns)
expected = pd.DataFrame(mat.toarray(), columns=columns).astype(dtype)
sp_mat = sp_sparse.random(10, 2, density=0.5)
result = pd.DataFrame.sparse.from_spmatrix(sp_mat, columns=columns)
mat = sp_mat.toarray()
expected = pd.DataFrame(
np.ma.array(mat, mask=(mat == 0)).filled(sp_dtype.fill_value),
columns=columns,
).astype(sp_dtype)
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize(
"colnames", [("A", "B"), (1, 2), (1, pd.NA), (0.1, 0.2), ("x", "x"), (0, 0)]
"columns", [("A", "B"), (1, 2), (1, pd.NA), (0.1, 0.2), ("x", "x"), (0, 0)]
)
def test_to_coo(self, colnames):
@pytest.mark.parametrize("dtype", [np.complex128, np.float64, np.int64, bool])
def test_to_coo(self, columns, dtype):
sp_sparse = pytest.importorskip("scipy.sparse")

df = pd.DataFrame(
{colnames[0]: [0, 1, 0], colnames[1]: [1, 0, 0]}, dtype="Sparse[int64, 0]"
)
result = df.sparse.to_coo()
expected = sp_sparse.coo_matrix(np.asarray(df))
assert (result != expected).nnz == 0
sp_dtype = SparseDtype(dtype)

@pytest.mark.parametrize("fill_value", [1, np.nan])
def test_to_coo_nonzero_fill_val_raises(self, fill_value):
pytest.importorskip("scipy")
df = pd.DataFrame(
{
"A": SparseArray(
[fill_value, fill_value, fill_value, 2], fill_value=fill_value
),
"B": SparseArray(
[fill_value, 2, fill_value, fill_value], fill_value=fill_value
),
}
)
with pytest.raises(ValueError, match="fill value must be 0"):
df.sparse.to_coo()
expected = sp_sparse.random(10, 2, density=0.5, format="coo", dtype=dtype)
mat = expected.toarray()
result = pd.DataFrame(
np.ma.array(mat, mask=(mat == 0)).filled(sp_dtype.fill_value),
columns=columns,
dtype=sp_dtype,
).sparse.to_coo()
assert (result != expected).nnz == 0

def test_to_coo_midx_categorical(self):
# GH#50996
Expand Down
3 changes: 3 additions & 0 deletions pandas/tests/dtypes/test_missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -697,6 +697,9 @@ def test_array_equivalent_index_with_tuples():
("f2", np.nan),
("f4", np.nan),
("f8", np.nan),
# Complex
("c8", np.nan),
("c16", np.nan),
# Object
("O", np.nan),
# Interval
Expand Down
16 changes: 7 additions & 9 deletions pandas/tests/indexing/test_loc.py
Original file line number Diff line number Diff line change
Expand Up @@ -1281,7 +1281,7 @@ def test_loc_getitem_time_object(self, frame_or_series):
tm.assert_equal(result, expected)

@pytest.mark.parametrize("spmatrix_t", ["coo_matrix", "csc_matrix", "csr_matrix"])
@pytest.mark.parametrize("dtype", [np.int64, np.float64, complex])
@pytest.mark.parametrize("dtype", [np.complex128, np.float64, np.int64, bool])
def test_loc_getitem_range_from_spmatrix(self, spmatrix_t, dtype):
sp_sparse = pytest.importorskip("scipy.sparse")

Expand All @@ -1296,13 +1296,13 @@ def test_loc_getitem_range_from_spmatrix(self, spmatrix_t, dtype):

# regression test for GH#34526
itr_idx = range(2, rows)
result = df.loc[itr_idx].values
result = np.nan_to_num(df.loc[itr_idx].values)
expected = spmatrix.toarray()[itr_idx]
tm.assert_numpy_array_equal(result, expected)

# regression test for GH#34540
result = df.loc[itr_idx].dtypes.values
expected = np.full(cols, SparseDtype(dtype, fill_value=0))
expected = np.full(cols, SparseDtype(dtype))
tm.assert_numpy_array_equal(result, expected)

def test_loc_getitem_listlike_all_retains_sparse(self):
Expand All @@ -1314,18 +1314,16 @@ def test_loc_getitem_sparse_frame(self):
# GH34687
sp_sparse = pytest.importorskip("scipy.sparse")

df = DataFrame.sparse.from_spmatrix(sp_sparse.eye(5))
df = DataFrame.sparse.from_spmatrix(sp_sparse.eye(5, dtype=np.int64))
result = df.loc[range(2)]
expected = DataFrame(
[[1.0, 0.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0, 0.0]],
dtype=SparseDtype("float64", 0.0),
[[1, 0, 0, 0, 0], [0, 1, 0, 0, 0]],
dtype=SparseDtype(np.int64),
)
tm.assert_frame_equal(result, expected)

result = df.loc[range(2)].loc[range(1)]
expected = DataFrame(
[[1.0, 0.0, 0.0, 0.0, 0.0]], dtype=SparseDtype("float64", 0.0)
)
expected = DataFrame([[1, 0, 0, 0, 0]], dtype=SparseDtype(np.int64))
tm.assert_frame_equal(result, expected)

def test_loc_getitem_sparse_series(self):
Expand Down