Skip to content

Commit 59b431f

Browse files
TomAugspurger authored and jreback committed
DOC/TST: Indexing with NA raises (#30308)
1 parent 7b35099 commit 59b431f

File tree

21 files changed

+304
-29
lines changed

21 files changed

+304
-29
lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ repos:
1111
language: python_venv
1212
additional_dependencies: [flake8-comprehensions>=3.1.0]
1313
- repo: https://github.com/pre-commit/mirrors-isort
14-
rev: v4.3.20
14+
rev: v4.3.21
1515
hooks:
1616
- id: isort
1717
language: python_venv

asv_bench/benchmarks/indexing.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,7 @@ def setup(self):
131131
self.col_scalar = columns[10]
132132
self.bool_indexer = self.df[self.col_scalar] > 0
133133
self.bool_obj_indexer = self.bool_indexer.astype(object)
134+
self.boolean_indexer = (self.df[self.col_scalar] > 0).astype("boolean")
134135

135136
def time_loc(self):
136137
self.df.loc[self.idx_scalar, self.col_scalar]
@@ -144,6 +145,9 @@ def time_boolean_rows(self):
144145
def time_boolean_rows_object(self):
145146
self.df[self.bool_obj_indexer]
146147

148+
def time_boolean_rows_boolean(self):
149+
self.df[self.boolean_indexer]
150+
147151

148152
class DataFrameNumericIndexing:
149153
def setup(self):

doc/source/reference/extensions.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,3 +59,11 @@ objects.
5959
api.extensions.ExtensionArray.nbytes
6060
api.extensions.ExtensionArray.ndim
6161
api.extensions.ExtensionArray.shape
62+
63+
Additionally, we have some utility methods for ensuring your object
64+
behaves correctly.
65+
66+
.. autosummary::
67+
:toctree: api/
68+
69+
api.indexers.check_bool_array_indexer

doc/source/user_guide/boolean.rst

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,29 @@ Nullable Boolean Data Type
1414

1515
.. versionadded:: 1.0.0
1616

17+
18+
.. _boolean.indexing:
19+
20+
Indexing with NA values
21+
-----------------------
22+
23+
pandas does not allow indexing with NA values. Attempting to do so
24+
will raise a ``ValueError``.
25+
26+
.. ipython:: python
27+
:okexcept:
28+
29+
s = pd.Series([1, 2, 3])
30+
mask = pd.array([True, False, pd.NA], dtype="boolean")
31+
s[mask]
32+
33+
The missing values will need to be explicitly filled with True or False prior
34+
to using the array as a mask.
35+
36+
.. ipython:: python
37+
38+
s[mask.fillna(False)]
39+
1740
.. _boolean.kleene:
1841

1942
Kleene Logical Operations

doc/source/whatsnew/v1.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -820,6 +820,7 @@ Datetimelike
820820
- Bug in :func:`pandas._config.localization.get_locales` where the ``locale -a`` encodes the locales list as windows-1252 (:issue:`23638`, :issue:`24760`, :issue:`27368`)
821821
- Bug in :meth:`Series.var` failing to raise ``TypeError`` when called with ``timedelta64[ns]`` dtype (:issue:`28289`)
822822
- Bug in :meth:`DatetimeIndex.strftime` and :meth:`Series.dt.strftime` where ``NaT`` was converted to the string ``'NaT'`` instead of ``np.nan`` (:issue:`29578`)
823+
- Bug in masking datetime-like arrays with a boolean mask of an incorrect length not raising an ``IndexError`` (:issue:`30308`)
823824
- Bug in :attr:`Timestamp.resolution` being a property instead of a class attribute (:issue:`29910`)
824825
- Bug in :func:`pandas.to_datetime` when called with ``None`` raising ``TypeError`` instead of returning ``NaT`` (:issue:`30011`)
825826
- Bug in :func:`pandas.to_datetime` failing for ``deque`` objects when using ``cache=True`` (the default) (:issue:`29403`)

pandas/api/indexers/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
"""Public API for Rolling Window Indexers"""
2+
from pandas.core.indexers import check_bool_array_indexer # noqa: F401
23
from pandas.core.window.indexers import BaseIndexer # noqa: F401

pandas/core/arrays/boolean.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@
2929
from pandas.core import nanops, ops
3030
from pandas.core.algorithms import take
3131
from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
32+
import pandas.core.common as com
33+
from pandas.core.indexers import check_bool_array_indexer
3234

3335
if TYPE_CHECKING:
3436
from pandas._typing import Scalar
@@ -307,11 +309,22 @@ def _from_factorized(cls, values, original: "BooleanArray"):
307309
def _formatter(self, boxed=False):
308310
return str
309311

312+
@property
313+
def _hasna(self) -> bool:
314+
# Note: this is expensive right now! The hope is that we can
315+
# make this faster by having an optional mask, but not have to change
316+
# source code using it.
317+
return self._mask.any()
318+
310319
def __getitem__(self, item):
311320
if is_integer(item):
312321
if self._mask[item]:
313322
return self.dtype.na_value
314323
return self._data[item]
324+
325+
elif com.is_bool_indexer(item):
326+
item = check_bool_array_indexer(self, item)
327+
315328
return type(self)(self._data[item], self._mask[item])
316329

317330
def _coerce_to_ndarray(self, dtype=None, na_value: "Scalar" = libmissing.NA):
@@ -329,7 +342,7 @@ def _coerce_to_ndarray(self, dtype=None, na_value: "Scalar" = libmissing.NA):
329342
if dtype is None:
330343
dtype = object
331344
if is_bool_dtype(dtype):
332-
if not self.isna().any():
345+
if not self._hasna:
333346
return self._data
334347
else:
335348
raise ValueError(
@@ -503,7 +516,7 @@ def astype(self, dtype, copy=True):
503516

504517
if is_bool_dtype(dtype):
505518
# astype_nansafe converts np.nan to True
506-
if self.isna().any():
519+
if self._hasna:
507520
raise ValueError("cannot convert float NaN to bool")
508521
else:
509522
return self._data.astype(dtype, copy=copy)
@@ -515,7 +528,7 @@ def astype(self, dtype, copy=True):
515528
)
516529
# for integer, error if there are missing values
517530
if is_integer_dtype(dtype):
518-
if self.isna().any():
531+
if self._hasna:
519532
raise ValueError("cannot convert NA to integer")
520533
# for float dtype, ensure we use np.nan before casting (numpy cannot
521534
# deal with pd.NA)

pandas/core/arrays/categorical.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
from pandas.core.base import NoNewAttributesMixin, PandasObject, _shared_docs
5050
import pandas.core.common as com
5151
from pandas.core.construction import array, extract_array, sanitize_array
52+
from pandas.core.indexers import check_bool_array_indexer
5253
from pandas.core.missing import interpolate_2d
5354
from pandas.core.ops.common import unpack_zerodim_and_defer
5455
from pandas.core.sorting import nargsort
@@ -1996,10 +1997,13 @@ def __getitem__(self, key):
19961997
return np.nan
19971998
else:
19981999
return self.categories[i]
1999-
else:
2000-
return self._constructor(
2001-
values=self._codes[key], dtype=self.dtype, fastpath=True
2002-
)
2000+
2001+
elif com.is_bool_indexer(key):
2002+
key = check_bool_array_indexer(self, key)
2003+
2004+
return self._constructor(
2005+
values=self._codes[key], dtype=self.dtype, fastpath=True
2006+
)
20032007

20042008
def __setitem__(self, key, value):
20052009
"""

pandas/core/arrays/datetimelike.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
from pandas.core import missing, nanops
4141
from pandas.core.algorithms import checked_add_with_arr, take, unique1d, value_counts
4242
import pandas.core.common as com
43+
from pandas.core.indexers import check_bool_array_indexer
4344
from pandas.core.ops.common import unpack_zerodim_and_defer
4445
from pandas.core.ops.invalid import make_invalid_op
4546

@@ -436,7 +437,7 @@ def __getitem__(self, key):
436437
return type(self)(val, dtype=self.dtype)
437438

438439
if com.is_bool_indexer(key):
439-
key = np.asarray(key, dtype=bool)
440+
key = check_bool_array_indexer(self, key)
440441
if key.all():
441442
key = slice(0, None, None)
442443
else:

pandas/core/arrays/integer.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@
2626
from pandas.core import nanops, ops
2727
from pandas.core.algorithms import take
2828
from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
29+
import pandas.core.common as com
30+
from pandas.core.indexers import check_bool_array_indexer
2931
from pandas.core.ops import invalid_comparison
3032
from pandas.core.ops.common import unpack_zerodim_and_defer
3133
from pandas.core.tools.numeric import to_numeric
@@ -368,6 +370,10 @@ def __getitem__(self, item):
368370
if self._mask[item]:
369371
return self.dtype.na_value
370372
return self._data[item]
373+
374+
elif com.is_bool_indexer(item):
375+
item = check_bool_array_indexer(self, item)
376+
371377
return type(self)(self._data[item], self._mask[item])
372378

373379
def _coerce_to_ndarray(self, dtype=None, na_value=lib._no_default):

0 commit comments

Comments
 (0)