From 1ca8ea05821156f5c610b2e7498e8f9168aed8bd Mon Sep 17 00:00:00 2001 From: tp Date: Tue, 26 Nov 2019 14:54:25 +0000 Subject: [PATCH 1/5] BUG: TypeError when loc-indexing on a CategoricalIndex with integer categories --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/indexes/category.py | 7 ++++-- pandas/tests/indexing/test_categorical.py | 26 +++++++++++++++++++++++ pandas/tests/indexing/test_floats.py | 7 +++++- 4 files changed, 38 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 3e72072eae303..67c906503251f 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -742,6 +742,7 @@ Indexing - Fix assignment of column via `.loc` with numpy non-ns datetime type (:issue:`27395`) - Bug in :meth:`Float64Index.astype` where ``np.inf`` was not handled properly when casting to an integer dtype (:issue:`28475`) - :meth:`Index.union` could fail when the left contained duplicates (:issue:`28257`) +- Bug when indexing with ``.loc`` where the index was a :class:`CategoricalIndex` with integer or float categories, a ValueError was raised (:issue:`17569`) - :meth:`Index.get_indexer_non_unique` could fail with `TypeError` in some cases, such as when searching for ints in a string index (:issue:`28257`) - Bug in :meth:`Float64Index.get_loc` incorrectly raising ``TypeError`` instead of ``KeyError`` (:issue:`29189`) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index d061f61effff3..320524c085938 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -696,8 +696,11 @@ def get_indexer_non_unique(self, target): @Appender(_index_shared_docs["_convert_scalar_indexer"]) def _convert_scalar_indexer(self, key, kind=None): - if self.categories._defer_to_indexing: - return self.categories._convert_scalar_indexer(key, kind=kind) + if kind == "loc" or self.categories._defer_to_indexing: + try: + return 
self.categories._convert_scalar_indexer(key, kind=kind) + except TypeError: + self._invalid_indexer("label", key=key) return super()._convert_scalar_indexer(key, kind=kind) diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index ab3b0ed13b5c0..818f8f3f58ebf 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -754,3 +754,29 @@ def test_map_with_dict_or_series(self): output = cur_index.map(mapper) # Order of categories in output can be different tm.assert_index_equal(expected, output) + + @pytest.mark.parametrize( + "idx_values", [[1, 2, 3], [-1, -2, -3], [1.5, 2.5, 3.5], [-1.5, -2.5, -3.5]] + ) + def test_loc_with_non_string_categories(self, idx_values, ordered_fixture): + # GH-17569 + cat_idx = CategoricalIndex(idx_values, ordered=ordered_fixture) + cat = DataFrame({"A": ["foo", "bar", "baz"]}, index=cat_idx) + # scalar + result = cat.loc[idx_values[0]] + expected = Series(["foo"], index=["A"], name=idx_values[0]) + tm.assert_series_equal(result, expected) + # list + result = cat.loc[idx_values[:2]] + expected = DataFrame(["foo", "bar"], index=cat_idx[:2], columns=["A"]) + tm.assert_frame_equal(result, expected) + # scalar assignment + result = cat.copy() + result.loc[idx_values[0]] = "qux" + expected = DataFrame({"A": ["qux", "bar", "baz"]}, index=cat_idx) + tm.assert_frame_equal(result, expected) + # list assignment + result = cat.copy() + result.loc[idx_values[:2], "A"] = ["qux", "qux2"] + expected = DataFrame({"A": ["qux", "qux2", "baz"]}, index=cat_idx) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/test_floats.py b/pandas/tests/indexing/test_floats.py index 0a3b513ff0167..cdacecc6c79d3 100644 --- a/pandas/tests/indexing/test_floats.py +++ b/pandas/tests/indexing/test_floats.py @@ -100,7 +100,12 @@ def test_scalar_non_numeric(self): idxr(s)[3.0] # label based can be a TypeError or KeyError - if s.index.inferred_type in ["string", 
"unicode", "mixed"]: + if s.index.inferred_type in { + "categorical", + "string", + "unicode", + "mixed", + }: error = KeyError msg = r"^3$" else: From a91e21b6f36156d4eb9d1b9df0b4c639ce694908 Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 30 Nov 2019 09:03:49 +0000 Subject: [PATCH 2/5] Add test for wrong category type --- pandas/core/indexes/base.py | 4 ++-- pandas/core/indexes/category.py | 3 +-- pandas/tests/indexing/test_categorical.py | 7 +++++++ 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 1587d97ffb52c..0f97bb327f525 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2953,11 +2953,11 @@ def _convert_scalar_indexer(self, key, kind=None): "unicode", "mixed", ]: - return self._invalid_indexer("label", key) + self._invalid_indexer("label", key) elif kind in ["loc"] and is_integer(key): if not self.holds_integer(): - return self._invalid_indexer("label", key) + self._invalid_indexer("label", key) return key diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 320524c085938..b90e5a89418cd 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -700,8 +700,7 @@ def _convert_scalar_indexer(self, key, kind=None): try: return self.categories._convert_scalar_indexer(key, kind=kind) except TypeError: - self._invalid_indexer("label", key=key) - + self._invalid_indexer("label", key) return super()._convert_scalar_indexer(key, kind=kind) @Appender(_index_shared_docs["_convert_list_indexer"]) diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 818f8f3f58ebf..c9f1a30c2b9fd 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -80,6 +80,13 @@ def test_loc_scalar(self): with pytest.raises(TypeError, match=msg): df.loc["d", "C"] = 10 + msg = ( + r"cannot do label indexing on <class 'pandas.core.indexes.category" + r".CategoricalIndex'> with these indexers \[1\] of <class 'int'>" 
) + with pytest.raises(TypeError, match=msg): + df.loc[1] + def test_getitem_scalar(self): cats = Categorical([Timestamp("12-31-1999"), Timestamp("12-31-2000")]) From c52d7e0137fde0579035b4a476a02f4d0180d044 Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 7 Dec 2019 22:41:45 +0000 Subject: [PATCH 3/5] remove defer_to_indexing + add tests --- pandas/core/indexes/category.py | 2 +- pandas/tests/indexing/test_categorical.py | 64 ++++++++++++++++++++--- 2 files changed, 57 insertions(+), 9 deletions(-) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index b90e5a89418cd..2cc853ecf568b 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -696,7 +696,7 @@ def get_indexer_non_unique(self, target): @Appender(_index_shared_docs["_convert_scalar_indexer"]) def _convert_scalar_indexer(self, key, kind=None): - if kind == "loc" or self.categories._defer_to_indexing: + if kind == "loc": try: return self.categories._convert_scalar_indexer(key, kind=kind) except TypeError: diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index c9f1a30c2b9fd..7c5f6dc400216 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -12,6 +12,7 @@ Index, Interval, Series, + Timedelta, Timestamp, ) from pandas.api.types import CategoricalDtype as CDT @@ -763,27 +764,74 @@ def test_map_with_dict_or_series(self): tm.assert_index_equal(expected, output) @pytest.mark.parametrize( - "idx_values", [[1, 2, 3], [-1, -2, -3], [1.5, 2.5, 3.5], [-1.5, -2.5, -3.5]] + "idx_values", + [ + # python types + [1, 2, 3], + [-1, -2, -3], + [1.5, 2.5, 3.5], + [-1.5, -2.5, -3.5], + # numpy int/uint + *[ + np.array([1, 2, 3], dtype=dtype) + for dtype in [ + np.int8, + np.int16, + np.int32, + np.int64, + np.uint8, + np.uint16, + np.uint32, + np.uint64, + ] + ], + # numpy floats + *[ + np.array([1.5, 2.5, 3.5], dtype=dtype) + for dtype in (np.float16, np.float32, 
np.float64) + ], + # pandas scalars + [Interval(1, 4), Interval(4, 6), Interval(6, 9)], + [Timestamp(2019, 1, 1), Timestamp(2019, 2, 1), Timestamp(2019, 3, 1)], + [Timedelta(1, "d"), Timedelta(2, "d"), Timedelta(3, "D")], + # pandas Integer arrays + *[ + pd.array([1, 2, 3], dtype=dtype) + for dtype in [ + "Int8", + "Int16", + "Int32", + "Int64", + "UInt8", + "UInt32", + "UInt64", + ] + ], + # other pandas arrays + pd.IntervalIndex.from_breaks([1, 4, 6, 9]).array, + pd.date_range("2019-01-01", periods=3).array, + pd.timedelta_range(start="1d", periods=3).array, + ], ) def test_loc_with_non_string_categories(self, idx_values, ordered_fixture): # GH-17569 cat_idx = CategoricalIndex(idx_values, ordered=ordered_fixture) - cat = DataFrame({"A": ["foo", "bar", "baz"]}, index=cat_idx) - # scalar - result = cat.loc[idx_values[0]] + df = DataFrame({"A": ["foo", "bar", "baz"]}, index=cat_idx) + # scalar selection + result = df.loc[idx_values[0]] expected = Series(["foo"], index=["A"], name=idx_values[0]) tm.assert_series_equal(result, expected) - # list - result = cat.loc[idx_values[:2]] + # list selection + result = df.loc[idx_values[:2]] expected = DataFrame(["foo", "bar"], index=cat_idx[:2], columns=["A"]) tm.assert_frame_equal(result, expected) # scalar assignment - result = cat.copy() + result = df.copy() result.loc[idx_values[0]] = "qux" expected = DataFrame({"A": ["qux", "bar", "baz"]}, index=cat_idx) tm.assert_frame_equal(result, expected) # list assignment - result = cat.copy() + result = df.copy() result.loc[idx_values[:2], "A"] = ["qux", "qux2"] expected = DataFrame({"A": ["qux", "qux2", "baz"]}, index=cat_idx) tm.assert_frame_equal(result, expected) From 7316414bc410db6aab8fe65d9f3de56ba16925c6 Mon Sep 17 00:00:00 2001 From: tp Date: Wed, 11 Dec 2019 05:24:34 +0000 Subject: [PATCH 4/5] use CONFTEST dtypes in parametrization --- doc/source/whatsnew/v1.0.0.rst | 2 +- pandas/tests/indexing/test_categorical.py | 39 ++++++----------------- 2 files changed, 11 
insertions(+), 30 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 67c906503251f..afecc7fa4e41b 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -742,7 +742,7 @@ Indexing - Fix assignment of column via `.loc` with numpy non-ns datetime type (:issue:`27395`) - Bug in :meth:`Float64Index.astype` where ``np.inf`` was not handled properly when casting to an integer dtype (:issue:`28475`) - :meth:`Index.union` could fail when the left contained duplicates (:issue:`28257`) -- Bug when indexing with ``.loc`` where the index was a :class:`CategoricalIndex` with integer or float categories, a ValueError was raised (:issue:`17569`) +- Bug when indexing with ``.loc`` where the index was a :class:`CategoricalIndex` with integer and float categories, a ValueError was raised (:issue:`17569`) - :meth:`Index.get_indexer_non_unique` could fail with `TypeError` in some cases, such as when searching for ints in a string index (:issue:`28257`) - Bug in :meth:`Float64Index.get_loc` incorrectly raising ``TypeError`` instead of ``KeyError`` (:issue:`29189`) diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 7c5f6dc400216..59a07ee28cc34 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -5,6 +5,7 @@ from pandas.core.dtypes.dtypes import CategoricalDtype import pandas as pd +from pandas import conftest from pandas import ( Categorical, CategoricalIndex, @@ -772,41 +773,17 @@ def test_map_with_dict_or_series(self): [1.5, 2.5, 3.5], [-1.5, -2.5, -3.5], # numpy int/uint - *[ - np.array([1, 2, 3], dtype=dtype) - for dtype in [ - np.int8, - np.int16, - np.int32, - np.int64, - np.uint8, - np.uint16, - np.uint32, - np.uint64, - ] - ], + *[np.array([1, 2, 3], dtype=dtype) for dtype in conftest.ALL_INT_DTYPES], # numpy floats - *[ - np.array([1.5, 2.5, 3.5], dtype=dtype) - for dtype in (np.float16, np.float32, 
np.float64) - ], + *[np.array([1.5, 2.5, 3.5], dtype=dtyp) for dtyp in conftest.FLOAT_DTYPES], + # numpy object + np.array([1, "b", 3.5], dtype=object), # pandas scalars [Interval(1, 4), Interval(4, 6), Interval(6, 9)], [Timestamp(2019, 1, 1), Timestamp(2019, 2, 1), Timestamp(2019, 3, 1)], [Timedelta(1, "d"), Timedelta(2, "d"), Timedelta(3, "D")], # pandas Integer arrays - *[ - pd.array([1, 2, 3], dtype=dtype) - for dtype in [ - "Int8", - "Int16", - "Int32", - "Int64", - "UInt8", - "UInt32", - "UInt64", - ] - ], + *[pd.array([1, 2, 3], dtype=dtype) for dtype in conftest.ALL_EA_INT_DTYPES], # other pandas arrays pd.IntervalIndex.from_breaks([1, 4, 6, 9]).array, pd.date_range("2019-01-01", periods=3).array, @@ -817,19 +794,23 @@ def test_loc_with_non_string_categories(self, idx_values, ordered_fixture): # GH-17569 cat_idx = CategoricalIndex(idx_values, ordered=ordered_fixture) df = DataFrame({"A": ["foo", "bar", "baz"]}, index=cat_idx) + # scalar selection result = df.loc[idx_values[0]] expected = Series(["foo"], index=["A"], name=idx_values[0]) tm.assert_series_equal(result, expected) + # list selection result = df.loc[idx_values[:2]] expected = DataFrame(["foo", "bar"], index=cat_idx[:2], columns=["A"]) tm.assert_frame_equal(result, expected) + # scalar assignment result = df.copy() result.loc[idx_values[0]] = "qux" expected = DataFrame({"A": ["qux", "bar", "baz"]}, index=cat_idx) tm.assert_frame_equal(result, expected) + # list assignment result = df.copy() result.loc[idx_values[:2], "A"] = ["qux", "qux2"] From a51edc298a3b886fd939bdf64488d9de0d5daef5 Mon Sep 17 00:00:00 2001 From: tp Date: Wed, 11 Dec 2019 07:12:56 +0000 Subject: [PATCH 5/5] isort --- pandas/tests/indexing/test_categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 59a07ee28cc34..bc3ee1c59f76c 100644 --- a/pandas/tests/indexing/test_categorical.py +++ 
b/pandas/tests/indexing/test_categorical.py @@ -5,7 +5,6 @@ from pandas.core.dtypes.dtypes import CategoricalDtype import pandas as pd -from pandas import conftest from pandas import ( Categorical, CategoricalIndex, @@ -15,6 +14,7 @@ Series, Timedelta, Timestamp, + conftest, ) from pandas.api.types import CategoricalDtype as CDT import pandas.util.testing as tm