From a2593f0e0e81612f62a6fce473e9b5c4d7151345 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Sun, 30 Dec 2018 00:15:50 -0800 Subject: [PATCH 01/10] BUG-19214 int categoricals are formatted as ints --- doc/source/whatsnew/v0.24.0.rst | 1 + pandas/io/formats/format.py | 5 +++++ pandas/tests/arrays/categorical/test_repr.py | 4 ++++ 3 files changed, 10 insertions(+) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index a84fd118061bc..c3813e22dc765 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1321,6 +1321,7 @@ Categorical - Bug in many methods of the ``.str``-accessor, which always failed on calling the ``CategoricalIndex.str`` constructor (:issue:`23555`, :issue:`23556`) - Bug in :meth:`Series.where` losing the categorical dtype for categorical data (:issue:`24077`) - Bug in :meth:`Categorical.apply` where ``NaN`` values could be handled unpredictably. They now remain unchanged (:issue:`24241`) +- Bug where an integer ``Categorical`` would be formatted as if it had floats if ``NaN`` values were present (:issue:`19214`) Datetimelike ^^^^^^^^^^^^ diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 9dc2692f276e3..005b59abf608e 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1176,6 +1176,11 @@ def _format_strings(self): na_rep=self.na_rep, digits=self.digits, space=self.space, justify=self.justify, leading_space=self.leading_space) + + if (is_categorical_dtype(values.dtype) and + is_integer_dtype(values.dtype.categories)): + # integers were coerced to float for array with NaN (GH 19214) + return [value.replace(".0", "") for value in fmt_values] return fmt_values diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index 227edf60951e6..8f000157864f9 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -240,6 +240,10 @@ def test_categorical_repr_datetime_ordered(self): assert repr(c) == exp + def test_categorical_repr_int_with_nan(self): + s = Series([1, 2, np.nan], dtype="object").astype("category") + assert ".0" not in repr(s) + def test_categorical_repr_period(self): idx = period_range('2011-01-01 09:00', freq='H', periods=5) c = Categorical(idx) From 36e16e187db7747a5bf27453d69878da15d86c6e Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Sun, 30 Dec 2018 13:18:54 -0800 Subject: [PATCH 02/10] BUG-19214 alter categorical as well as categorical series --- pandas/io/formats/format.py | 7 +++++-- pandas/tests/arrays/categorical/test_repr.py | 7 ++++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 005b59abf608e..b340b4f289404 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -134,8 +134,11 @@ def _get_footer(self): return compat.text_type(footer) def _get_formatted_values(self): - return format_array(self.categorical.get_values(), None, - float_format=None, na_rep=self.na_rep) + results = format_array(self.categorical.get_values(), None, + float_format=None, na_rep=self.na_rep) + if is_integer_dtype(self.categorical.dtype.categories): + return [result.replace(".0", "") for result in results] + return results def to_string(self): categorical = self.categorical diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index 8f000157864f9..bb71d6960c963 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -241,8 +241,13 @@ def test_categorical_repr_datetime_ordered(self): assert repr(c) == exp def test_categorical_repr_int_with_nan(self): + c = Categorical([1, 2, np.nan]) + c_exp = """[1, 2, NaN]\nCategories (2, int64): [1, 2]""" + assert repr(c) == c_exp + s = Series([1, 2, np.nan], dtype="object").astype("category") - assert ".0" not in repr(s) + s_exp = """0 1\n1 2\n2 NaN\ndtype: category\nCategories (2, int64): [1, 2]""" # noqa + assert repr(s) == s_exp def test_categorical_repr_period(self): idx = period_range('2011-01-01 09:00', freq='H', periods=5) From ee22299c75caf9b50cf80f303dc084951668245c Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Sun, 30 Dec 2018 14:21:15 -0800 Subject: [PATCH 03/10] BUG-19214 change get_values instead --- doc/source/whatsnew/v0.24.0.rst | 2 +- pandas/core/arrays/categorical.py | 3 +++ pandas/io/formats/format.py | 12 ++---------- pandas/tests/reshape/test_concat.py | 18 ++++++++++-------- 4 files changed, 16 insertions(+), 19 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index c3813e22dc765..3106f539dd626 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1321,7 +1321,7 @@ Categorical - Bug in many methods of the ``.str``-accessor, which always failed on calling the ``CategoricalIndex.str`` constructor (:issue:`23555`, :issue:`23556`) - Bug in :meth:`Series.where` losing the categorical dtype for categorical data (:issue:`24077`) - Bug in :meth:`Categorical.apply` where ``NaN`` values could be handled unpredictably. They now remain unchanged (:issue:`24241`) -- Bug where an integer ``Categorical`` would be formatted as if it had floats if ``NaN`` values were present (:issue:`19214`) +- Bug in :meth:`Categorical.get_values` where integers would be coerced to floats with ``NaN`` values were present (:issue:`19214`) Datetimelike ^^^^^^^^^^^^ diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index a47406cded7b4..6a6eca1d9e24c 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1520,6 +1520,9 @@ def get_values(self): # if we are a datetime and period index, return Index to keep metadata if is_datetimelike(self.categories): return self.categories.take(self._codes, fill_value=np.nan) + elif -1 in self._codes and is_integer_dtype(self.categories): + return self.categories.astype("object").take(self._codes, + fill_value=np.nan) return np.array(self) def check_for_ordered(self, op): diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index b340b4f289404..9dc2692f276e3 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -134,11 +134,8 @@ def _get_footer(self): return compat.text_type(footer) def _get_formatted_values(self): - results = format_array(self.categorical.get_values(), None, - float_format=None, na_rep=self.na_rep) - if is_integer_dtype(self.categorical.dtype.categories): - return [result.replace(".0", "") for result in results] - return results + return format_array(self.categorical.get_values(), None, + float_format=None, na_rep=self.na_rep) def to_string(self): categorical = self.categorical @@ -1179,11 +1176,6 @@ def _format_strings(self): na_rep=self.na_rep, digits=self.digits, space=self.space, justify=self.justify, leading_space=self.leading_space) - - if (is_categorical_dtype(values.dtype) and - is_integer_dtype(values.dtype.categories)): - # integers were coerced to float for array with NaN (GH 19214) - return [value.replace(".0", "") for value in fmt_values] return fmt_values diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 0706cb12ac5d0..481f9f0a56812 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -496,7 +496,7 @@ def test_concat_categorical(self): s1 = pd.Series([10, 11, np.nan], dtype='category') s2 = pd.Series([np.nan, 1, 3, 2], dtype='category') - exp = pd.Series([10, 11, np.nan, np.nan, 1, 3, 2]) + exp = pd.Series([10, 11, np.nan, np.nan, 1, 3, 2], dtype='object') tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) @@ -516,12 +516,12 @@ def test_concat_categorical_coercion(self): s1 = pd.Series([1, 2, np.nan], dtype='category') s2 = pd.Series([2, 1, 2]) - exp = pd.Series([1, 2, np.nan, 2, 1, 2]) + exp = pd.Series([1, 2, np.nan, 2, 1, 2], dtype='object') tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) # result shouldn't be affected by 1st elem dtype - exp = pd.Series([2, 1, 2, 1, 2, np.nan]) + exp = pd.Series([2, 1, 2, 1, 2, np.nan], dtype='object') tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) @@ -541,11 +541,11 @@ def test_concat_categorical_coercion(self): s1 = pd.Series([10, 11, np.nan], dtype='category') s2 = pd.Series([1, 3, 2]) - exp = pd.Series([10, 11, np.nan, 1, 3, 2]) + exp = pd.Series([10, 11, np.nan, 1, 3, 2], dtype='object') tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - exp = pd.Series([1, 3, 2, 10, 11, np.nan]) + exp = pd.Series([1, 3, 2, 10, 11, np.nan], dtype='object') tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) @@ -581,11 +581,13 @@ def test_concat_categorical_3elem_coercion(self): s2 = pd.Series([2, 1, 2], dtype='category') s3 = pd.Series([1, 2, 1, 2, np.nan]) - exp = pd.Series([1, 2, np.nan, 2, 1, 2, 1, 2, 1, 2, np.nan]) + exp = pd.Series([1, 2, np.nan, 2, 1, 2, 1, 2, 1, 2, np.nan], + dtype='object') tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp) tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp) - exp = pd.Series([1, 2, 1, 2, np.nan, 1, 2, np.nan, 2, 1, 2]) + exp = pd.Series([1, 2, 1, 2, np.nan, 1, 2, np.nan, 2, 1, 2], + dtype='object') tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp) tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp) @@ -669,7 +671,7 @@ def test_concat_categorical_coercion_nan(self): s1 = pd.Series([1, np.nan], dtype='category') s2 = pd.Series([np.nan, np.nan]) - exp = pd.Series([1, np.nan, np.nan, np.nan]) + exp = pd.Series([1, np.nan, np.nan, np.nan], dtype='object') tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) From e6ef56ec90d3c80a12061f674b40ffa6a858a127 Mon Sep 17 00:00:00 2001 From: Justin Zheng Date: Sun, 30 Dec 2018 14:22:54 -0800 Subject: [PATCH 04/10] fix typo in whatsnew --- doc/source/whatsnew/v0.24.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 3106f539dd626..857b8bb448845 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1321,7 +1321,7 @@ Categorical - Bug in many methods of the ``.str``-accessor, which always failed on calling the ``CategoricalIndex.str`` constructor (:issue:`23555`, :issue:`23556`) - Bug in :meth:`Series.where` losing the categorical dtype for categorical data (:issue:`24077`) - Bug in :meth:`Categorical.apply` where ``NaN`` values could be handled unpredictably. They now remain unchanged (:issue:`24241`) -- Bug in :meth:`Categorical.get_values` where integers would be coerced to floats with ``NaN`` values were present (:issue:`19214`) +- Bug in :meth:`Categorical.get_values` where integers would be coerced to floats if ``NaN`` values were present (:issue:`19214`) Datetimelike ^^^^^^^^^^^^ From 97472b3b9ad7e04f7b0680a52494ee0dae4436bd Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Sun, 30 Dec 2018 20:04:41 -0800 Subject: [PATCH 05/10] BUG-19214 add warning and change check order --- doc/source/whatsnew/v0.24.0.rst | 3 ++- pandas/core/arrays/categorical.py | 4 +++- pandas/tests/arrays/categorical/test_repr.py | 3 +++ pandas/tests/reshape/test_concat.py | 5 +++++ 4 files changed, 13 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 3106f539dd626..6080aea5c1389 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1321,7 +1321,7 @@ Categorical - Bug in many methods of the ``.str``-accessor, which always failed on calling the ``CategoricalIndex.str`` constructor (:issue:`23555`, :issue:`23556`) - Bug in :meth:`Series.where` losing the categorical dtype for categorical data (:issue:`24077`) - Bug in :meth:`Categorical.apply` where ``NaN`` values could be handled unpredictably. They now remain unchanged (:issue:`24241`) -- Bug in :meth:`Categorical.get_values` where integers would be coerced to floats with ``NaN`` values were present (:issue:`19214`) +- Bug in :meth:`Categorical.get_values` where integers would be formatted as floats if ``NaN`` values were present (:issue:`19214`) Datetimelike ^^^^^^^^^^^^ @@ -1647,6 +1647,7 @@ Reshaping - :meth:`DataFrame.nlargest` and :meth:`DataFrame.nsmallest` now returns the correct n values when keep != 'all' also when tied on the first columns (:issue:`22752`) - Constructing a DataFrame with an index argument that wasn't already an instance of :class:`~pandas.core.Index` was broken (:issue:`22227`). - Bug in :class:`DataFrame` prevented list subclasses to be used to construction (:issue:`21226`) +- Calling :func:`pandas.concat` on a ``Categorical`` of ints with NA values causes them to be processed as objects (formerly coerced to floats) (:issue:`19214`) .. _whatsnew_0240.bug_fixes.sparse: diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 6a6eca1d9e24c..9e85c8e6c85d6 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1520,7 +1520,9 @@ def get_values(self): # if we are a datetime and period index, return Index to keep metadata if is_datetimelike(self.categories): return self.categories.take(self._codes, fill_value=np.nan) - elif -1 in self._codes and is_integer_dtype(self.categories): + elif is_integer_dtype(self.categories) and -1 in self._codes: + warn("Integer values represented as objects to accomodate NaNs", + RuntimeWarning) return self.categories.astype("object").take(self._codes, fill_value=np.nan) return np.array(self) diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index bb71d6960c963..b952715bb1a5a 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -10,6 +10,8 @@ from pandas.core.config import option_context from pandas.tests.arrays.categorical.common import TestCategorical +import pytest + class TestCategoricalReprWithFactor(TestCategorical): @@ -240,6 +242,7 @@ def test_categorical_repr_datetime_ordered(self): assert repr(c) == exp + @pytest.mark.filterwarnings("ignore:Integer values:RuntimeWarning") def test_categorical_repr_int_with_nan(self): c = Categorical([1, 2, np.nan]) c_exp = """[1, 2, NaN]\nCategories (2, int64): [1, 2]""" diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 481f9f0a56812..1e778552ebcd2 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -473,6 +473,7 @@ def test_concatlike_common_period_mixed_dt_to_object(self): res = pd.concat([tds, ps1]) tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + @pytest.mark.filterwarnings("ignore:Integer values:RuntimeWarning") def test_concat_categorical(self): # GH 13524 @@ -509,6 +510,7 @@ def test_union_categorical_same_categories_different_order(self): categories=['a', 'b', 'c'])) tm.assert_series_equal(result, expected) + @pytest.mark.filterwarnings("ignore:Integer values:RuntimeWarning") def test_concat_categorical_coercion(self): # GH 13524 @@ -573,6 +575,7 @@ def test_concat_categorical_coercion(self): tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) + @pytest.mark.filterwarnings("ignore:Integer values:RuntimeWarning") def test_concat_categorical_3elem_coercion(self): # GH 13524 @@ -640,6 +643,7 @@ def test_concat_categorical_multi_coercion(self): res = s6.append([s5, s4, s3, s2, s1], ignore_index=True) tm.assert_series_equal(res, exp) + @pytest.mark.filterwarnings("ignore:Integer values:RuntimeWarning") def test_concat_categorical_ordered(self): # GH 13524 @@ -655,6 +659,7 @@ def test_concat_categorical_ordered(self): tm.assert_series_equal(pd.concat([s1, s2, s1], ignore_index=True), exp) tm.assert_series_equal(s1.append([s2, s1], ignore_index=True), exp) + @pytest.mark.filterwarnings("ignore:Integer values:RuntimeWarning") def test_concat_categorical_coercion_nan(self): # GH 13524 From ee5ab83afc6f533007b47ff116ebfe24d4b00570 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Mon, 31 Dec 2018 12:47:48 -0800 Subject: [PATCH 06/10] BUG-19214 fix tests --- pandas/tests/arrays/categorical/test_missing.py | 1 + pandas/tests/arrays/categorical/test_repr.py | 3 +-- pandas/tests/frame/test_missing.py | 1 + pandas/tests/reductions/test_reductions.py | 1 + pandas/tests/series/indexing/test_indexing.py | 1 + pandas/tests/series/test_analytics.py | 1 + 6 files changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_missing.py b/pandas/tests/arrays/categorical/test_missing.py index b4b361dabac61..29c17f867c285 100644 --- a/pandas/tests/arrays/categorical/test_missing.py +++ b/pandas/tests/arrays/categorical/test_missing.py @@ -14,6 +14,7 @@ class TestCategoricalMissing(object): + @pytest.mark.filterwarnings("ignore:Integer values:RuntimeWarning") def test_na_flags_int_categories(self): # #1457 diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index b952715bb1a5a..f74ef2c09efd6 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import numpy as np +import pytest from pandas.compat import PY3, u @@ -10,8 +11,6 @@ from pandas.core.config import option_context from pandas.tests.arrays.categorical.common import TestCategorical -import pytest - class TestCategoricalReprWithFactor(TestCategorical): diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index 200e134838949..49087296b6621 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -354,6 +354,7 @@ def test_na_actions_categorical(self): res = df.fillna("a") tm.assert_frame_equal(res, df_exp) + @pytest.mark.filterwarnings("ignore:Integer values:RuntimeWarning") def test_fillna_categorical_nan(self): # GH 14021 # np.nan should always be a valid filler diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index d27308029fa19..3ba311cb45fb2 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -806,6 +806,7 @@ def test_mode_timedelta(self, dropna, expected1, expected2): expected2 = Series(expected2, dtype='timedelta64[ns]') tm.assert_series_equal(result, expected2) + @pytest.mark.filterwarnings("ignore:Integer values:RuntimeWarning") @pytest.mark.parametrize('dropna, expected1, expected2, expected3', [ (True, Categorical([1, 2], categories=[1, 2]), Categorical(['a'], categories=[1, 'a']), diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index 92c41f65eb831..1c8f26039ba17 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -496,6 +496,7 @@ def test_setitem_with_tz_dst(): tm.assert_series_equal(s, exp) +@pytest.mark.filterwarnings("ignore:Integer values:RuntimeWarning") def test_categorial_assigning_ops(): orig = Series(Categorical(["b", "b"], categories=["a", "b"])) s = orig.copy() diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index b5140a5319c01..00dc594bac178 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1314,6 +1314,7 @@ def test_duplicate_keep_all_ties(self): class TestCategoricalSeriesAnalytics(object): + @pytest.mark.filterwarnings("ignore:Integer values:RuntimeWarning") def test_count(self): s = Series(Categorical([np.nan, 1, 2, np.nan], From 277140177b95d2197085c7c0b86fbd69cfcac3de Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Mon, 31 Dec 2018 21:35:58 -0800 Subject: [PATCH 07/10] BUG-19214 add specific section in whatsnew --- doc/source/whatsnew/v0.24.0.rst | 31 ++++++++++++++++++- pandas/core/arrays/categorical.py | 2 -- .../tests/arrays/categorical/test_missing.py | 1 - pandas/tests/arrays/categorical/test_repr.py | 2 -- pandas/tests/frame/test_missing.py | 1 - pandas/tests/reductions/test_reductions.py | 1 - pandas/tests/reshape/test_concat.py | 5 --- pandas/tests/series/indexing/test_indexing.py | 1 - pandas/tests/series/test_analytics.py | 1 - 9 files changed, 30 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 07a38640f6d52..6e3f5826b8ca9 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1113,6 +1113,36 @@ cast from integer dtype to floating dtype (:issue:`22019`) ...: 'c': [1, 1, np.nan, 1, 1]}) In [4]: pd.crosstab(df.a, df.b, normalize='columns') +Formatting Categorical Integer Data With ``NaN`` Values +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Categorical integer data with ``NaN`` values will be formatted as integers +instead of floats. :meth:`Series.to_numpy` is not affected (:issue:`19214`) + +*Previous Behavior* + +.. code-block:: ipython + + In [3]: pd.Series([1, 2, np.nan], dtype='object').astype('category') + Out[3]: + 0 1.0 + 1 2.0 + 2 NaN + dtype: category + Categories (2, int64): [1, 2] + + In [4]: pd.Categorical([1, 2, np.nan]) + Out[4]: + [1.0, 2.0, NaN] + Categories (2, int64): [1, 2] + +*New Behavior* + +.. ipython:: python + + pd.Series([1, 2, np.nan], dtype='object').astype('category') + pd.Categorical([1, 2, np.nan]) + Datetimelike API Changes ^^^^^^^^^^^^^^^^^^^^^^^^ @@ -1323,7 +1353,6 @@ Categorical - Bug in many methods of the ``.str``-accessor, which always failed on calling the ``CategoricalIndex.str`` constructor (:issue:`23555`, :issue:`23556`) - Bug in :meth:`Series.where` losing the categorical dtype for categorical data (:issue:`24077`) - Bug in :meth:`Categorical.apply` where ``NaN`` values could be handled unpredictably. They now remain unchanged (:issue:`24241`) -- Bug in :meth:`Categorical.get_values` where integers would be formatted as floats if ``NaN`` values were present (:issue:`19214`) Datetimelike ^^^^^^^^^^^^ diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 9e85c8e6c85d6..47fe2aa0b93fc 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1521,8 +1521,6 @@ def get_values(self): if is_datetimelike(self.categories): return self.categories.take(self._codes, fill_value=np.nan) elif is_integer_dtype(self.categories) and -1 in self._codes: - warn("Integer values represented as objects to accomodate NaNs", - RuntimeWarning) return self.categories.astype("object").take(self._codes, fill_value=np.nan) return np.array(self) diff --git a/pandas/tests/arrays/categorical/test_missing.py b/pandas/tests/arrays/categorical/test_missing.py index 29c17f867c285..b4b361dabac61 100644 --- a/pandas/tests/arrays/categorical/test_missing.py +++ b/pandas/tests/arrays/categorical/test_missing.py @@ -14,7 +14,6 @@ class TestCategoricalMissing(object): - @pytest.mark.filterwarnings("ignore:Integer values:RuntimeWarning") def test_na_flags_int_categories(self): # #1457 diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index f74ef2c09efd6..bb71d6960c963 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- import numpy as np -import pytest from pandas.compat import PY3, u @@ -241,7 +240,6 @@ def test_categorical_repr_datetime_ordered(self): assert repr(c) == exp - @pytest.mark.filterwarnings("ignore:Integer values:RuntimeWarning") def test_categorical_repr_int_with_nan(self): c = Categorical([1, 2, np.nan]) c_exp = """[1, 2, NaN]\nCategories (2, int64): [1, 2]""" diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index 49087296b6621..200e134838949 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -354,7 +354,6 @@ def test_na_actions_categorical(self): res = df.fillna("a") tm.assert_frame_equal(res, df_exp) - @pytest.mark.filterwarnings("ignore:Integer values:RuntimeWarning") def test_fillna_categorical_nan(self): # GH 14021 # np.nan should always be a valid filler diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 3ba311cb45fb2..d27308029fa19 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -806,7 +806,6 @@ def test_mode_timedelta(self, dropna, expected1, expected2): expected2 = Series(expected2, dtype='timedelta64[ns]') tm.assert_series_equal(result, expected2) - @pytest.mark.filterwarnings("ignore:Integer values:RuntimeWarning") @pytest.mark.parametrize('dropna, expected1, expected2, expected3', [ (True, Categorical([1, 2], categories=[1, 2]), Categorical(['a'], categories=[1, 'a']), diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 1e778552ebcd2..481f9f0a56812 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -473,7 +473,6 @@ def test_concatlike_common_period_mixed_dt_to_object(self): res = pd.concat([tds, ps1]) tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) - @pytest.mark.filterwarnings("ignore:Integer values:RuntimeWarning") def test_concat_categorical(self): # GH 13524 @@ -510,7 +509,6 @@ def test_union_categorical_same_categories_different_order(self): categories=['a', 'b', 'c'])) tm.assert_series_equal(result, expected) - @pytest.mark.filterwarnings("ignore:Integer values:RuntimeWarning") def test_concat_categorical_coercion(self): # GH 13524 @@ -575,7 +573,6 @@ def test_concat_categorical_coercion(self): tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) - @pytest.mark.filterwarnings("ignore:Integer values:RuntimeWarning") def test_concat_categorical_3elem_coercion(self): # GH 13524 @@ -643,7 +640,6 @@ def test_concat_categorical_multi_coercion(self): res = s6.append([s5, s4, s3, s2, s1], ignore_index=True) tm.assert_series_equal(res, exp) - @pytest.mark.filterwarnings("ignore:Integer values:RuntimeWarning") def test_concat_categorical_ordered(self): # GH 13524 @@ -659,7 +655,6 @@ def test_concat_categorical_ordered(self): tm.assert_series_equal(pd.concat([s1, s2, s1], ignore_index=True), exp) tm.assert_series_equal(s1.append([s2, s1], ignore_index=True), exp) - @pytest.mark.filterwarnings("ignore:Integer values:RuntimeWarning") def test_concat_categorical_coercion_nan(self): # GH 13524 diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index 1c8f26039ba17..92c41f65eb831 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -496,7 +496,6 @@ def test_setitem_with_tz_dst(): tm.assert_series_equal(s, exp) -@pytest.mark.filterwarnings("ignore:Integer values:RuntimeWarning") def test_categorial_assigning_ops(): orig = Series(Categorical(["b", "b"], categories=["a", "b"])) s = orig.copy() diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 00dc594bac178..b5140a5319c01 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1314,7 +1314,6 @@ def test_duplicate_keep_all_ties(self): class TestCategoricalSeriesAnalytics(object): - @pytest.mark.filterwarnings("ignore:Integer values:RuntimeWarning") def test_count(self): s = Series(Categorical([np.nan, 1, 2, np.nan], From 2bb35dcc6389766acf5a35741d120054f5d793f7 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Tue, 1 Jan 2019 16:26:33 -0800 Subject: [PATCH 08/10] BUG-19214 make requested changes --- doc/source/whatsnew/v0.24.0.rst | 34 ++------------------ pandas/tests/arrays/categorical/test_repr.py | 4 ++- 2 files changed, 6 insertions(+), 32 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 6e3f5826b8ca9..6dd31b175e248 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1113,36 +1113,6 @@ cast from integer dtype to floating dtype (:issue:`22019`) ...: 'c': [1, 1, np.nan, 1, 1]}) In [4]: pd.crosstab(df.a, df.b, normalize='columns') -Formatting Categorical Integer Data With ``NaN`` Values -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Categorical integer data with ``NaN`` values will be formatted as integers -instead of floats. :meth:`Series.to_numpy` is not affected (:issue:`19214`) - -*Previous Behavior* - -.. code-block:: ipython - - In [3]: pd.Series([1, 2, np.nan], dtype='object').astype('category') - Out[3]: - 0 1.0 - 1 2.0 - 2 NaN - dtype: category - Categories (2, int64): [1, 2] - - In [4]: pd.Categorical([1, 2, np.nan]) - Out[4]: - [1.0, 2.0, NaN] - Categories (2, int64): [1, 2] - -*New Behavior* - -.. ipython:: python - - pd.Series([1, 2, np.nan], dtype='object').astype('category') - pd.Categorical([1, 2, np.nan]) - Datetimelike API Changes ^^^^^^^^^^^^^^^^^^^^^^^^ @@ -1180,6 +1150,7 @@ Other API Changes - :meth:`CategoricalIndex.reindex` now raises a ``ValueError`` if the target index is non-unique and not equal to the current index. It previously only raised if the target index was not of a categorical dtype (:issue:`23963`). - :func:`Series.to_list` and :func:`Index.to_list` are now aliases of ``Series.tolist`` respectively ``Index.tolist`` (:issue:`8826`) - The result of ``SparseSeries.unstack`` is now a :class:`DataFrame` with sparse values, rather than a :class:`SparseDataFrame` (issue:`24372`). +- Calling :func:`pandas.concat` on a ``Categorical`` of ints with NA values now causes them to be processed as objects in cases where previously they would be coerced to floats (:issue:`19214`) .. _whatsnew_0240.deprecations: @@ -1549,6 +1520,8 @@ MultiIndex I/O ^^^ +- Bug where integer categorical data would be formatted as floats if ``NaN`` values were present (:issue:`19214`) + .. _whatsnew_0240.bug_fixes.nan_with_str_dtype: @@ -1683,7 +1656,6 @@ Reshaping - :meth:`DataFrame.nlargest` and :meth:`DataFrame.nsmallest` now returns the correct n values when keep != 'all' also when tied on the first columns (:issue:`22752`) - Constructing a DataFrame with an index argument that wasn't already an instance of :class:`~pandas.core.Index` was broken (:issue:`22227`). - Bug in :class:`DataFrame` prevented list subclasses to be used to construction (:issue:`21226`) -- Calling :func:`pandas.concat` on a ``Categorical`` of ints with NA values now causes them to be processed as objects (formerly coerced to floats) (:issue:`19214`) - Bug in :func:`DataFrame.unstack` and :func:`DataFrame.pivot_table` returning a missleading error message when the resulting DataFrame has more elements than int32 can handle. Now, the error message is improved, pointing towards the actual problem (:issue:`20601`) .. _whatsnew_0240.bug_fixes.sparse: diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index bb71d6960c963..08b32a216ffb6 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -246,7 +246,9 @@ def test_categorical_repr_int_with_nan(self): assert repr(c) == c_exp s = Series([1, 2, np.nan], dtype="object").astype("category") - s_exp = """0 1\n1 2\n2 NaN\ndtype: category\nCategories (2, int64): [1, 2]""" # noqa + s_exp = """0 1\n1 2\n2 NaN +dtype: category +Categories (2, int64): [1, 2]""" assert repr(s) == s_exp def test_categorical_repr_period(self): From 40579329262a77ce999cdeccc64948ff71ee6230 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Fri, 4 Jan 2019 12:59:12 -0800 Subject: [PATCH 09/10] BUG-19214 add whatsnew section about concat --- doc/source/whatsnew/v0.24.0.rst | 34 ++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 6dd31b175e248..44a0e80665cca 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1113,6 +1113,39 @@ cast from integer dtype to floating dtype (:issue:`22019`) ...: 'c': [1, 1, np.nan, 1, 1]}) In [4]: pd.crosstab(df.a, df.b, normalize='columns') +.. _whatsnew_0240.api.concat_categorical: + +Concatenation Changes +^^^^^^^^^^^^^^^^^^^^^ + +Calling :func:`pandas.concat` on a ``Categorical`` of ints with NA values now +causes them to be processed as objects in cases where previously they would be +coerced to floats (:issue:`19214`) + +*Previous Behavior* + +.. code-block:: ipython + In [2]: s = pd.Series([0, 1, np.nan]) + + In [3]: c = pd.Series([0, 1, np.nan], dtype="object").astype("category") + + In [4]: pd.concat([s, c]) + Out[4]: + 0 0.0 + 1 1.0 + 2 NaN + 0 0.0 + 1 1.0 + 2 NaN + dtype: float64 + +*New Behavior* + +.. ipython:: python + s = pd.Series([0, 1, np.nan]) + c = pd.Series([0, 1, np.nan], dtype="object").astype("category") + pd.concat([s, c]) + Datetimelike API Changes ^^^^^^^^^^^^^^^^^^^^^^^^ @@ -1150,7 +1183,6 @@ Other API Changes - :meth:`CategoricalIndex.reindex` now raises a ``ValueError`` if the target index is non-unique and not equal to the current index. It previously only raised if the target index was not of a categorical dtype (:issue:`23963`). - :func:`Series.to_list` and :func:`Index.to_list` are now aliases of ``Series.tolist`` respectively ``Index.tolist`` (:issue:`8826`) - The result of ``SparseSeries.unstack`` is now a :class:`DataFrame` with sparse values, rather than a :class:`SparseDataFrame` (issue:`24372`). -- Calling :func:`pandas.concat` on a ``Categorical`` of ints with NA values now causes them to be processed as objects in cases where previously they would be coerced to floats (:issue:`19214`) .. _whatsnew_0240.deprecations: From 7cc02c1b113e83bc81fc1099787b2e1bb66d32f7 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Sat, 5 Jan 2019 13:17:48 -0800 Subject: [PATCH 10/10] BUG-19214 whatsnew changes --- doc/source/whatsnew/v0.24.0.rst | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 257f7f75a9faf..5e349c2f06472 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1146,18 +1146,20 @@ Concatenation Changes ^^^^^^^^^^^^^^^^^^^^^ Calling :func:`pandas.concat` on a ``Categorical`` of ints with NA values now -causes them to be processed as objects in cases where previously they would be -coerced to floats (:issue:`19214`) +causes them to be processed as objects when concatenating with anything +other than another ``Categorical`` of ints (:issue:`19214`) + +.. ipython:: python + + s = pd.Series([0, 1, np.nan]) + c = pd.Series([0, 1, np.nan], dtype="category") *Previous Behavior* .. code-block:: ipython - In [2]: s = pd.Series([0, 1, np.nan]) - In [3]: c = pd.Series([0, 1, np.nan], dtype="object").astype("category") - - In [4]: pd.concat([s, c]) - Out[4]: + In [3]: pd.concat([s, c]) + Out[3]: 0 0.0 1 1.0 2 NaN @@ -1169,8 +1171,7 @@ coerced to floats (:issue:`19214`) *New Behavior* .. ipython:: python - s = pd.Series([0, 1, np.nan]) - c = pd.Series([0, 1, np.nan], dtype="object").astype("category") + pd.concat([s, c]) Datetimelike API Changes