From f868874e86f06dfeb62bd2ad6a060a5b188b347e Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Mon, 3 Feb 2020 08:06:10 -0600
Subject: [PATCH 1/6] REGR: Fixed AssertionError in groupby

Closes https://github.com/pandas-dev/pandas/issues/31522
---
 doc/source/whatsnew/v1.0.1.rst                   |  1 +
 pandas/core/groupby/generic.py                   | 12 ++++++++--
 .../tests/groupby/aggregate/test_aggregate.py    | 24 +++++++++++++++++++
 3 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v1.0.1.rst b/doc/source/whatsnew/v1.0.1.rst
index 180411afb117d..2566574fc06dc 100644
--- a/doc/source/whatsnew/v1.0.1.rst
+++ b/doc/source/whatsnew/v1.0.1.rst
@@ -19,6 +19,7 @@ Fixed regressions
 - Fixed regression when indexing a ``Series`` or ``DataFrame`` indexed by ``DatetimeIndex`` with a slice containing a :class:`datetime.date` (:issue:`31501`)
 - Fixed regression in :class:`Series` multiplication when multiplying a numeric :class:`Series` with >10000 elements with a timedelta-like scalar (:issue:`31457`)
 - Fixed regression in :meth:`GroupBy.apply` if called with a function which returned a non-pandas non-scalar object (e.g. a list or numpy array) (:issue:`31441`)
+- Fixed regression in ``.groupby().agg()`` raising an ``AssertionError`` for some reductions like ``min`` on object-dtype columns (:issue:`31522`)
 - Fixed regression in :meth:`to_datetime` when parsing non-nanosecond resolution datetimes (:issue:`31491`)
 - Fixed regression in :meth:`~DataFrame.to_csv` where specifying an ``na_rep`` might truncate the values written (:issue:`31447`)
 - Fixed regression where setting :attr:`pd.options.display.max_colwidth` was not accepting negative integers. In addition, this behavior has been deprecated in favor of using ``None`` (:issue:`31532`)
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 27dd6e953c219..aa9a936e8a5f6 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -1061,8 +1061,16 @@ def _cython_agg_blocks(
                 else:
                     result = cast(DataFrame, result)
                     # unwrap DataFrame to get array
-                    assert len(result._data.blocks) == 1
-                    result = result._data.blocks[0].values
+                    if len(result._data.blocks) != 1:
+                        # An input (P, N)-shape block was split into P
+                        # (1, n_groups) blocks. This is problematic since it breaks
+                        # the assumption that one input block is aggregated
+                        # to one output block. We should be OK as long as
+                        # the split output can be put back into a single block below
+                        assert len(result._data.blocks) == result.shape[1]
+                        result = np.asarray(result)
+                    else:
+                        result = result._data.blocks[0].values
 
                 if isinstance(result, np.ndarray) and result.ndim == 1:
                     result = result.reshape(1, -1)
diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
index 2d31996a8a964..f16d7e7cfa447 100644
--- a/pandas/tests/groupby/aggregate/test_aggregate.py
+++ b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -377,6 +377,30 @@ def test_agg_index_has_complex_internals(index):
     tm.assert_frame_equal(result, expected)
 
 
+def test_agg_split_block():
+    # https://github.com/pandas-dev/pandas/issues/31522
+    df = pd.DataFrame(
+        {
+            "key1": ["a", "a", "b", "b", "a"],
+            "key2": ["one", "two", "one", "two", "one"],
+            "key3": ["three", "three", "three", "six", "six"],
+            "data1": [0.0, 1, 2, 3, 4],
+            "data2": [0.0, 1, 2, 3, 4],
+        }
+    )
+    result = df.groupby("key1").min()
+    expected = pd.DataFrame(
+        {
+            "key2": ["one", "six"],
+            "key3": ["one", "six"],
+            "data1": [0.0, 2.0],
+            "data2": [0.0, 2.0],
+        },
+        index=pd.Index(["a", "b"], name="key1"),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
 class TestNamedAggregationSeries:
     def test_series_named_agg(self):
         df = pd.Series([1, 2, 3, 4])
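A minimal reproducer for the regression this commit targets, mirroring the new test (a sketch against pandas 1.0.0, not part of the patch itself):

    import pandas as pd

    # On pandas 1.0.0, a reduction like ``min`` on object-dtype columns falls
    # back from cython to a python-level aggregation that returns a DataFrame.
    # That DataFrame can come back holding more than one block, which tripped
    # the ``assert len(result._data.blocks) == 1`` seen above (GH 31522).
    df = pd.DataFrame(
        {
            "key1": ["a", "a", "b", "b", "a"],
            "key2": ["one", "two", "one", "two", "one"],
            "key3": ["three", "three", "three", "six", "six"],
        }
    )
    df.groupby("key1").min()  # raised AssertionError before this patch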
From 70608cfba409e5c287850132e2eabda1947e2b53 Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Mon, 3 Feb 2020 10:42:52 -0600
Subject: [PATCH 2/6] consolidate if needed

---
 pandas/core/groupby/generic.py                   | 13 ++++---------
 pandas/tests/groupby/aggregate/test_aggregate.py |  4 ++--
 2 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index aa9a936e8a5f6..9db26501406ff 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -1062,15 +1062,10 @@ def _cython_agg_blocks(
                     result = cast(DataFrame, result)
                     # unwrap DataFrame to get array
                     if len(result._data.blocks) != 1:
-                        # An input (P, N)-shape block was split into P
-                        # (1, n_groups) blocks. This is problematic since it breaks
-                        # the assumption that one input block is aggregated
-                        # to one output block. We should be OK as long as
-                        # the split output can be put back into a single block below
-                        assert len(result._data.blocks) == result.shape[1]
-                        result = np.asarray(result)
-                    else:
-                        result = result._data.blocks[0].values
+                        result._consolidate_inplace()
+
+                    assert len(result._data.blocks) == 1
+                    result = result._data.blocks[0].values
 
                 if isinstance(result, np.ndarray) and result.ndim == 1:
                     result = result.reshape(1, -1)
diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
index f16d7e7cfa447..ae6e9f255bfca 100644
--- a/pandas/tests/groupby/aggregate/test_aggregate.py
+++ b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -391,8 +391,8 @@ def test_agg_split_block():
     result = df.groupby("key1").min()
     expected = pd.DataFrame(
         {
-            "key2": ["one", "six"],
-            "key3": ["one", "six"],
+            "key2": ["one", "one"],
+            "key3": ["six", "six"],
             "data1": [0.0, 2.0],
             "data2": [0.0, 2.0],
         },
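The consolidation approach works because the fallback's DataFrame is often merely unconsolidated: each aggregated column sits in its own block even when the dtypes are identical. A small illustration of that internal state (pandas 1.0.x internals, illustration only):

    import pandas as pd

    # Building a frame column-by-column can leave one block per column even
    # though both columns are object dtype.
    df = pd.DataFrame({"x": ["a", "b"]})
    df["y"] = ["c", "d"]            # column insertion adds a second object block
    print(len(df._data.blocks))     # 2
    df._consolidate_inplace()       # merges blocks of the same dtype
    print(len(df._data.blocks))     # 1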
From 6eeda4262bdef24482d90facb58f390354e24fdc Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Tue, 4 Feb 2020 07:35:38 -0600
Subject: [PATCH 3/6] add test

---
 pandas/tests/groupby/aggregate/test_aggregate.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
index ae6e9f255bfca..9a9e15a529ed3 100644
--- a/pandas/tests/groupby/aggregate/test_aggregate.py
+++ b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -401,6 +401,21 @@ def test_agg_split_block():
     tm.assert_frame_equal(result, expected)
 
 
+def test_agg_split_object_part_datetime():
+    # https://github.com/pandas-dev/pandas/pull/31616
+    df = pd.DataFrame(
+        {
+            "A": pd.date_range("2000", periods=4),
+            "B": ["a", "b", "c", "d"],
+            "C": ["b", "c", "d", "e"],
+            "D": pd.date_range("2000", periods=4),
+        }
+    ).astype(object)
+    result = df.groupby([0, 0, 0, 0]).min()
+    expected = pd.DataFrame({"A": [pd.Timestamp("2000")], "B": ["a"]})
+    tm.assert_frame_equal(result, expected)
+
+
 class TestNamedAggregationSeries:
     def test_series_named_agg(self):
         df = pd.Series([1, 2, 3, 4])

From 04d2c72b1f5b727990bfff18d56ae1dc2935ca0e Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Tue, 4 Feb 2020 09:40:06 -0600
Subject: [PATCH 4/6] dedent

---
 pandas/core/groupby/generic.py | 44 ++++++++++++++++++----------------
 1 file changed, 23 insertions(+), 21 deletions(-)

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 9db26501406ff..db143441fd1ff 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -1022,6 +1022,9 @@ def _cython_agg_blocks(
         agg_blocks: List[Block] = []
         new_items: List[np.ndarray] = []
         deleted_items: List[np.ndarray] = []
+        # Some object-dtype blocks might be split into List[Block[T], Block[U]]
+        # split_items: List[np.ndarray] = []
+
         no_result = object()
         for block in data.blocks:
             # Avoid inheriting result from earlier in the loop
@@ -1069,28 +1072,27 @@ def _cython_agg_blocks(
 
                 if isinstance(result, np.ndarray) and result.ndim == 1:
                     result = result.reshape(1, -1)
-            finally:
-                assert not isinstance(result, DataFrame)
-
-                if result is not no_result:
-                    # see if we can cast the block back to the original dtype
-                    result = maybe_downcast_numeric(result, block.dtype)
-
-                    if block.is_extension and isinstance(result, np.ndarray):
-                        # e.g. block.values was an IntegerArray
-                        # (1, N) case can occur if block.values was Categorical
-                        # and result is ndarray[object]
-                        assert result.ndim == 1 or result.shape[0] == 1
-                        try:
-                            # Cast back if feasible
-                            result = type(block.values)._from_sequence(
-                                result.ravel(), dtype=block.values.dtype
-                            )
-                        except ValueError:
-                            # reshape to be valid for non-Extension Block
-                            result = result.reshape(1, -1)
+            assert not isinstance(result, DataFrame)
+
+            if result is not no_result:
+                # see if we can cast the block back to the original dtype
+                result = maybe_downcast_numeric(result, block.dtype)
+
+                if block.is_extension and isinstance(result, np.ndarray):
+                    # e.g. block.values was an IntegerArray
+                    # (1, N) case can occur if block.values was Categorical
+                    # and result is ndarray[object]
+                    assert result.ndim == 1 or result.shape[0] == 1
+                    try:
+                        # Cast back if feasible
+                        result = type(block.values)._from_sequence(
+                            result.ravel(), dtype=block.values.dtype
+                        )
+                    except ValueError:
+                        # reshape to be valid for non-Extension Block
+                        result = result.reshape(1, -1)
 
-                agg_block: Block = block.make_block(result)
+            agg_block: Block = block.make_block(result)
 
             new_items.append(locs)
             agg_blocks.append(agg_block)
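The new test pins down the case consolidation cannot repair: a single object block whose per-column reductions come back with different dtypes, so the result blocks can never be merged into one. A short illustration (the result dtypes are an assumption about how ``min`` handles datetime-like objects):

    import pandas as pd

    # Both columns live in one object block, but ``min`` may return Timestamps
    # for "A" and strings for "B". A datetime64 block and an object block
    # cannot be consolidated together, so the one-block-in/one-block-out
    # assumption still breaks after the previous commit.
    df = pd.DataFrame(
        {
            "A": pd.date_range("2000", periods=4),
            "B": ["a", "b", "c", "d"],
        }
    ).astype(object)
    result = df.groupby([0, 0, 0, 0]).min()
    print(result.dtypes)  # "A" may come back as datetime64[ns], "B" as object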
From 8a5db1231f368538bb3dfe0c685e76087af6a993 Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Tue, 4 Feb 2020 10:01:21 -0600
Subject: [PATCH 5/6] fixup

---
 pandas/core/groupby/generic.py                   | 62 ++++++++++++++++------
 .../tests/groupby/aggregate/test_aggregate.py    | 17 ++++++++++---
 2 files changed, 53 insertions(+), 26 deletions(-)

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index db143441fd1ff..0493f09c233d6 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -1023,7 +1023,8 @@ def _cython_agg_blocks(
         new_items: List[np.ndarray] = []
         deleted_items: List[np.ndarray] = []
         # Some object-dtype blocks might be split into List[Block[T], Block[U]]
-        # split_items: List[np.ndarray] = []
+        split_items: List[np.ndarray] = []
+        split_frames: List[DataFrame] = []
 
         no_result = object()
         for block in data.blocks:
@@ -1065,41 +1066,56 @@ def _cython_agg_blocks(
                     result = cast(DataFrame, result)
                     # unwrap DataFrame to get array
                     if len(result._data.blocks) != 1:
-                        result._consolidate_inplace()
+                        # We've split an object block! Everything we've assumed
+                        # about a single block input returning a single block output
+                        # is a lie. To keep the code-path for the typical non-split case
+                        # clean, we choose to clean up this mess later on.
+                        split_items.append(locs)
+                        split_frames.append(result)
+                        continue
 
                     assert len(result._data.blocks) == 1
                     result = result._data.blocks[0].values
 
                 if isinstance(result, np.ndarray) and result.ndim == 1:
                     result = result.reshape(1, -1)
-            assert not isinstance(result, DataFrame)
-
-            if result is not no_result:
-                # see if we can cast the block back to the original dtype
-                result = maybe_downcast_numeric(result, block.dtype)
-
-                if block.is_extension and isinstance(result, np.ndarray):
-                    # e.g. block.values was an IntegerArray
-                    # (1, N) case can occur if block.values was Categorical
-                    # and result is ndarray[object]
-                    assert result.ndim == 1 or result.shape[0] == 1
-                    try:
-                        # Cast back if feasible
-                        result = type(block.values)._from_sequence(
-                            result.ravel(), dtype=block.values.dtype
-                        )
-                    except ValueError:
-                        # reshape to be valid for non-Extension Block
-                        result = result.reshape(1, -1)
+            finally:
+                assert not isinstance(result, DataFrame)
+
+                if result is not no_result:
+                    # see if we can cast the block back to the original dtype
+                    result = maybe_downcast_numeric(result, block.dtype)
+
+                    if block.is_extension and isinstance(result, np.ndarray):
+                        # e.g. block.values was an IntegerArray
+                        # (1, N) case can occur if block.values was Categorical
+                        # and result is ndarray[object]
+                        assert result.ndim == 1 or result.shape[0] == 1
+                        try:
+                            # Cast back if feasible
+                            result = type(block.values)._from_sequence(
+                                result.ravel(), dtype=block.values.dtype
+                            )
+                        except ValueError:
+                            # reshape to be valid for non-Extension Block
+                            result = result.reshape(1, -1)
 
-            agg_block: Block = block.make_block(result)
+                agg_block: Block = block.make_block(result)
 
             new_items.append(locs)
             agg_blocks.append(agg_block)
 
-        if not agg_blocks:
+        if not (agg_blocks or split_frames):
             raise DataError("No numeric types to aggregate")
 
+        if split_items:
+            # Clean up the mess left over from split blocks.
+            for locs, result in zip(split_items, split_frames):
+                assert len(locs) == result.shape[1]
+                for i, loc in enumerate(locs):
+                    new_items.append(np.array([loc], dtype=locs.dtype))
+                    agg_blocks.append(result.iloc[:, [i]]._data.blocks[0])
+
         # reset the locs in the blocks to correspond to our
         # current ordering
         indexer = np.concatenate(new_items)
diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
index 9a9e15a529ed3..cdb24549008b9 100644
--- a/pandas/tests/groupby/aggregate/test_aggregate.py
+++ b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -407,12 +407,23 @@ def test_agg_split_object_part_datetime():
     # https://github.com/pandas-dev/pandas/pull/31616
     df = pd.DataFrame(
         {
             "A": pd.date_range("2000", periods=4),
             "B": ["a", "b", "c", "d"],
-            "C": ["b", "c", "d", "e"],
-            "D": pd.date_range("2000", periods=4),
+            "C": [1, 2, 3, 4],
+            "D": ["b", "c", "d", "e"],
+            "E": pd.date_range("2000", periods=4),
+            "F": [1, 2, 3, 4],
         }
     ).astype(object)
     result = df.groupby([0, 0, 0, 0]).min()
-    expected = pd.DataFrame({"A": [pd.Timestamp("2000")], "B": ["a"]})
+    expected = pd.DataFrame(
+        {
+            "A": [pd.Timestamp("2000")],
+            "B": ["a"],
+            "C": [1],
+            "D": ["b"],
+            "E": [pd.Timestamp("2000")],
+            "F": [1],
+        }
+    )
     tm.assert_frame_equal(result, expected)
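The clean-up loop introduced here re-blocks each split frame column by column, tagging every single-column block with the manager location of its source column. A standalone sketch of that mapping (pandas 1.0.x internals; the ``locs`` values are hypothetical):

    import numpy as np
    import pandas as pd

    # A "split" result frame: one datetime-like column, one object column.
    split_result = pd.DataFrame({"A": [pd.Timestamp("2000")], "B": ["a"]})
    locs = np.array([2, 3])  # hypothetical manager positions of "A" and "B"

    new_items, agg_blocks = [], []
    for i, loc in enumerate(locs):
        # one single-column block per original column, keyed by its location
        new_items.append(np.array([loc], dtype=locs.dtype))
        agg_blocks.append(split_result.iloc[:, [i]]._data.blocks[0])

    print([blk.dtype for blk in agg_blocks])  # e.g. [datetime64[ns], object]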
From 6eb1cfd3396b8eff1afe2fe4c091c3e8b077c914 Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Tue, 4 Feb 2020 10:10:38 -0600
Subject: [PATCH 6/6] all split

---
 pandas/core/groupby/generic.py                   | 41 +++++++++----------
 pandas/tests/groupby/aggregate/test_aggregate.py |  9 +--------
 2 files changed, 21 insertions(+), 29 deletions(-)

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 0493f09c233d6..f194c774cf329 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -1079,28 +1079,27 @@ def _cython_agg_blocks(
 
                 if isinstance(result, np.ndarray) and result.ndim == 1:
                     result = result.reshape(1, -1)
-            finally:
-                assert not isinstance(result, DataFrame)
-
-                if result is not no_result:
-                    # see if we can cast the block back to the original dtype
-                    result = maybe_downcast_numeric(result, block.dtype)
-
-                    if block.is_extension and isinstance(result, np.ndarray):
-                        # e.g. block.values was an IntegerArray
-                        # (1, N) case can occur if block.values was Categorical
-                        # and result is ndarray[object]
-                        assert result.ndim == 1 or result.shape[0] == 1
-                        try:
-                            # Cast back if feasible
-                            result = type(block.values)._from_sequence(
-                                result.ravel(), dtype=block.values.dtype
-                            )
-                        except ValueError:
-                            # reshape to be valid for non-Extension Block
-                            result = result.reshape(1, -1)
+            assert not isinstance(result, DataFrame)
+
+            if result is not no_result:
+                # see if we can cast the block back to the original dtype
+                result = maybe_downcast_numeric(result, block.dtype)
+
+                if block.is_extension and isinstance(result, np.ndarray):
+                    # e.g. block.values was an IntegerArray
+                    # (1, N) case can occur if block.values was Categorical
+                    # and result is ndarray[object]
+                    assert result.ndim == 1 or result.shape[0] == 1
+                    try:
+                        # Cast back if feasible
+                        result = type(block.values)._from_sequence(
+                            result.ravel(), dtype=block.values.dtype
+                        )
+                    except ValueError:
+                        # reshape to be valid for non-Extension Block
+                        result = result.reshape(1, -1)
 
-                agg_block: Block = block.make_block(result)
+            agg_block: Block = block.make_block(result)
 
             new_items.append(locs)
             agg_blocks.append(agg_block)
diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
index cdb24549008b9..47c7f2dc0b51a 100644
--- a/pandas/tests/groupby/aggregate/test_aggregate.py
+++ b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -384,18 +384,11 @@ def test_agg_split_block():
             "key1": ["a", "a", "b", "b", "a"],
             "key2": ["one", "two", "one", "two", "one"],
             "key3": ["three", "three", "three", "six", "six"],
-            "data1": [0.0, 1, 2, 3, 4],
-            "data2": [0.0, 1, 2, 3, 4],
         }
     )
     result = df.groupby("key1").min()
     expected = pd.DataFrame(
-        {
-            "key2": ["one", "one"],
-            "key3": ["six", "six"],
-            "data1": [0.0, 2.0],
-            "data2": [0.0, 2.0],
-        },
+        {"key2": ["one", "one"], "key3": ["six", "six"]},
         index=pd.Index(["a", "b"], name="key1"),
     )
     tm.assert_frame_equal(result, expected)
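With the full series applied (released in pandas 1.0.1), the reproducer from the first commit round-trips cleanly instead of raising:

    import pandas as pd

    df = pd.DataFrame(
        {
            "key1": ["a", "a", "b", "b", "a"],
            "key2": ["one", "two", "one", "two", "one"],
            "key3": ["three", "three", "three", "six", "six"],
        }
    )
    print(df.groupby("key1").min())
    #      key2 key3
    # key1
    # a     one  six
    # b     one  six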