From ffd4ac44f927711cbb460dd9530dcb0eac4bc823 Mon Sep 17 00:00:00 2001
From: Brock
Date: Thu, 23 Sep 2021 18:19:40 -0700
Subject: [PATCH 1/5] REF: de-duplicate warning message

---
 pandas/core/groupby/generic.py | 28 ++++------------------
 pandas/core/groupby/groupby.py | 43 ++++++++++++++--------------------
 2 files changed, 21 insertions(+), 50 deletions(-)

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index f0408db7f4ef8..77db3123dee8c 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -78,6 +78,7 @@
     _apply_docs,
     _transform_template,
     group_selection_context,
+    warn_dropping_nuisance_columns_deprecated,
 )
 from pandas.core.indexes.api import (
     Index,
@@ -999,14 +1000,7 @@ def array_func(values: ArrayLike) -> ArrayLike:
         new_mgr = data.grouped_reduce(array_func, ignore_failures=True)

         if len(new_mgr) < len(data):
-            warnings.warn(
-                f"Dropping invalid columns in {type(self).__name__}.{how} "
-                "is deprecated. In a future version, a TypeError will be raised. "
-                f"Before calling .{how}, select only columns which should be "
-                "valid for the function.",
-                FutureWarning,
-                stacklevel=4,
-            )
+            warn_dropping_nuisance_columns_deprecated(type(self), how)

         return self._wrap_agged_manager(new_mgr)

@@ -1195,14 +1189,7 @@ def arr_func(bvalues: ArrayLike) -> ArrayLike:
         res_mgr.set_axis(1, mgr.axes[1])

         if len(res_mgr) < len(mgr):
-            warnings.warn(
-                f"Dropping invalid columns in {type(self).__name__}.{how} "
-                "is deprecated. In a future version, a TypeError will be raised. "
-                f"Before calling .{how}, select only columns which should be "
-                "valid for the transforming function.",
-                FutureWarning,
-                stacklevel=4,
-            )
+            warn_dropping_nuisance_columns_deprecated(type(self), how)

         res_df = self.obj._constructor(res_mgr)
         if self.axis == 1:
@@ -1314,14 +1301,7 @@ def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame:
                 output[i] = sgb.transform(wrapper)
             except TypeError:
                 # e.g. trying to call nanmean with string values
-                warnings.warn(
-                    f"Dropping invalid columns in {type(self).__name__}.transform "
-                    "is deprecated. In a future version, a TypeError will be raised. "
-                    "Before calling .transform, select only columns which should be "
-                    "valid for the transforming function.",
-                    FutureWarning,
-                    stacklevel=5,
-                )
+                warn_dropping_nuisance_columns_deprecated(type(self), "transform")
             else:
                 inds.append(i)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index b41935902b9cf..cb21c50692ac3 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1422,14 +1422,7 @@ def _python_agg_general(self, func, *args, **kwargs):
                 # if this function is invalid for this dtype, we will ignore it.
                 result = self.grouper.agg_series(obj, f)
             except TypeError:
-                warnings.warn(
-                    f"Dropping invalid columns in {type(self).__name__}.agg "
-                    "is deprecated. In a future version, a TypeError will be raised. "
-                    "Before calling .agg, select only columns which should be "
-                    "valid for the aggregating function.",
-                    FutureWarning,
-                    stacklevel=3,
-                )
+                warn_dropping_nuisance_columns_deprecated(type(self), "agg")
                 continue

             key = base.OutputKey(label=name, position=idx)
@@ -2709,15 +2702,8 @@ def blk_func(values: ArrayLike) -> ArrayLike:
         res_mgr = mgr.grouped_reduce(blk_func, ignore_failures=True)

         if not is_ser and len(res_mgr.items) != len(mgr.items):
-            warnings.warn(
-                "Dropping invalid columns in "
-                f"{type(self).__name__}.quantile is deprecated. "
-                "In a future version, a TypeError will be raised. "
-                "Before calling .quantile, select only columns which "
-                "should be valid for the function.",
-                FutureWarning,
-                stacklevel=find_stack_level(),
-            )
+            warn_dropping_nuisance_columns_deprecated(type(self), "quantile")
+
             if len(res_mgr.items) == 0:
                 # re-call grouped_reduce to get the desired exception message
                 mgr.grouped_reduce(blk_func, ignore_failures=False)
@@ -3150,15 +3136,8 @@ def blk_func(values: ArrayLike) -> ArrayLike:

         if not is_ser and len(res_mgr.items) != len(mgr.items):
             howstr = how.replace("group_", "")
-            warnings.warn(
-                "Dropping invalid columns in "
-                f"{type(self).__name__}.{howstr} is deprecated. "
-                "In a future version, a TypeError will be raised. "
-                f"Before calling .{howstr}, select only columns which "
-                "should be valid for the function.",
-                FutureWarning,
-                stacklevel=3,
-            )
+            warn_dropping_nuisance_columns_deprecated(type(self), howstr)
+
             if len(res_mgr.items) == 0:
                 # We re-call grouped_reduce to get the right exception message
                 try:
@@ -3627,3 +3606,15 @@ def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiInde
     else:
         mi = MultiIndex.from_product([idx, qs])
     return mi
+
+
+def warn_dropping_nuisance_columns_deprecated(cls, how: str) -> None:
+    warnings.warn(
+        "Dropping invalid columns in "
+        f"{cls.__name__}.{how} is deprecated. "
+        "In a future version, a TypeError will be raised. "
+        f"Before calling .{how}, select only columns which "
+        "should be valid for the function.",
+        FutureWarning,
+        stacklevel=find_stack_level(),
+    )
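
Note (illustration, not part of the patch): patch 1 above replaces five slightly divergent copies of the same deprecation message with one module-level helper. A minimal standalone sketch of the resulting call pattern; the class below is a stand-in, and the real helper computes the warning's stacklevel with find_stack_level() instead of hard-coding it.

import warnings


def warn_dropping_nuisance_columns_deprecated(cls, how):
    # Same message the helper in the patch builds from cls and the method name.
    warnings.warn(
        f"Dropping invalid columns in {cls.__name__}.{how} is deprecated. "
        "In a future version, a TypeError will be raised. "
        f"Before calling .{how}, select only columns which "
        "should be valid for the function.",
        FutureWarning,
        stacklevel=2,
    )


class DataFrameGroupBy:  # stand-in, only to show the per-call-site arguments
    def agg(self):
        warn_dropping_nuisance_columns_deprecated(type(self), "agg")

    def quantile(self):
        warn_dropping_nuisance_columns_deprecated(type(self), "quantile")


warnings.simplefilter("always")
DataFrameGroupBy().agg()       # FutureWarning mentions DataFrameGroupBy.agg
DataFrameGroupBy().quantile()  # FutureWarning mentions DataFrameGroupBy.quantile
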
" - "Before calling .quantile, select only columns which " - "should be valid for the function.", - FutureWarning, - stacklevel=find_stack_level(), - ) + warn_dropping_nuisance_columns_deprecated(type(self), "quantile") + if len(res_mgr.items) == 0: # re-call grouped_reduce to get the desired exception message mgr.grouped_reduce(blk_func, ignore_failures=False) @@ -3150,15 +3136,8 @@ def blk_func(values: ArrayLike) -> ArrayLike: if not is_ser and len(res_mgr.items) != len(mgr.items): howstr = how.replace("group_", "") - warnings.warn( - "Dropping invalid columns in " - f"{type(self).__name__}.{howstr} is deprecated. " - "In a future version, a TypeError will be raised. " - f"Before calling .{howstr}, select only columns which " - "should be valid for the function.", - FutureWarning, - stacklevel=3, - ) + warn_dropping_nuisance_columns_deprecated(type(self), howstr) + if len(res_mgr.items) == 0: # We re-call grouped_reduce to get the right exception message try: @@ -3627,3 +3606,15 @@ def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiInde else: mi = MultiIndex.from_product([idx, qs]) return mi + + +def warn_dropping_nuisance_columns_deprecated(cls, how: str) -> None: + warnings.warn( + "Dropping invalid columns in " + f"{cls.__name__}.{how} is deprecated. " + "In a future version, a TypeError will be raised. " + f"Before calling .{how}, select only columns which " + "should be valid for the function.", + FutureWarning, + stacklevel=find_stack_level(), + ) From 3b4564888ad91b7ef03a7618b976dff604e6e181 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 24 Sep 2021 10:26:50 -0700 Subject: [PATCH 2/5] standardize re-calling --- pandas/core/groupby/groupby.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index cb21c50692ac3..5218419988110 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2707,6 +2707,10 @@ def blk_func(values: ArrayLike) -> ArrayLike: if len(res_mgr.items) == 0: # re-call grouped_reduce to get the desired exception message mgr.grouped_reduce(blk_func, ignore_failures=False) + # grouped_reduce _should_ raise, so this should not be reached + raise TypeError( # pragma: no cover + "All columns were dropped in grouped_reduce" + ) if is_ser: res = self._wrap_agged_manager(res_mgr) @@ -3140,13 +3144,11 @@ def blk_func(values: ArrayLike) -> ArrayLike: if len(res_mgr.items) == 0: # We re-call grouped_reduce to get the right exception message - try: - mgr.grouped_reduce(blk_func, ignore_failures=False) - except Exception as err: - error_msg = str(err) - raise TypeError(error_msg) - # We should never get here - raise TypeError("All columns were dropped in grouped_reduce") + mgr.grouped_reduce(blk_func, ignore_failures=False) + # grouped_reduce _should_ raise, so this should not be reached + raise TypeError( # pragma: no cover + "All columns were dropped in grouped_reduce" + ) if is_ser: out = self._wrap_agged_manager(res_mgr) From 3b3a13bc490bb86ea35b25a2a2ca13303aa9dadc Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 24 Sep 2021 14:47:41 -0700 Subject: [PATCH 3/5] tighten wrapping --- pandas/core/groupby/groupby.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 5218419988110..2ddb0afca3283 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1762,8 +1762,9 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike: # 
From 3b3a13bc490bb86ea35b25a2a2ca13303aa9dadc Mon Sep 17 00:00:00 2001
From: Brock
Date: Fri, 24 Sep 2021 14:47:41 -0700
Subject: [PATCH 3/5] tighten wrapping

---
 pandas/core/groupby/groupby.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 5218419988110..2ddb0afca3283 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1762,8 +1762,9 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike:
         #  _wrap_agged_manager() returns. GH 35028
         with com.temp_setattr(self, "observed", True):
             result = self._wrap_agged_manager(new_mgr)
-            if result.ndim == 1:
-                result.index = self.grouper.result_index
+
+        if result.ndim == 1:
+            result.index = self.grouper.result_index

         return self._reindex_output(result, fill_value=0)

@@ -3152,9 +3153,8 @@ def blk_func(values: ArrayLike) -> ArrayLike:

         if is_ser:
             out = self._wrap_agged_manager(res_mgr)
-            out.index = self.grouper.result_index
         else:
-            out = type(obj)(res_mgr)
+            out = obj._constructor(res_mgr)

         return self._wrap_aggregated_output(out)

From 17c1bb66ed6a60e378facaf25ec2e73afb091546 Mon Sep 17 00:00:00 2001
From: Brock
Date: Sun, 26 Sep 2021 16:20:18 -0700
Subject: [PATCH 4/5] REF: use grouped_reduce in _cython_agg_general

---
 pandas/core/groupby/generic.py | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 77db3123dee8c..1dbf5781637cb 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -340,8 +340,7 @@ def _cython_agg_general(
     ):

         obj = self._selected_obj
-        objvals = obj._values
-        data = obj._mgr
+        data = self._get_data_to_aggregate()

         if numeric_only and not is_numeric_dtype(obj.dtype):
             # GH#41291 match Series behavior
@@ -365,12 +364,10 @@ def array_func(values: ArrayLike) -> ArrayLike:

             return result

-        result = array_func(objvals)
-
-        ser = self.obj._constructor(
-            result, index=self.grouper.result_index, name=obj.name
-        )
-        return self._reindex_output(ser)
+        new_mgr = data.grouped_reduce(array_func, ignore_failures=True)
+        res = self._wrap_agged_manager(new_mgr)
+        res.index = self.grouper.result_index
+        return self._reindex_output(res)

     def _indexed_output_to_ndframe(
         self, output: Mapping[base.OutputKey, ArrayLike]
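
Note (illustration, not part of the patch): patch 3 above wraps the reduced manager with obj._constructor(res_mgr) instead of type(obj)(res_mgr), and patch 4 routes SeriesGroupBy._cython_agg_general through grouped_reduce as well. _constructor is the hook pandas uses when an operation needs to build "an object of the same kind", and it is what subclasses override; a small sketch with a made-up subclass, where the groupby line shows the behavior this wrapping is expected to preserve on versions that build results through _constructor.

import pandas as pd


class SnapshotFrame(pd.DataFrame):
    # A made-up subclass that opts out of propagating itself: anything pandas
    # builds from it via _constructor should be a plain DataFrame.
    @property
    def _constructor(self):
        return pd.DataFrame


df = SnapshotFrame({"key": ["a", "a", "b"], "x": [1, 2, 3]})
print(type(df))                       # SnapshotFrame
print(type(df[["x"]]))                # DataFrame, built through _constructor
print(type(df.groupby("key").sum()))  # expected: DataFrame, since results are
                                      # wrapped via obj._constructor as in patch 3
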
From a69a7be8d5d43982005f4023c50167e1603d2ca3 Mon Sep 17 00:00:00 2001
From: Brock
Date: Sun, 26 Sep 2021 19:53:59 -0700
Subject: [PATCH 5/5] REF: share _cython_agg_general

---
 pandas/core/groupby/generic.py | 68 ----------------------------------
 pandas/core/groupby/groupby.py | 44 +++++++++++++++++++++-
 2 files changed, 43 insertions(+), 69 deletions(-)

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 1dbf5781637cb..b1f7f56aaad8c 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -48,7 +48,6 @@
     is_dict_like,
     is_integer_dtype,
     is_interval_dtype,
-    is_numeric_dtype,
     is_scalar,
 )
 from pandas.core.dtypes.missing import (
@@ -335,40 +334,6 @@ def _aggregate_multiple_funcs(self, arg) -> DataFrame:
         output = self._reindex_output(output)
         return output

-    def _cython_agg_general(
-        self, how: str, alt: Callable, numeric_only: bool, min_count: int = -1
-    ):
-
-        obj = self._selected_obj
-        data = self._get_data_to_aggregate()
-
-        if numeric_only and not is_numeric_dtype(obj.dtype):
-            # GH#41291 match Series behavior
-            raise NotImplementedError(
-                f"{type(self).__name__}.{how} does not implement numeric_only."
-            )
-
-        # This is overkill because it is only called once, but is here to
-        # mirror the array_func used in DataFrameGroupBy._cython_agg_general
-        def array_func(values: ArrayLike) -> ArrayLike:
-            try:
-                result = self.grouper._cython_operation(
-                    "aggregate", values, how, axis=data.ndim - 1, min_count=min_count
-                )
-            except NotImplementedError:
-                # generally if we have numeric_only=False
-                # and non-applicable functions
-                # try to python agg
-                # TODO: shouldn't min_count matter?
-                result = self._agg_py_fallback(values, ndim=data.ndim, alt=alt)
-
-            return result
-
-        new_mgr = data.grouped_reduce(array_func, ignore_failures=True)
-        res = self._wrap_agged_manager(new_mgr)
-        res.index = self.grouper.result_index
-        return self._reindex_output(res)
-
     def _indexed_output_to_ndframe(
         self, output: Mapping[base.OutputKey, ArrayLike]
     ) -> Series:
@@ -968,39 +933,6 @@ def _iterate_slices(self) -> Iterable[Series]:

             yield values

-    def _cython_agg_general(
-        self, how: str, alt: Callable, numeric_only: bool, min_count: int = -1
-    ) -> DataFrame:
-        # Note: we never get here with how="ohlc"; that goes through SeriesGroupBy
-
-        data: Manager2D = self._get_data_to_aggregate()
-
-        if numeric_only:
-            data = data.get_numeric_data(copy=False)
-
-        def array_func(values: ArrayLike) -> ArrayLike:
-            try:
-                result = self.grouper._cython_operation(
-                    "aggregate", values, how, axis=data.ndim - 1, min_count=min_count
-                )
-            except NotImplementedError:
-                # generally if we have numeric_only=False
-                # and non-applicable functions
-                # try to python agg
-                # TODO: shouldn't min_count matter?
-                result = self._agg_py_fallback(values, ndim=data.ndim, alt=alt)
-
-            return result
-
-        # TypeError -> we may have an exception in trying to aggregate
-        # continue and exclude the block
-        new_mgr = data.grouped_reduce(array_func, ignore_failures=True)
-
-        if len(new_mgr) < len(data):
-            warn_dropping_nuisance_columns_deprecated(type(self), how)
-
-        return self._wrap_agged_manager(new_mgr)
-
     def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame:
         if self.grouper.nkeys != 1:
             raise AssertionError("Number of keys must be 1")
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 2ddb0afca3283..3720da80ad34d 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1493,10 +1493,52 @@ def _agg_py_fallback(
         # test_groupby_duplicate_columns with object dtype values
         return ensure_block_shape(res_values, ndim=ndim)

+    @final
     def _cython_agg_general(
         self, how: str, alt: Callable, numeric_only: bool, min_count: int = -1
     ):
-        raise AbstractMethodError(self)
+        # Note: we never get here with how="ohlc" for DataFrameGroupBy;
+        # that goes through SeriesGroupBy
+
+        data = self._get_data_to_aggregate()
+        is_ser = data.ndim == 1
+
+        if numeric_only:
+            if is_ser and not is_numeric_dtype(self._selected_obj.dtype):
+                # GH#41291 match Series behavior
+                raise NotImplementedError(
+                    f"{type(self).__name__}.{how} does not implement numeric_only."
+                )
+            elif not is_ser:
+                data = data.get_numeric_data(copy=False)
+
+        def array_func(values: ArrayLike) -> ArrayLike:
+            try:
+                result = self.grouper._cython_operation(
+                    "aggregate", values, how, axis=data.ndim - 1, min_count=min_count
+                )
+            except NotImplementedError:
+                # generally if we have numeric_only=False
+                # and non-applicable functions
+                # try to python agg
+                # TODO: shouldn't min_count matter?
+                result = self._agg_py_fallback(values, ndim=data.ndim, alt=alt)
+
+            return result
+
+        # TypeError -> we may have an exception in trying to aggregate
+        # continue and exclude the block
+        new_mgr = data.grouped_reduce(array_func, ignore_failures=True)
+
+        if not is_ser and len(new_mgr) < len(data):
+            warn_dropping_nuisance_columns_deprecated(type(self), how)
+
+        res = self._wrap_agged_manager(new_mgr)
+        if is_ser:
+            res.index = self.grouper.result_index
+            return self._reindex_output(res)
+        else:
+            return res

     def _cython_transform(
         self, how: str, numeric_only: bool = True, axis: int = 0, **kwargs