diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
index ab3987c8c74a4..916bcf3db9a4a 100644
--- a/doc/source/whatsnew/v1.4.0.rst
+++ b/doc/source/whatsnew/v1.4.0.rst
@@ -219,7 +219,7 @@ Other enhancements
- :meth:`DataFrame.dropna` now accepts a single label as ``subset`` along with array-like (:issue:`41021`)
- :class:`ExcelWriter` argument ``if_sheet_exists="overlay"`` option added (:issue:`40231`)
- :meth:`read_excel` now accepts a ``decimal`` argument that allows the user to specify the decimal point when parsing string columns to numeric (:issue:`14403`)
-- :meth:`.GroupBy.mean` now supports `Numba <https://numba.pydata.org/>`_ execution with the ``engine`` keyword (:issue:`43731`)
+- :meth:`.GroupBy.mean`, :meth:`.GroupBy.std`, and :meth:`.GroupBy.var` now support `Numba <https://numba.pydata.org/>`_ execution with the ``engine`` keyword (:issue:`43731`, :issue:`44862`)
- :meth:`Timestamp.isoformat` now handles the ``timespec`` argument from the base :class:`datetime` class (:issue:`26131`)
- :meth:`NaT.to_numpy` ``dtype`` argument is now respected, so ``np.timedelta64`` can be returned (:issue:`44460`)
- New option ``display.max_dir_items`` customizes the number of columns added to :meth:`DataFrame.__dir__` and suggested for tab completion (:issue:`37996`)
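
The entry above adds the ``engine`` keyword to two more reductions. A minimal usage sketch (the frame and column names are made up for illustration; the ``"numba"`` engine requires numba to be installed):

    import pandas as pd

    df = pd.DataFrame({"key": [1, 1, 2, 2], "value": [1.0, 2.0, 3.0, 4.0]})
    gb = df.groupby("key")

    # Numba-accelerated reductions; results should match the default cython engine.
    gb.mean(engine="numba")
    gb.std(engine="numba")
    gb.var(ddof=0, engine="numba", engine_kwargs={"nopython": True})
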
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 2876ec1cb5a0d..e11d420ada29f 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1272,6 +1272,7 @@ def _numba_agg_general(
func: Callable,
engine_kwargs: dict[str, bool] | None,
numba_cache_key_str: str,
+ *aggregator_args,
):
"""
Perform groupby with a standard numerical aggregation function (e.g. mean)
@@ -1291,7 +1292,7 @@ def _numba_agg_general(
aggregator = executor.generate_shared_aggregator(
func, engine_kwargs, numba_cache_key_str
)
- result = aggregator(sorted_data, starts, ends, 0)
+ result = aggregator(sorted_data, starts, ends, 0, *aggregator_args)
cache_key = (func, numba_cache_key_str)
if cache_key not in NUMBA_FUNC_CACHE:
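
The new ``*aggregator_args`` parameter simply forwards per-reduction options (here ``ddof`` for ``std``/``var``; ``mean`` passes none) through to the jitted kernel. A standalone sketch of the forwarding pattern, using hypothetical names that are not pandas internals:

    # Illustration only: any extra positional args follow the fixed
    # (values, starts, ends, min_periods) arguments of the kernel.
    def run_grouped_kernel(kernel, values, starts, ends, *extra_args):
        return kernel(values, starts, ends, 0, *extra_args)
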
@@ -1989,7 +1990,12 @@ def median(self, numeric_only: bool | lib.NoDefault = lib.no_default):
@final
@Substitution(name="groupby")
@Appender(_common_see_also)
- def std(self, ddof: int = 1):
+ def std(
+ self,
+ ddof: int = 1,
+ engine: str | None = None,
+ engine_kwargs: dict[str, bool] | None = None,
+ ):
"""
Compute standard deviation of groups, excluding missing values.
@@ -2000,23 +2006,52 @@ def std(self, ddof: int = 1):
ddof : int, default 1
Degrees of freedom.
+ engine : str, default None
+ * ``'cython'`` : Runs the operation through C-extensions from cython.
+ * ``'numba'`` : Runs the operation through JIT compiled code from numba.
+ * ``None`` : Defaults to ``'cython'`` or the global setting
+ ``compute.use_numba``
+
+ .. versionadded:: 1.4.0
+
+ engine_kwargs : dict, default None
+ * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
+ * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
+ and ``parallel`` dictionary keys. The values must either be ``True`` or
+ ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
+ ``{{'nopython': True, 'nogil': False, 'parallel': False}}``
+
+ .. versionadded:: 1.4.0
+
Returns
-------
Series or DataFrame
Standard deviation of values within each group.
"""
- return self._get_cythonized_result(
- libgroupby.group_var,
- needs_counts=True,
- cython_dtype=np.dtype(np.float64),
- post_processing=lambda vals, inference: np.sqrt(vals),
- ddof=ddof,
- )
+ if maybe_use_numba(engine):
+ from pandas.core._numba.kernels import sliding_var
+
+ return np.sqrt(
+ self._numba_agg_general(sliding_var, engine_kwargs, "groupby_std", ddof)
+ )
+ else:
+ return self._get_cythonized_result(
+ libgroupby.group_var,
+ needs_counts=True,
+ cython_dtype=np.dtype(np.float64),
+ post_processing=lambda vals, inference: np.sqrt(vals),
+ ddof=ddof,
+ )
@final
@Substitution(name="groupby")
@Appender(_common_see_also)
- def var(self, ddof: int = 1):
+ def var(
+ self,
+ ddof: int = 1,
+ engine: str | None = None,
+ engine_kwargs: dict[str, bool] | None = None,
+ ):
"""
Compute variance of groups, excluding missing values.
@@ -2027,20 +2062,46 @@ def var(self, ddof: int = 1):
ddof : int, default 1
Degrees of freedom.
+ engine : str, default None
+ * ``'cython'`` : Runs the operation through C-extensions from cython.
+ * ``'numba'`` : Runs the operation through JIT compiled code from numba.
+ * ``None`` : Defaults to ``'cython'`` or the global setting
+ ``compute.use_numba``
+
+ .. versionadded:: 1.4.0
+
+ engine_kwargs : dict, default None
+ * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
+ * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
+ and ``parallel`` dictionary keys. The values must either be ``True`` or
+ ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
+ ``{{'nopython': True, 'nogil': False, 'parallel': False}}``
+
+ .. versionadded:: 1.4.0
+
Returns
-------
Series or DataFrame
Variance of values within each group.
"""
- if ddof == 1:
- numeric_only = self._resolve_numeric_only(lib.no_default)
- return self._cython_agg_general(
- "var", alt=lambda x: Series(x).var(ddof=ddof), numeric_only=numeric_only
+ if maybe_use_numba(engine):
+ from pandas.core._numba.kernels import sliding_var
+
+ return self._numba_agg_general(
+ sliding_var, engine_kwargs, "groupby_var", ddof
)
else:
- func = lambda x: x.var(ddof=ddof)
- with self._group_selection_context():
- return self._python_agg_general(func)
+ if ddof == 1:
+ numeric_only = self._resolve_numeric_only(lib.no_default)
+ return self._cython_agg_general(
+ "var",
+ alt=lambda x: Series(x).var(ddof=ddof),
+ numeric_only=numeric_only,
+ )
+ else:
+ func = lambda x: x.var(ddof=ddof)
+ with self._group_selection_context():
+ return self._python_agg_general(func)
@final
@Substitution(name="groupby")
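
In the numba branches above, ``std`` and ``var`` both dispatch to pandas' internal ``sliding_var`` kernel, with ``std`` taking the square root of the result. As a rough, self-contained sketch of what a grouped variance with a ``ddof`` argument computes per group (this is not the actual ``sliding_var`` implementation):

    import numpy as np
    import numba

    @numba.njit
    def group_var(values, starts, ends, ddof):
        # One variance per group; each group is values[starts[i]:ends[i]].
        out = np.empty(len(starts), dtype=np.float64)
        for i in range(len(starts)):
            group = values[starts[i] : ends[i]]
            mean = group.mean()
            ss = 0.0
            for x in group:
                ss += (x - mean) ** 2
            denom = len(group) - ddof
            out[i] = ss / denom if denom > 0 else np.nan
        return out

    # std would then be np.sqrt(group_var(...)), mirroring the dispatch above.
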
diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py
index b61f6700f8d1d..596e9e0a4de77 100644
--- a/pandas/tests/groupby/conftest.py
+++ b/pandas/tests/groupby/conftest.py
@@ -174,3 +174,17 @@ def nogil(request):
def nopython(request):
"""nopython keyword argument for numba.jit"""
return request.param
+
+
+@pytest.fixture(
+ params=[
+ ("mean", {}),
+ ("var", {"ddof": 1}),
+ ("var", {"ddof": 0}),
+ ("std", {"ddof": 1}),
+ ("std", {"ddof": 0}),
+ ]
+)
+def numba_supported_reductions(request):
+ """reductions supported with engine='numba'"""
+ return request.param
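
Each fixture parameter is a ``(method_name, kwargs)`` pair that the tests below unpack and dispatch dynamically with ``getattr``. A sketch of that consumption pattern outside pytest (sample data made up; the numba call requires numba to be installed):

    import pandas as pd

    df = pd.DataFrame({"a": [3, 2, 3, 2], "b": range(4)})
    gb = df.groupby("a")
    func, kwargs = ("std", {"ddof": 0})  # one of the fixture params above
    result = getattr(gb, func)(engine="numba", **kwargs)
    expected = getattr(gb, func)(**kwargs)  # default cython engine; should match
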
diff --git a/pandas/tests/groupby/test_numba.py b/pandas/tests/groupby/test_numba.py
index 20fd02b21a744..6fc1f0d808bb2 100644
--- a/pandas/tests/groupby/test_numba.py
+++ b/pandas/tests/groupby/test_numba.py
@@ -13,39 +13,55 @@
@pytest.mark.filterwarnings("ignore:\n")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
class TestEngine:
- def test_cython_vs_numba_frame(self, sort, nogil, parallel, nopython):
+ def test_cython_vs_numba_frame(
+ self, sort, nogil, parallel, nopython, numba_supported_reductions
+ ):
+ func, kwargs = numba_supported_reductions
df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
- result = df.groupby("a", sort=sort).mean(
- engine="numba", engine_kwargs=engine_kwargs
+ gb = df.groupby("a", sort=sort)
+ result = getattr(gb, func)(
+ engine="numba", engine_kwargs=engine_kwargs, **kwargs
)
- expected = df.groupby("a", sort=sort).mean()
+ expected = getattr(gb, func)(**kwargs)
tm.assert_frame_equal(result, expected)
- def test_cython_vs_numba_getitem(self, sort, nogil, parallel, nopython):
+ def test_cython_vs_numba_getitem(
+ self, sort, nogil, parallel, nopython, numba_supported_reductions
+ ):
+ func, kwargs = numba_supported_reductions
df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
- result = df.groupby("a", sort=sort)["c"].mean(
- engine="numba", engine_kwargs=engine_kwargs
+ gb = df.groupby("a", sort=sort)["c"]
+ result = getattr(gb, func)(
+ engine="numba", engine_kwargs=engine_kwargs, **kwargs
)
- expected = df.groupby("a", sort=sort)["c"].mean()
+ expected = getattr(gb, func)(**kwargs)
tm.assert_series_equal(result, expected)
- def test_cython_vs_numba_series(self, sort, nogil, parallel, nopython):
+ def test_cython_vs_numba_series(
+ self, sort, nogil, parallel, nopython, numba_supported_reductions
+ ):
+ func, kwargs = numba_supported_reductions
ser = Series(range(3), index=[1, 2, 1], name="foo")
engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
- result = ser.groupby(level=0, sort=sort).mean(
- engine="numba", engine_kwargs=engine_kwargs
+ gb = ser.groupby(level=0, sort=sort)
+ result = getattr(gb, func)(
+ engine="numba", engine_kwargs=engine_kwargs, **kwargs
)
- expected = ser.groupby(level=0, sort=sort).mean()
+ expected = getattr(gb, func)(**kwargs)
tm.assert_series_equal(result, expected)
- def test_as_index_false_unsupported(self):
+ def test_as_index_false_unsupported(self, numba_supported_reductions):
+ func, kwargs = numba_supported_reductions
df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
+ gb = df.groupby("a", as_index=False)
with pytest.raises(NotImplementedError, match="as_index=False"):
- df.groupby("a", as_index=False).mean(engine="numba")
+ getattr(gb, func)(engine="numba", **kwargs)
- def test_axis_1_unsupported(self):
+ def test_axis_1_unsupported(self, numba_supported_reductions):
+ func, kwargs = numba_supported_reductions
df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
+ gb = df.groupby("a", axis=1)
with pytest.raises(NotImplementedError, match="axis=1"):
- df.groupby("a", axis=1).mean(engine="numba")
+ getattr(gb, func)(engine="numba", **kwargs)
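
As the last two tests assert, these groupby configurations are still rejected when ``engine="numba"`` is requested; a minimal reproduction (each groupby call is expected to raise; numba must be installed):

    import pandas as pd

    df = pd.DataFrame({"a": [3, 2, 3, 2], "b": range(4)})
    df.groupby("a", as_index=False).std(engine="numba")  # NotImplementedError: as_index=False
    df.groupby("a", axis=1).var(engine="numba")          # NotImplementedError: axis=1
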