diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 3ac9d98874f86..083c34ce4b63f 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -35,1525 +35,1564 @@ def int_frame_const_col(): return df -class TestDataFrameApply: - def test_apply(self, float_frame): - with np.errstate(all="ignore"): - # ufunc - applied = float_frame.apply(np.sqrt) - tm.assert_series_equal(np.sqrt(float_frame["A"]), applied["A"]) - - # aggregator - applied = float_frame.apply(np.mean) - assert applied["A"] == np.mean(float_frame["A"]) - - d = float_frame.index[0] - applied = float_frame.apply(np.mean, axis=1) - assert applied[d] == np.mean(float_frame.xs(d)) - assert applied.index is float_frame.index # want this - - # invalid axis - df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=["a", "a", "c"]) - msg = "No axis named 2 for object type DataFrame" - with pytest.raises(ValueError, match=msg): - df.apply(lambda x: x, 2) - - # GH 9573 - df = DataFrame({"c0": ["A", "A", "B", "B"], "c1": ["C", "C", "D", "D"]}) - df = df.apply(lambda ts: ts.astype("category")) - - assert df.shape == (4, 2) - assert isinstance(df["c0"].dtype, CategoricalDtype) - assert isinstance(df["c1"].dtype, CategoricalDtype) - - def test_apply_axis1_with_ea(self): - # GH#36785 - df = DataFrame({"A": [Timestamp("2013-01-01", tz="UTC")]}) - result = df.apply(lambda x: x, axis=1) - tm.assert_frame_equal(result, df) - - def test_apply_mixed_datetimelike(self): - # mixed datetimelike - # GH 7778 - df = DataFrame( - { - "A": date_range("20130101", periods=3), - "B": pd.to_timedelta(np.arange(3), unit="s"), - } - ) - result = df.apply(lambda x: x, axis=1) - tm.assert_frame_equal(result, df) - - def test_apply_empty(self, float_frame): - # empty - empty_frame = DataFrame() - - applied = empty_frame.apply(np.sqrt) - assert applied.empty - - applied = empty_frame.apply(np.mean) - assert applied.empty - - no_rows = float_frame[:0] - 
result = no_rows.apply(lambda x: x.mean()) - expected = Series(np.nan, index=float_frame.columns) - tm.assert_series_equal(result, expected) - - no_cols = float_frame.loc[:, []] - result = no_cols.apply(lambda x: x.mean(), axis=1) - expected = Series(np.nan, index=float_frame.index) - tm.assert_series_equal(result, expected) - - # GH 2476 - expected = DataFrame(index=["a"]) - result = expected.apply(lambda x: x["a"], axis=1) - tm.assert_frame_equal(expected, result) - - def test_apply_with_reduce_empty(self): - # reduce with an empty DataFrame - empty_frame = DataFrame() - - x = [] - result = empty_frame.apply(x.append, axis=1, result_type="expand") - tm.assert_frame_equal(result, empty_frame) - result = empty_frame.apply(x.append, axis=1, result_type="reduce") - expected = Series([], index=pd.Index([], dtype=object), dtype=np.float64) - tm.assert_series_equal(result, expected) - - empty_with_cols = DataFrame(columns=["a", "b", "c"]) - result = empty_with_cols.apply(x.append, axis=1, result_type="expand") - tm.assert_frame_equal(result, empty_with_cols) - result = empty_with_cols.apply(x.append, axis=1, result_type="reduce") - expected = Series([], index=pd.Index([], dtype=object), dtype=np.float64) - tm.assert_series_equal(result, expected) - - # Ensure that x.append hasn't been called - assert x == [] - - @pytest.mark.parametrize("func", ["sum", "prod", "any", "all"]) - def test_apply_funcs_over_empty(self, func): - # GH 28213 - df = DataFrame(columns=["a", "b", "c"]) - - result = df.apply(getattr(np, func)) - expected = getattr(df, func)() - tm.assert_series_equal(result, expected) - - def test_nunique_empty(self): - # GH 28213 - df = DataFrame(columns=["a", "b", "c"]) - - result = df.nunique() - expected = Series(0, index=df.columns) - tm.assert_series_equal(result, expected) - - result = df.T.nunique() - expected = Series([], index=pd.Index([]), dtype=np.float64) - tm.assert_series_equal(result, expected) - - def test_apply_standard_nonunique(self): - df = 
DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=["a", "a", "c"]) - - result = df.apply(lambda s: s[0], axis=1) - expected = Series([1, 4, 7], ["a", "a", "c"]) - tm.assert_series_equal(result, expected) - - result = df.T.apply(lambda s: s[0], axis=0) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize("func", ["sum", "mean", "min", "max", "std"]) - @pytest.mark.parametrize( - "args,kwds", - [ - pytest.param([], {}, id="no_args_or_kwds"), - pytest.param([1], {}, id="axis_from_args"), - pytest.param([], {"axis": 1}, id="axis_from_kwds"), - pytest.param([], {"numeric_only": True}, id="optional_kwds"), - pytest.param([1, None], {"numeric_only": True}, id="args_and_kwds"), - ], +def test_apply(float_frame): + with np.errstate(all="ignore"): + # ufunc + applied = float_frame.apply(np.sqrt) + tm.assert_series_equal(np.sqrt(float_frame["A"]), applied["A"]) + + # aggregator + applied = float_frame.apply(np.mean) + assert applied["A"] == np.mean(float_frame["A"]) + + d = float_frame.index[0] + applied = float_frame.apply(np.mean, axis=1) + assert applied[d] == np.mean(float_frame.xs(d)) + assert applied.index is float_frame.index # want this + + # invalid axis + df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=["a", "a", "c"]) + msg = "No axis named 2 for object type DataFrame" + with pytest.raises(ValueError, match=msg): + df.apply(lambda x: x, 2) + + # GH 9573 + df = DataFrame({"c0": ["A", "A", "B", "B"], "c1": ["C", "C", "D", "D"]}) + df = df.apply(lambda ts: ts.astype("category")) + + assert df.shape == (4, 2) + assert isinstance(df["c0"].dtype, CategoricalDtype) + assert isinstance(df["c1"].dtype, CategoricalDtype) + + +def test_apply_axis1_with_ea(): + # GH#36785 + df = DataFrame({"A": [Timestamp("2013-01-01", tz="UTC")]}) + result = df.apply(lambda x: x, axis=1) + tm.assert_frame_equal(result, df) + + +def test_apply_mixed_datetimelike(): + # mixed datetimelike + # GH 7778 + df = DataFrame( + { + "A": date_range("20130101", periods=3), + 
"B": pd.to_timedelta(np.arange(3), unit="s"), + } ) - @pytest.mark.parametrize("how", ["agg", "apply"]) - def test_apply_with_string_funcs(self, request, float_frame, func, args, kwds, how): - if len(args) > 1 and how == "agg": - request.node.add_marker( - pytest.mark.xfail( - reason="agg/apply signature mismatch - agg passes 2nd " - "argument to func" - ) + result = df.apply(lambda x: x, axis=1) + tm.assert_frame_equal(result, df) + + +def test_apply_empty(float_frame): + # empty + empty_frame = DataFrame() + + applied = empty_frame.apply(np.sqrt) + assert applied.empty + + applied = empty_frame.apply(np.mean) + assert applied.empty + + no_rows = float_frame[:0] + result = no_rows.apply(lambda x: x.mean()) + expected = Series(np.nan, index=float_frame.columns) + tm.assert_series_equal(result, expected) + + no_cols = float_frame.loc[:, []] + result = no_cols.apply(lambda x: x.mean(), axis=1) + expected = Series(np.nan, index=float_frame.index) + tm.assert_series_equal(result, expected) + + # GH 2476 + expected = DataFrame(index=["a"]) + result = expected.apply(lambda x: x["a"], axis=1) + tm.assert_frame_equal(expected, result) + + +def test_apply_with_reduce_empty(): + # reduce with an empty DataFrame + empty_frame = DataFrame() + + x = [] + result = empty_frame.apply(x.append, axis=1, result_type="expand") + tm.assert_frame_equal(result, empty_frame) + result = empty_frame.apply(x.append, axis=1, result_type="reduce") + expected = Series([], index=pd.Index([], dtype=object), dtype=np.float64) + tm.assert_series_equal(result, expected) + + empty_with_cols = DataFrame(columns=["a", "b", "c"]) + result = empty_with_cols.apply(x.append, axis=1, result_type="expand") + tm.assert_frame_equal(result, empty_with_cols) + result = empty_with_cols.apply(x.append, axis=1, result_type="reduce") + expected = Series([], index=pd.Index([], dtype=object), dtype=np.float64) + tm.assert_series_equal(result, expected) + + # Ensure that x.append hasn't been called + assert x == [] + + 
+@pytest.mark.parametrize("func", ["sum", "prod", "any", "all"]) +def test_apply_funcs_over_empty(func): + # GH 28213 + df = DataFrame(columns=["a", "b", "c"]) + + result = df.apply(getattr(np, func)) + expected = getattr(df, func)() + tm.assert_series_equal(result, expected) + + +def test_nunique_empty(): + # GH 28213 + df = DataFrame(columns=["a", "b", "c"]) + + result = df.nunique() + expected = Series(0, index=df.columns) + tm.assert_series_equal(result, expected) + + result = df.T.nunique() + expected = Series([], index=pd.Index([]), dtype=np.float64) + tm.assert_series_equal(result, expected) + + +def test_apply_standard_nonunique(): + df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=["a", "a", "c"]) + + result = df.apply(lambda s: s[0], axis=1) + expected = Series([1, 4, 7], ["a", "a", "c"]) + tm.assert_series_equal(result, expected) + + result = df.T.apply(lambda s: s[0], axis=0) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("func", ["sum", "mean", "min", "max", "std"]) +@pytest.mark.parametrize( + "args,kwds", + [ + pytest.param([], {}, id="no_args_or_kwds"), + pytest.param([1], {}, id="axis_from_args"), + pytest.param([], {"axis": 1}, id="axis_from_kwds"), + pytest.param([], {"numeric_only": True}, id="optional_kwds"), + pytest.param([1, None], {"numeric_only": True}, id="args_and_kwds"), + ], +) +@pytest.mark.parametrize("how", ["agg", "apply"]) +def test_apply_with_string_funcs(request, float_frame, func, args, kwds, how): + if len(args) > 1 and how == "agg": + request.node.add_marker( + pytest.mark.xfail( + reason="agg/apply signature mismatch - agg passes 2nd " + "argument to func" ) - result = getattr(float_frame, how)(func, *args, **kwds) - expected = getattr(float_frame, func)(*args, **kwds) - tm.assert_series_equal(result, expected) + ) + result = getattr(float_frame, how)(func, *args, **kwds) + expected = getattr(float_frame, func)(*args, **kwds) + tm.assert_series_equal(result, expected) - 
@pytest.mark.parametrize( - "how, args", [("pct_change", ()), ("nsmallest", (1, ["a", "b"])), ("tail", 1)] + +@pytest.mark.parametrize( + "how, args", [("pct_change", ()), ("nsmallest", (1, ["a", "b"])), ("tail", 1)] +) +def test_apply_str_axis_1_raises(how, args): + # GH 39211 - some ops don't support axis=1 + df = DataFrame({"a": [1, 2], "b": [3, 4]}) + msg = f"Operation {how} does not support axis=1" + with pytest.raises(ValueError, match=msg): + df.apply(how, axis=1, args=args) + + +def test_apply_broadcast(float_frame, int_frame_const_col): + + # scalars + result = float_frame.apply(np.mean, result_type="broadcast") + expected = DataFrame([float_frame.mean()], index=float_frame.index) + tm.assert_frame_equal(result, expected) + + result = float_frame.apply(np.mean, axis=1, result_type="broadcast") + m = float_frame.mean(axis=1) + expected = DataFrame({c: m for c in float_frame.columns}) + tm.assert_frame_equal(result, expected) + + # lists + result = float_frame.apply( + lambda x: list(range(len(float_frame.columns))), + axis=1, + result_type="broadcast", ) - def test_apply_str_axis_1_raises(self, how, args): - # GH 39211 - some ops don't support axis=1 - df = DataFrame({"a": [1, 2], "b": [3, 4]}) - msg = f"Operation {how} does not support axis=1" - with pytest.raises(ValueError, match=msg): - df.apply(how, axis=1, args=args) - - def test_apply_broadcast(self, float_frame, int_frame_const_col): - - # scalars - result = float_frame.apply(np.mean, result_type="broadcast") - expected = DataFrame([float_frame.mean()], index=float_frame.index) - tm.assert_frame_equal(result, expected) + m = list(range(len(float_frame.columns))) + expected = DataFrame( + [m] * len(float_frame.index), + dtype="float64", + index=float_frame.index, + columns=float_frame.columns, + ) + tm.assert_frame_equal(result, expected) - result = float_frame.apply(np.mean, axis=1, result_type="broadcast") - m = float_frame.mean(axis=1) - expected = DataFrame({c: m for c in float_frame.columns}) - 
tm.assert_frame_equal(result, expected) + result = float_frame.apply( + lambda x: list(range(len(float_frame.index))), result_type="broadcast" + ) + m = list(range(len(float_frame.index))) + expected = DataFrame( + {c: m for c in float_frame.columns}, + dtype="float64", + index=float_frame.index, + ) + tm.assert_frame_equal(result, expected) - # lists - result = float_frame.apply( - lambda x: list(range(len(float_frame.columns))), - axis=1, - result_type="broadcast", - ) - m = list(range(len(float_frame.columns))) - expected = DataFrame( - [m] * len(float_frame.index), - dtype="float64", - index=float_frame.index, - columns=float_frame.columns, - ) - tm.assert_frame_equal(result, expected) + # preserve columns + df = int_frame_const_col + result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="broadcast") + tm.assert_frame_equal(result, df) - result = float_frame.apply( - lambda x: list(range(len(float_frame.index))), result_type="broadcast" - ) - m = list(range(len(float_frame.index))) - expected = DataFrame( - {c: m for c in float_frame.columns}, - dtype="float64", - index=float_frame.index, - ) - tm.assert_frame_equal(result, expected) + df = int_frame_const_col + result = df.apply( + lambda x: Series([1, 2, 3], index=list("abc")), + axis=1, + result_type="broadcast", + ) + expected = df.copy() + tm.assert_frame_equal(result, expected) - # preserve columns - df = int_frame_const_col - result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="broadcast") - tm.assert_frame_equal(result, df) - df = int_frame_const_col - result = df.apply( - lambda x: Series([1, 2, 3], index=list("abc")), +def test_apply_broadcast_error(int_frame_const_col): + df = int_frame_const_col + + # > 1 ndim + msg = "too many dims to broadcast" + with pytest.raises(ValueError, match=msg): + df.apply( + lambda x: np.array([1, 2]).reshape(-1, 2), axis=1, result_type="broadcast", ) - expected = df.copy() - tm.assert_frame_equal(result, expected) - def test_apply_broadcast_error(self, 
int_frame_const_col): - df = int_frame_const_col + # cannot broadcast + msg = "cannot broadcast result" + with pytest.raises(ValueError, match=msg): + df.apply(lambda x: [1, 2], axis=1, result_type="broadcast") - # > 1 ndim - msg = "too many dims to broadcast" - with pytest.raises(ValueError, match=msg): - df.apply( - lambda x: np.array([1, 2]).reshape(-1, 2), - axis=1, - result_type="broadcast", - ) + with pytest.raises(ValueError, match=msg): + df.apply(lambda x: Series([1, 2]), axis=1, result_type="broadcast") - # cannot broadcast - msg = "cannot broadcast result" - with pytest.raises(ValueError, match=msg): - df.apply(lambda x: [1, 2], axis=1, result_type="broadcast") - with pytest.raises(ValueError, match=msg): - df.apply(lambda x: Series([1, 2]), axis=1, result_type="broadcast") +def test_apply_raw(float_frame, mixed_type_frame): + def _assert_raw(x): + assert isinstance(x, np.ndarray) + assert x.ndim == 1 - def test_apply_raw(self, float_frame, mixed_type_frame): - def _assert_raw(x): - assert isinstance(x, np.ndarray) - assert x.ndim == 1 + float_frame.apply(_assert_raw, raw=True) + float_frame.apply(_assert_raw, axis=1, raw=True) - float_frame.apply(_assert_raw, raw=True) - float_frame.apply(_assert_raw, axis=1, raw=True) + result0 = float_frame.apply(np.mean, raw=True) + result1 = float_frame.apply(np.mean, axis=1, raw=True) - result0 = float_frame.apply(np.mean, raw=True) - result1 = float_frame.apply(np.mean, axis=1, raw=True) + expected0 = float_frame.apply(lambda x: x.values.mean()) + expected1 = float_frame.apply(lambda x: x.values.mean(), axis=1) - expected0 = float_frame.apply(lambda x: x.values.mean()) - expected1 = float_frame.apply(lambda x: x.values.mean(), axis=1) + tm.assert_series_equal(result0, expected0) + tm.assert_series_equal(result1, expected1) - tm.assert_series_equal(result0, expected0) - tm.assert_series_equal(result1, expected1) + # no reduction + result = float_frame.apply(lambda x: x * 2, raw=True) + expected = float_frame * 2 + 
tm.assert_frame_equal(result, expected) - # no reduction - result = float_frame.apply(lambda x: x * 2, raw=True) - expected = float_frame * 2 - tm.assert_frame_equal(result, expected) + # Mixed dtype (GH-32423) + mixed_type_frame.apply(_assert_raw, raw=True) + mixed_type_frame.apply(_assert_raw, axis=1, raw=True) - # Mixed dtype (GH-32423) - mixed_type_frame.apply(_assert_raw, raw=True) - mixed_type_frame.apply(_assert_raw, axis=1, raw=True) - def test_apply_axis1(self, float_frame): - d = float_frame.index[0] - tapplied = float_frame.apply(np.mean, axis=1) - assert tapplied[d] == np.mean(float_frame.xs(d)) - - def test_apply_mixed_dtype_corner(self): - df = DataFrame({"A": ["foo"], "B": [1.0]}) - result = df[:0].apply(np.mean, axis=1) - # the result here is actually kind of ambiguous, should it be a Series - # or a DataFrame? - expected = Series(np.nan, index=pd.Index([], dtype="int64")) - tm.assert_series_equal(result, expected) - - df = DataFrame({"A": ["foo"], "B": [1.0]}) - result = df.apply(lambda x: x["A"], axis=1) - expected = Series(["foo"], index=[0]) - tm.assert_series_equal(result, expected) - - result = df.apply(lambda x: x["B"], axis=1) - expected = Series([1.0], index=[0]) - tm.assert_series_equal(result, expected) - - def test_apply_empty_infer_type(self): - no_cols = DataFrame(index=["a", "b", "c"]) - no_index = DataFrame(columns=["a", "b", "c"]) - - def _check(df, f): - with warnings.catch_warnings(record=True): - warnings.simplefilter("ignore", RuntimeWarning) - test_res = f(np.array([], dtype="f8")) - is_reduction = not isinstance(test_res, np.ndarray) - - def _checkit(axis=0, raw=False): - result = df.apply(f, axis=axis, raw=raw) - if is_reduction: - agg_axis = df._get_agg_axis(axis) - assert isinstance(result, Series) - assert result.index is agg_axis - else: - assert isinstance(result, DataFrame) - - _checkit() - _checkit(axis=1) - _checkit(raw=True) - _checkit(axis=0, raw=True) +def test_apply_axis1(float_frame): + d = float_frame.index[0] + 
tapplied = float_frame.apply(np.mean, axis=1) + assert tapplied[d] == np.mean(float_frame.xs(d)) - with np.errstate(all="ignore"): - _check(no_cols, lambda x: x) - _check(no_cols, lambda x: x.mean()) - _check(no_index, lambda x: x) - _check(no_index, lambda x: x.mean()) - result = no_cols.apply(lambda x: x.mean(), result_type="broadcast") - assert isinstance(result, DataFrame) +def test_apply_mixed_dtype_corner(): + df = DataFrame({"A": ["foo"], "B": [1.0]}) + result = df[:0].apply(np.mean, axis=1) + # the result here is actually kind of ambiguous, should it be a Series + # or a DataFrame? + expected = Series(np.nan, index=pd.Index([], dtype="int64")) + tm.assert_series_equal(result, expected) - def test_apply_with_args_kwds(self, float_frame): - def add_some(x, howmuch=0): - return x + howmuch + df = DataFrame({"A": ["foo"], "B": [1.0]}) + result = df.apply(lambda x: x["A"], axis=1) + expected = Series(["foo"], index=[0]) + tm.assert_series_equal(result, expected) - def agg_and_add(x, howmuch=0): - return x.mean() + howmuch + result = df.apply(lambda x: x["B"], axis=1) + expected = Series([1.0], index=[0]) + tm.assert_series_equal(result, expected) - def subtract_and_divide(x, sub, divide=1): - return (x - sub) / divide - result = float_frame.apply(add_some, howmuch=2) - expected = float_frame.apply(lambda x: x + 2) - tm.assert_frame_equal(result, expected) +def test_apply_empty_infer_type(): + no_cols = DataFrame(index=["a", "b", "c"]) + no_index = DataFrame(columns=["a", "b", "c"]) - result = float_frame.apply(agg_and_add, howmuch=2) - expected = float_frame.apply(lambda x: x.mean() + 2) - tm.assert_series_equal(result, expected) + def _check(df, f): + with warnings.catch_warnings(record=True): + warnings.simplefilter("ignore", RuntimeWarning) + test_res = f(np.array([], dtype="f8")) + is_reduction = not isinstance(test_res, np.ndarray) - result = float_frame.apply(subtract_and_divide, args=(2,), divide=2) - expected = float_frame.apply(lambda x: (x - 2.0) / 
2.0) - tm.assert_frame_equal(result, expected) + def _checkit(axis=0, raw=False): + result = df.apply(f, axis=axis, raw=raw) + if is_reduction: + agg_axis = df._get_agg_axis(axis) + assert isinstance(result, Series) + assert result.index is agg_axis + else: + assert isinstance(result, DataFrame) - def test_apply_yield_list(self, float_frame): - result = float_frame.apply(list) - tm.assert_frame_equal(result, float_frame) + _checkit() + _checkit(axis=1) + _checkit(raw=True) + _checkit(axis=0, raw=True) - def test_apply_reduce_Series(self, float_frame): - float_frame["A"].iloc[::2] = np.nan - expected = float_frame.mean(1) - result = float_frame.apply(np.mean, axis=1) - tm.assert_series_equal(result, expected) + with np.errstate(all="ignore"): + _check(no_cols, lambda x: x) + _check(no_cols, lambda x: x.mean()) + _check(no_index, lambda x: x) + _check(no_index, lambda x: x.mean()) - def test_apply_reduce_to_dict(self): - # GH 25196 37544 - data = DataFrame([[1, 2], [3, 4]], columns=["c0", "c1"], index=["i0", "i1"]) + result = no_cols.apply(lambda x: x.mean(), result_type="broadcast") + assert isinstance(result, DataFrame) - result0 = data.apply(dict, axis=0) - expected0 = Series([{"i0": 1, "i1": 3}, {"i0": 2, "i1": 4}], index=data.columns) - tm.assert_series_equal(result0, expected0) - result1 = data.apply(dict, axis=1) - expected1 = Series([{"c0": 1, "c1": 2}, {"c0": 3, "c1": 4}], index=data.index) - tm.assert_series_equal(result1, expected1) +def test_apply_with_args_kwds(float_frame): + def add_some(x, howmuch=0): + return x + howmuch - def test_apply_differently_indexed(self): - df = DataFrame(np.random.randn(20, 10)) + def agg_and_add(x, howmuch=0): + return x.mean() + howmuch - result0 = df.apply(Series.describe, axis=0) - expected0 = DataFrame( - {i: v.describe() for i, v in df.items()}, columns=df.columns - ) - tm.assert_frame_equal(result0, expected0) - - result1 = df.apply(Series.describe, axis=1) - expected1 = DataFrame( - {i: v.describe() for i, v in 
df.T.items()}, columns=df.index - ).T - tm.assert_frame_equal(result1, expected1) - - def test_apply_modify_traceback(self): - data = DataFrame( - { - "A": [ - "foo", - "foo", - "foo", - "foo", - "bar", - "bar", - "bar", - "bar", - "foo", - "foo", - "foo", - ], - "B": [ - "one", - "one", - "one", - "two", - "one", - "one", - "one", - "two", - "two", - "two", - "one", - ], - "C": [ - "dull", - "dull", - "shiny", - "dull", - "dull", - "shiny", - "shiny", - "dull", - "shiny", - "shiny", - "shiny", - ], - "D": np.random.randn(11), - "E": np.random.randn(11), - "F": np.random.randn(11), - } - ) + def subtract_and_divide(x, sub, divide=1): + return (x - sub) / divide - data.loc[4, "C"] = np.nan + result = float_frame.apply(add_some, howmuch=2) + expected = float_frame.apply(lambda x: x + 2) + tm.assert_frame_equal(result, expected) - def transform(row): - if row["C"].startswith("shin") and row["A"] == "foo": - row["D"] = 7 - return row + result = float_frame.apply(agg_and_add, howmuch=2) + expected = float_frame.apply(lambda x: x.mean() + 2) + tm.assert_series_equal(result, expected) - def transform2(row): - if notna(row["C"]) and row["C"].startswith("shin") and row["A"] == "foo": - row["D"] = 7 - return row + result = float_frame.apply(subtract_and_divide, args=(2,), divide=2) + expected = float_frame.apply(lambda x: (x - 2.0) / 2.0) + tm.assert_frame_equal(result, expected) - msg = "'float' object has no attribute 'startswith'" - with pytest.raises(AttributeError, match=msg): - data.apply(transform, axis=1) - def test_apply_bug(self): +def test_apply_yield_list(float_frame): + result = float_frame.apply(list) + tm.assert_frame_equal(result, float_frame) - # GH 6125 - positions = DataFrame( - [ - [1, "ABC0", 50], - [1, "YUM0", 20], - [1, "DEF0", 20], - [2, "ABC1", 50], - [2, "YUM1", 20], - [2, "DEF1", 20], + +def test_apply_reduce_Series(float_frame): + float_frame["A"].iloc[::2] = np.nan + expected = float_frame.mean(1) + result = float_frame.apply(np.mean, axis=1) + 
tm.assert_series_equal(result, expected) + + +def test_apply_reduce_to_dict(): + # GH 25196 37544 + data = DataFrame([[1, 2], [3, 4]], columns=["c0", "c1"], index=["i0", "i1"]) + + result0 = data.apply(dict, axis=0) + expected0 = Series([{"i0": 1, "i1": 3}, {"i0": 2, "i1": 4}], index=data.columns) + tm.assert_series_equal(result0, expected0) + + result1 = data.apply(dict, axis=1) + expected1 = Series([{"c0": 1, "c1": 2}, {"c0": 3, "c1": 4}], index=data.index) + tm.assert_series_equal(result1, expected1) + + +def test_apply_differently_indexed(): + df = DataFrame(np.random.randn(20, 10)) + + result0 = df.apply(Series.describe, axis=0) + expected0 = DataFrame({i: v.describe() for i, v in df.items()}, columns=df.columns) + tm.assert_frame_equal(result0, expected0) + + result1 = df.apply(Series.describe, axis=1) + expected1 = DataFrame( + {i: v.describe() for i, v in df.T.items()}, columns=df.index + ).T + tm.assert_frame_equal(result1, expected1) + + +def test_apply_modify_traceback(): + data = DataFrame( + { + "A": [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", ], - columns=["a", "market", "position"], - ) + "B": [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + "C": [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + "D": np.random.randn(11), + "E": np.random.randn(11), + "F": np.random.randn(11), + } + ) - def f(r): - return r["market"] + data.loc[4, "C"] = np.nan - expected = positions.apply(f, axis=1) + def transform(row): + if row["C"].startswith("shin") and row["A"] == "foo": + row["D"] = 7 + return row - positions = DataFrame( - [ - [datetime(2013, 1, 1), "ABC0", 50], - [datetime(2013, 1, 2), "YUM0", 20], - [datetime(2013, 1, 3), "DEF0", 20], - [datetime(2013, 1, 4), "ABC1", 50], - [datetime(2013, 1, 5), "YUM1", 20], - [datetime(2013, 1, 6), "DEF1", 20], + def transform2(row): + 
if notna(row["C"]) and row["C"].startswith("shin") and row["A"] == "foo": + row["D"] = 7 + return row + + msg = "'float' object has no attribute 'startswith'" + with pytest.raises(AttributeError, match=msg): + data.apply(transform, axis=1) + + +def test_apply_bug(): + + # GH 6125 + positions = DataFrame( + [ + [1, "ABC0", 50], + [1, "YUM0", 20], + [1, "DEF0", 20], + [2, "ABC1", 50], + [2, "YUM1", 20], + [2, "DEF1", 20], + ], + columns=["a", "market", "position"], + ) + + def f(r): + return r["market"] + + expected = positions.apply(f, axis=1) + + positions = DataFrame( + [ + [datetime(2013, 1, 1), "ABC0", 50], + [datetime(2013, 1, 2), "YUM0", 20], + [datetime(2013, 1, 3), "DEF0", 20], + [datetime(2013, 1, 4), "ABC1", 50], + [datetime(2013, 1, 5), "YUM1", 20], + [datetime(2013, 1, 6), "DEF1", 20], + ], + columns=["a", "market", "position"], + ) + result = positions.apply(f, axis=1) + tm.assert_series_equal(result, expected) + + +def test_apply_convert_objects(): + data = DataFrame( + { + "A": [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", ], - columns=["a", "market", "position"], - ) - result = positions.apply(f, axis=1) - tm.assert_series_equal(result, expected) - - def test_apply_convert_objects(self): - data = DataFrame( - { - "A": [ - "foo", - "foo", - "foo", - "foo", - "bar", - "bar", - "bar", - "bar", - "foo", - "foo", - "foo", - ], - "B": [ - "one", - "one", - "one", - "two", - "one", - "one", - "one", - "two", - "two", - "two", - "one", - ], - "C": [ - "dull", - "dull", - "shiny", - "dull", - "dull", - "shiny", - "shiny", - "dull", - "shiny", - "shiny", - "shiny", - ], - "D": np.random.randn(11), - "E": np.random.randn(11), - "F": np.random.randn(11), - } - ) + "B": [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + "C": [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + "D": 
np.random.randn(11), + "E": np.random.randn(11), + "F": np.random.randn(11), + } + ) - result = data.apply(lambda x: x, axis=1) - tm.assert_frame_equal(result._convert(datetime=True), data) + result = data.apply(lambda x: x, axis=1) + tm.assert_frame_equal(result._convert(datetime=True), data) - def test_apply_attach_name(self, float_frame): - result = float_frame.apply(lambda x: x.name) - expected = Series(float_frame.columns, index=float_frame.columns) - tm.assert_series_equal(result, expected) - result = float_frame.apply(lambda x: x.name, axis=1) - expected = Series(float_frame.index, index=float_frame.index) - tm.assert_series_equal(result, expected) +def test_apply_attach_name(float_frame): + result = float_frame.apply(lambda x: x.name) + expected = Series(float_frame.columns, index=float_frame.columns) + tm.assert_series_equal(result, expected) - # non-reductions - result = float_frame.apply(lambda x: np.repeat(x.name, len(x))) - expected = DataFrame( - np.tile(float_frame.columns, (len(float_frame.index), 1)), - index=float_frame.index, - columns=float_frame.columns, - ) - tm.assert_frame_equal(result, expected) + result = float_frame.apply(lambda x: x.name, axis=1) + expected = Series(float_frame.index, index=float_frame.index) + tm.assert_series_equal(result, expected) - result = float_frame.apply(lambda x: np.repeat(x.name, len(x)), axis=1) - expected = Series( - np.repeat(t[0], len(float_frame.columns)) for t in float_frame.itertuples() - ) - expected.index = float_frame.index - tm.assert_series_equal(result, expected) - - def test_apply_multi_index(self, float_frame): - index = MultiIndex.from_arrays([["a", "a", "b"], ["c", "d", "d"]]) - s = DataFrame([[1, 2], [3, 4], [5, 6]], index=index, columns=["col1", "col2"]) - result = s.apply(lambda x: Series({"min": min(x), "max": max(x)}), 1) - expected = DataFrame( - [[1, 2], [3, 4], [5, 6]], index=index, columns=["min", "max"] - ) - tm.assert_frame_equal(result, expected, check_like=True) - - def 
test_apply_dict(self): - - # GH 8735 - A = DataFrame([["foo", "bar"], ["spam", "eggs"]]) - A_dicts = Series([{0: "foo", 1: "spam"}, {0: "bar", 1: "eggs"}]) - B = DataFrame([[0, 1], [2, 3]]) - B_dicts = Series([{0: 0, 1: 2}, {0: 1, 1: 3}]) - fn = lambda x: x.to_dict() - - for df, dicts in [(A, A_dicts), (B, B_dicts)]: - reduce_true = df.apply(fn, result_type="reduce") - reduce_false = df.apply(fn, result_type="expand") - reduce_none = df.apply(fn) - - tm.assert_series_equal(reduce_true, dicts) - tm.assert_frame_equal(reduce_false, df) - tm.assert_series_equal(reduce_none, dicts) - - def test_applymap(self, float_frame): - applied = float_frame.applymap(lambda x: x * 2) - tm.assert_frame_equal(applied, float_frame * 2) - float_frame.applymap(type) - - # GH 465: function returning tuples - result = float_frame.applymap(lambda x: (x, x)) - assert isinstance(result["A"][0], tuple) - - # GH 2909: object conversion to float in constructor? - df = DataFrame(data=[1, "a"]) - result = df.applymap(lambda x: x) - assert result.dtypes[0] == object - - df = DataFrame(data=[1.0, "a"]) - result = df.applymap(lambda x: x) - assert result.dtypes[0] == object - - # GH 2786 - df = DataFrame(np.random.random((3, 4))) - df2 = df.copy() - cols = ["a", "a", "a", "a"] - df.columns = cols - - expected = df2.applymap(str) - expected.columns = cols - result = df.applymap(str) - tm.assert_frame_equal(result, expected) + # non-reductions + result = float_frame.apply(lambda x: np.repeat(x.name, len(x))) + expected = DataFrame( + np.tile(float_frame.columns, (len(float_frame.index), 1)), + index=float_frame.index, + columns=float_frame.columns, + ) + tm.assert_frame_equal(result, expected) - # datetime/timedelta - df["datetime"] = Timestamp("20130101") - df["timedelta"] = pd.Timedelta("1 min") - result = df.applymap(str) - for f in ["datetime", "timedelta"]: - assert result.loc[0, f] == str(df.loc[0, f]) + result = float_frame.apply(lambda x: np.repeat(x.name, len(x)), axis=1) + expected = 
Series( + np.repeat(t[0], len(float_frame.columns)) for t in float_frame.itertuples() + ) + expected.index = float_frame.index + tm.assert_series_equal(result, expected) - # GH 8222 - empty_frames = [ - DataFrame(), - DataFrame(columns=list("ABC")), - DataFrame(index=list("ABC")), - DataFrame({"A": [], "B": [], "C": []}), - ] - for frame in empty_frames: - for func in [round, lambda x: x]: - result = frame.applymap(func) - tm.assert_frame_equal(result, frame) - - def test_applymap_na_ignore(self, float_frame): - # GH 23803 - strlen_frame = float_frame.applymap(lambda x: len(str(x))) - float_frame_with_na = float_frame.copy() - mask = np.random.randint(0, 2, size=float_frame.shape, dtype=bool) - float_frame_with_na[mask] = pd.NA - strlen_frame_na_ignore = float_frame_with_na.applymap( - lambda x: len(str(x)), na_action="ignore" - ) - strlen_frame_with_na = strlen_frame.copy() - strlen_frame_with_na[mask] = pd.NA - tm.assert_frame_equal(strlen_frame_na_ignore, strlen_frame_with_na) - - with pytest.raises(ValueError, match="na_action must be .*Got 'abc'"): - float_frame_with_na.applymap(lambda x: len(str(x)), na_action="abc") - - def test_applymap_box_timestamps(self): - # GH 2689, GH 2627 - ser = Series(date_range("1/1/2000", periods=10)) - - def func(x): - return (x.hour, x.day, x.month) - - # it works! - DataFrame(ser).applymap(func) - - def test_applymap_box(self): - # ufunc will not be boxed. 
Same test cases as the test_map_box - df = DataFrame( - { - "a": [Timestamp("2011-01-01"), Timestamp("2011-01-02")], - "b": [ - Timestamp("2011-01-01", tz="US/Eastern"), - Timestamp("2011-01-02", tz="US/Eastern"), - ], - "c": [pd.Timedelta("1 days"), pd.Timedelta("2 days")], - "d": [ - pd.Period("2011-01-01", freq="M"), - pd.Period("2011-01-02", freq="M"), - ], - } - ) - result = df.applymap(lambda x: type(x).__name__) - expected = DataFrame( - { - "a": ["Timestamp", "Timestamp"], - "b": ["Timestamp", "Timestamp"], - "c": ["Timedelta", "Timedelta"], - "d": ["Period", "Period"], - } - ) - tm.assert_frame_equal(result, expected) +def test_apply_multi_index(float_frame): + index = MultiIndex.from_arrays([["a", "a", "b"], ["c", "d", "d"]]) + s = DataFrame([[1, 2], [3, 4], [5, 6]], index=index, columns=["col1", "col2"]) + result = s.apply(lambda x: Series({"min": min(x), "max": max(x)}), 1) + expected = DataFrame([[1, 2], [3, 4], [5, 6]], index=index, columns=["min", "max"]) + tm.assert_frame_equal(result, expected, check_like=True) - def test_frame_apply_dont_convert_datetime64(self): - from pandas.tseries.offsets import BDay - df = DataFrame({"x1": [datetime(1996, 1, 1)]}) +def test_apply_dict(): - df = df.applymap(lambda x: x + BDay()) - df = df.applymap(lambda x: x + BDay()) + # GH 8735 + A = DataFrame([["foo", "bar"], ["spam", "eggs"]]) + A_dicts = Series([{0: "foo", 1: "spam"}, {0: "bar", 1: "eggs"}]) + B = DataFrame([[0, 1], [2, 3]]) + B_dicts = Series([{0: 0, 1: 2}, {0: 1, 1: 3}]) + fn = lambda x: x.to_dict() - assert df.x1.dtype == "M8[ns]" + for df, dicts in [(A, A_dicts), (B, B_dicts)]: + reduce_true = df.apply(fn, result_type="reduce") + reduce_false = df.apply(fn, result_type="expand") + reduce_none = df.apply(fn) - def test_apply_non_numpy_dtype(self): - # GH 12244 - df = DataFrame( - {"dt": pd.date_range("2015-01-01", periods=3, tz="Europe/Brussels")} - ) - result = df.apply(lambda x: x) - tm.assert_frame_equal(result, df) + 
tm.assert_series_equal(reduce_true, dicts) + tm.assert_frame_equal(reduce_false, df) + tm.assert_series_equal(reduce_none, dicts) - result = df.apply(lambda x: x + pd.Timedelta("1day")) - expected = DataFrame( - {"dt": pd.date_range("2015-01-02", periods=3, tz="Europe/Brussels")} - ) - tm.assert_frame_equal(result, expected) - df = DataFrame({"dt": ["a", "b", "c", "a"]}, dtype="category") - result = df.apply(lambda x: x) - tm.assert_frame_equal(result, df) +def test_applymap(float_frame): + applied = float_frame.applymap(lambda x: x * 2) + tm.assert_frame_equal(applied, float_frame * 2) + float_frame.applymap(type) - def test_apply_dup_names_multi_agg(self): - # GH 21063 - df = DataFrame([[0, 1], [2, 3]], columns=["a", "a"]) - expected = DataFrame([[0, 1]], columns=["a", "a"], index=["min"]) - result = df.agg(["min"]) + # GH 465: function returning tuples + result = float_frame.applymap(lambda x: (x, x)) + assert isinstance(result["A"][0], tuple) - tm.assert_frame_equal(result, expected) + # GH 2909: object conversion to float in constructor? 
+ df = DataFrame(data=[1, "a"]) + result = df.applymap(lambda x: x) + assert result.dtypes[0] == object - def test_apply_nested_result_axis_1(self): - # GH 13820 - def apply_list(row): - return [2 * row["A"], 2 * row["C"], 2 * row["B"]] + df = DataFrame(data=[1.0, "a"]) + result = df.applymap(lambda x: x) + assert result.dtypes[0] == object - df = DataFrame(np.zeros((4, 4)), columns=list("ABCD")) - result = df.apply(apply_list, axis=1) - expected = Series( - [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]] - ) - tm.assert_series_equal(result, expected) + # GH 2786 + df = DataFrame(np.random.random((3, 4))) + df2 = df.copy() + cols = ["a", "a", "a", "a"] + df.columns = cols - def test_apply_noreduction_tzaware_object(self): - # https://github.com/pandas-dev/pandas/issues/31505 - df = DataFrame( - {"foo": [Timestamp("2020", tz="UTC")]}, dtype="datetime64[ns, UTC]" - ) - result = df.apply(lambda x: x) - tm.assert_frame_equal(result, df) - result = df.apply(lambda x: x.copy()) - tm.assert_frame_equal(result, df) + expected = df2.applymap(str) + expected.columns = cols + result = df.applymap(str) + tm.assert_frame_equal(result, expected) - def test_apply_function_runs_once(self): - # https://github.com/pandas-dev/pandas/issues/30815 + # datetime/timedelta + df["datetime"] = Timestamp("20130101") + df["timedelta"] = pd.Timedelta("1 min") + result = df.applymap(str) + for f in ["datetime", "timedelta"]: + assert result.loc[0, f] == str(df.loc[0, f]) + + # GH 8222 + empty_frames = [ + DataFrame(), + DataFrame(columns=list("ABC")), + DataFrame(index=list("ABC")), + DataFrame({"A": [], "B": [], "C": []}), + ] + for frame in empty_frames: + for func in [round, lambda x: x]: + result = frame.applymap(func) + tm.assert_frame_equal(result, frame) + + +def test_applymap_na_ignore(float_frame): + # GH 23803 + strlen_frame = float_frame.applymap(lambda x: len(str(x))) + float_frame_with_na = float_frame.copy() + mask = np.random.randint(0, 2, 
size=float_frame.shape, dtype=bool) + float_frame_with_na[mask] = pd.NA + strlen_frame_na_ignore = float_frame_with_na.applymap( + lambda x: len(str(x)), na_action="ignore" + ) + strlen_frame_with_na = strlen_frame.copy() + strlen_frame_with_na[mask] = pd.NA + tm.assert_frame_equal(strlen_frame_na_ignore, strlen_frame_with_na) - df = DataFrame({"a": [1, 2, 3]}) - names = [] # Save row names function is applied to + with pytest.raises(ValueError, match="na_action must be .*Got 'abc'"): + float_frame_with_na.applymap(lambda x: len(str(x)), na_action="abc") - def reducing_function(row): - names.append(row.name) - def non_reducing_function(row): - names.append(row.name) - return row +def test_applymap_box_timestamps(): + # GH 2689, GH 2627 + ser = Series(date_range("1/1/2000", periods=10)) - for func in [reducing_function, non_reducing_function]: - del names[:] + def func(x): + return (x.hour, x.day, x.month) - df.apply(func, axis=1) - assert names == list(df.index) + # it works! + DataFrame(ser).applymap(func) - def test_apply_raw_function_runs_once(self): - # https://github.com/pandas-dev/pandas/issues/34506 - df = DataFrame({"a": [1, 2, 3]}) - values = [] # Save row values function is applied to +def test_applymap_box(): + # ufunc will not be boxed. 
Same test cases as the test_map_box + df = DataFrame( + { + "a": [Timestamp("2011-01-01"), Timestamp("2011-01-02")], + "b": [ + Timestamp("2011-01-01", tz="US/Eastern"), + Timestamp("2011-01-02", tz="US/Eastern"), + ], + "c": [pd.Timedelta("1 days"), pd.Timedelta("2 days")], + "d": [ + pd.Period("2011-01-01", freq="M"), + pd.Period("2011-01-02", freq="M"), + ], + } + ) - def reducing_function(row): - values.extend(row) + result = df.applymap(lambda x: type(x).__name__) + expected = DataFrame( + { + "a": ["Timestamp", "Timestamp"], + "b": ["Timestamp", "Timestamp"], + "c": ["Timedelta", "Timedelta"], + "d": ["Period", "Period"], + } + ) + tm.assert_frame_equal(result, expected) - def non_reducing_function(row): - values.extend(row) - return row - for func in [reducing_function, non_reducing_function]: - del values[:] +def test_frame_apply_dont_convert_datetime64(): + from pandas.tseries.offsets import BDay - df.apply(func, raw=True, axis=1) - assert values == list(df.a.to_list()) + df = DataFrame({"x1": [datetime(1996, 1, 1)]}) - def test_applymap_function_runs_once(self): + df = df.applymap(lambda x: x + BDay()) + df = df.applymap(lambda x: x + BDay()) - df = DataFrame({"a": [1, 2, 3]}) - values = [] # Save values function is applied to + assert df.x1.dtype == "M8[ns]" - def reducing_function(val): - values.append(val) - def non_reducing_function(val): - values.append(val) - return val +def test_apply_non_numpy_dtype(): + # GH 12244 + df = DataFrame({"dt": pd.date_range("2015-01-01", periods=3, tz="Europe/Brussels")}) + result = df.apply(lambda x: x) + tm.assert_frame_equal(result, df) - for func in [reducing_function, non_reducing_function]: - del values[:] + result = df.apply(lambda x: x + pd.Timedelta("1day")) + expected = DataFrame( + {"dt": pd.date_range("2015-01-02", periods=3, tz="Europe/Brussels")} + ) + tm.assert_frame_equal(result, expected) - df.applymap(func) - assert values == df.a.to_list() + df = DataFrame({"dt": ["a", "b", "c", "a"]}, 
dtype="category") + result = df.apply(lambda x: x) + tm.assert_frame_equal(result, df) - def test_apply_with_byte_string(self): - # GH 34529 - df = DataFrame(np.array([b"abcd", b"efgh"]), columns=["col"]) - expected = DataFrame( - np.array([b"abcd", b"efgh"]), columns=["col"], dtype=object - ) - # After we make the aply we exect a dataframe just - # like the original but with the object datatype - result = df.apply(lambda x: x.astype("object")) - tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("val", ["asd", 12, None, np.NaN]) - def test_apply_category_equalness(self, val): - # Check if categorical comparisons on apply, GH 21239 - df_values = ["asd", None, 12, "asd", "cde", np.NaN] - df = DataFrame({"a": df_values}, dtype="category") +def test_apply_dup_names_multi_agg(): + # GH 21063 + df = DataFrame([[0, 1], [2, 3]], columns=["a", "a"]) + expected = DataFrame([[0, 1]], columns=["a", "a"], index=["min"]) + result = df.agg(["min"]) - result = df.a.apply(lambda x: x == val) - expected = Series( - [np.NaN if pd.isnull(x) else x == val for x in df_values], name="a" - ) - tm.assert_series_equal(result, expected) - - -class TestInferOutputShape: - # the user has supplied an opaque UDF where - # they are transforming the input that requires - # us to infer the output - - def test_infer_row_shape(self): - # GH 17437 - # if row shape is changing, infer it - df = DataFrame(np.random.rand(10, 2)) - result = df.apply(np.fft.fft, axis=0) - assert result.shape == (10, 2) - - result = df.apply(np.fft.rfft, axis=0) - assert result.shape == (6, 2) - - def test_with_dictlike_columns(self): - # GH 17602 - df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) - result = df.apply(lambda x: {"s": x["a"] + x["b"]}, axis=1) - expected = Series([{"s": 3} for t in df.itertuples()]) - tm.assert_series_equal(result, expected) - - df["tm"] = [ - Timestamp("2017-05-01 00:00:00"), - Timestamp("2017-05-02 00:00:00"), - ] - result = df.apply(lambda x: {"s": x["a"] + x["b"]}, 
axis=1) - tm.assert_series_equal(result, expected) - - # compose a series - result = (df["a"] + df["b"]).apply(lambda x: {"s": x}) - expected = Series([{"s": 3}, {"s": 3}]) - tm.assert_series_equal(result, expected) - - # GH 18775 - df = DataFrame() - df["author"] = ["X", "Y", "Z"] - df["publisher"] = ["BBC", "NBC", "N24"] - df["date"] = pd.to_datetime( - ["17-10-2010 07:15:30", "13-05-2011 08:20:35", "15-01-2013 09:09:09"] - ) - result = df.apply(lambda x: {}, axis=1) - expected = Series([{}, {}, {}]) - tm.assert_series_equal(result, expected) - - def test_with_dictlike_columns_with_infer(self): - # GH 17602 - df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) - result = df.apply( - lambda x: {"s": x["a"] + x["b"]}, axis=1, result_type="expand" - ) - expected = DataFrame({"s": [3, 3]}) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) - df["tm"] = [ - Timestamp("2017-05-01 00:00:00"), - Timestamp("2017-05-02 00:00:00"), - ] - result = df.apply( - lambda x: {"s": x["a"] + x["b"]}, axis=1, result_type="expand" - ) - tm.assert_frame_equal(result, expected) - def test_with_listlike_columns(self): - # GH 17348 - df = DataFrame( - { - "a": Series(np.random.randn(4)), - "b": ["a", "list", "of", "words"], - "ts": date_range("2016-10-01", periods=4, freq="H"), - } - ) +def test_apply_nested_result_axis_1(): + # GH 13820 + def apply_list(row): + return [2 * row["A"], 2 * row["C"], 2 * row["B"]] - result = df[["a", "b"]].apply(tuple, axis=1) - expected = Series([t[1:] for t in df[["a", "b"]].itertuples()]) - tm.assert_series_equal(result, expected) + df = DataFrame(np.zeros((4, 4)), columns=list("ABCD")) + result = df.apply(apply_list, axis=1) + expected = Series( + [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]] + ) + tm.assert_series_equal(result, expected) - result = df[["a", "ts"]].apply(tuple, axis=1) - expected = Series([t[1:] for t in df[["a", "ts"]].itertuples()]) - tm.assert_series_equal(result, expected) - # 
GH 18919 - df = DataFrame( - {"x": Series([["a", "b"], ["q"]]), "y": Series([["z"], ["q", "t"]])} - ) - df.index = MultiIndex.from_tuples([("i0", "j0"), ("i1", "j1")]) - - result = df.apply(lambda row: [el for el in row["x"] if el in row["y"]], axis=1) - expected = Series([[], ["q"]], index=df.index) - tm.assert_series_equal(result, expected) - - def test_infer_output_shape_columns(self): - # GH 18573 - - df = DataFrame( - { - "number": [1.0, 2.0], - "string": ["foo", "bar"], - "datetime": [ - Timestamp("2017-11-29 03:30:00"), - Timestamp("2017-11-29 03:45:00"), - ], - } - ) - result = df.apply(lambda row: (row.number, row.string), axis=1) - expected = Series([(t.number, t.string) for t in df.itertuples()]) - tm.assert_series_equal(result, expected) - - def test_infer_output_shape_listlike_columns(self): - # GH 16353 - - df = DataFrame(np.random.randn(6, 3), columns=["A", "B", "C"]) - - result = df.apply(lambda x: [1, 2, 3], axis=1) - expected = Series([[1, 2, 3] for t in df.itertuples()]) - tm.assert_series_equal(result, expected) - - result = df.apply(lambda x: [1, 2], axis=1) - expected = Series([[1, 2] for t in df.itertuples()]) - tm.assert_series_equal(result, expected) - - # GH 17970 - df = DataFrame({"a": [1, 2, 3]}, index=list("abc")) - - result = df.apply(lambda row: np.ones(1), axis=1) - expected = Series([np.ones(1) for t in df.itertuples()], index=df.index) - tm.assert_series_equal(result, expected) - - result = df.apply(lambda row: np.ones(2), axis=1) - expected = Series([np.ones(2) for t in df.itertuples()], index=df.index) - tm.assert_series_equal(result, expected) - - # GH 17892 - df = DataFrame( - { - "a": [ - Timestamp("2010-02-01"), - Timestamp("2010-02-04"), - Timestamp("2010-02-05"), - Timestamp("2010-02-06"), - ], - "b": [9, 5, 4, 3], - "c": [5, 3, 4, 2], - "d": [1, 2, 3, 4], - } - ) +def test_apply_noreduction_tzaware_object(): + # https://github.com/pandas-dev/pandas/issues/31505 + df = DataFrame({"foo": [Timestamp("2020", tz="UTC")]}, 
dtype="datetime64[ns, UTC]") + result = df.apply(lambda x: x) + tm.assert_frame_equal(result, df) + result = df.apply(lambda x: x.copy()) + tm.assert_frame_equal(result, df) - def fun(x): - return (1, 2) - result = df.apply(fun, axis=1) - expected = Series([(1, 2) for t in df.itertuples()]) - tm.assert_series_equal(result, expected) +def test_apply_function_runs_once(): + # https://github.com/pandas-dev/pandas/issues/30815 - def test_consistent_coerce_for_shapes(self): - # we want column names to NOT be propagated - # just because the shape matches the input shape - df = DataFrame(np.random.randn(4, 3), columns=["A", "B", "C"]) + df = DataFrame({"a": [1, 2, 3]}) + names = [] # Save row names function is applied to - result = df.apply(lambda x: [1, 2, 3], axis=1) - expected = Series([[1, 2, 3] for t in df.itertuples()]) - tm.assert_series_equal(result, expected) + def reducing_function(row): + names.append(row.name) - result = df.apply(lambda x: [1, 2], axis=1) - expected = Series([[1, 2] for t in df.itertuples()]) - tm.assert_series_equal(result, expected) + def non_reducing_function(row): + names.append(row.name) + return row - def test_consistent_names(self, int_frame_const_col): - # if a Series is returned, we should use the resulting index names - df = int_frame_const_col + for func in [reducing_function, non_reducing_function]: + del names[:] - result = df.apply( - lambda x: Series([1, 2, 3], index=["test", "other", "cols"]), axis=1 - ) - expected = int_frame_const_col.rename( - columns={"A": "test", "B": "other", "C": "cols"} - ) - tm.assert_frame_equal(result, expected) + df.apply(func, axis=1) + assert names == list(df.index) - result = df.apply(lambda x: Series([1, 2], index=["test", "other"]), axis=1) - expected = expected[["test", "other"]] - tm.assert_frame_equal(result, expected) - def test_result_type(self, int_frame_const_col): - # result_type should be consistent no matter which - # path we take in the code - df = int_frame_const_col +def 
test_apply_raw_function_runs_once(): + # https://github.com/pandas-dev/pandas/issues/34506 - result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="expand") - expected = df.copy() - expected.columns = [0, 1, 2] - tm.assert_frame_equal(result, expected) + df = DataFrame({"a": [1, 2, 3]}) + values = [] # Save row values function is applied to - result = df.apply(lambda x: [1, 2], axis=1, result_type="expand") - expected = df[["A", "B"]].copy() - expected.columns = [0, 1] - tm.assert_frame_equal(result, expected) + def reducing_function(row): + values.extend(row) - # broadcast result - result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="broadcast") - expected = df.copy() - tm.assert_frame_equal(result, expected) + def non_reducing_function(row): + values.extend(row) + return row - columns = ["other", "col", "names"] - result = df.apply( - lambda x: Series([1, 2, 3], index=columns), axis=1, result_type="broadcast" - ) - expected = df.copy() - tm.assert_frame_equal(result, expected) + for func in [reducing_function, non_reducing_function]: + del values[:] - # series result - result = df.apply(lambda x: Series([1, 2, 3], index=x.index), axis=1) - expected = df.copy() - tm.assert_frame_equal(result, expected) + df.apply(func, raw=True, axis=1) + assert values == list(df.a.to_list()) - # series result with other index - columns = ["other", "col", "names"] - result = df.apply(lambda x: Series([1, 2, 3], index=columns), axis=1) - expected = df.copy() - expected.columns = columns - tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("result_type", ["foo", 1]) - def test_result_type_error(self, result_type, int_frame_const_col): - # allowed result_type - df = int_frame_const_col +def test_applymap_function_runs_once(): - msg = ( - "invalid value for result_type, must be one of " - "{None, 'reduce', 'broadcast', 'expand'}" - ) - with pytest.raises(ValueError, match=msg): - df.apply(lambda x: [1, 2, 3], axis=1, result_type=result_type) + df = 
DataFrame({"a": [1, 2, 3]}) + values = [] # Save values function is applied to + + def reducing_function(val): + values.append(val) - @pytest.mark.parametrize( - "box", - [lambda x: list(x), lambda x: tuple(x), lambda x: np.array(x, dtype="int64")], - ids=["list", "tuple", "array"], + def non_reducing_function(val): + values.append(val) + return val + + for func in [reducing_function, non_reducing_function]: + del values[:] + + df.applymap(func) + assert values == df.a.to_list() + + +def test_apply_with_byte_string(): + # GH 34529 + df = DataFrame(np.array([b"abcd", b"efgh"]), columns=["col"]) + expected = DataFrame(np.array([b"abcd", b"efgh"]), columns=["col"], dtype=object) + # After we make the aply we exect a dataframe just + # like the original but with the object datatype + result = df.apply(lambda x: x.astype("object")) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("val", ["asd", 12, None, np.NaN]) +def test_apply_category_equalness(val): + # Check if categorical comparisons on apply, GH 21239 + df_values = ["asd", None, 12, "asd", "cde", np.NaN] + df = DataFrame({"a": df_values}, dtype="category") + + result = df.a.apply(lambda x: x == val) + expected = Series( + [np.NaN if pd.isnull(x) else x == val for x in df_values], name="a" ) - def test_consistency_for_boxed(self, box, int_frame_const_col): - # passing an array or list should not affect the output shape - df = int_frame_const_col + tm.assert_series_equal(result, expected) - result = df.apply(lambda x: box([1, 2]), axis=1) - expected = Series([box([1, 2]) for t in df.itertuples()]) - tm.assert_series_equal(result, expected) - result = df.apply(lambda x: box([1, 2]), axis=1, result_type="expand") - expected = int_frame_const_col[["A", "B"]].rename(columns={"A": 0, "B": 1}) - tm.assert_frame_equal(result, expected) +# the user has supplied an opaque UDF where +# they are transforming the input that requires +# us to infer the output -class TestDataFrameAggregate: - def 
test_agg_transform(self, axis, float_frame): - other_axis = 1 if axis in {0, "index"} else 0 +def test_infer_row_shape(): + # GH 17437 + # if row shape is changing, infer it + df = DataFrame(np.random.rand(10, 2)) + result = df.apply(np.fft.fft, axis=0) + assert result.shape == (10, 2) - with np.errstate(all="ignore"): + result = df.apply(np.fft.rfft, axis=0) + assert result.shape == (6, 2) - f_abs = np.abs(float_frame) - f_sqrt = np.sqrt(float_frame) - - # ufunc - expected = f_sqrt.copy() - result = float_frame.apply(np.sqrt, axis=axis) - tm.assert_frame_equal(result, expected) - - # list-like - result = float_frame.apply([np.sqrt], axis=axis) - expected = f_sqrt.copy() - if axis in {0, "index"}: - expected.columns = pd.MultiIndex.from_product( - [float_frame.columns, ["sqrt"]] - ) - else: - expected.index = pd.MultiIndex.from_product( - [float_frame.index, ["sqrt"]] - ) - tm.assert_frame_equal(result, expected) - - # multiple items in list - # these are in the order as if we are applying both - # functions per series and then concatting - result = float_frame.apply([np.abs, np.sqrt], axis=axis) - expected = zip_frames([f_abs, f_sqrt], axis=other_axis) - if axis in {0, "index"}: - expected.columns = pd.MultiIndex.from_product( - [float_frame.columns, ["absolute", "sqrt"]] - ) - else: - expected.index = pd.MultiIndex.from_product( - [float_frame.index, ["absolute", "sqrt"]] - ) - tm.assert_frame_equal(result, expected) - - def test_transform_and_agg_err(self, axis, float_frame): - # cannot both transform and agg - msg = "cannot combine transform and aggregation operations" - with pytest.raises(ValueError, match=msg): - with np.errstate(all="ignore"): - float_frame.agg(["max", "sqrt"], axis=axis) - - df = DataFrame({"A": range(5), "B": 5}) - - def f(): - with np.errstate(all="ignore"): - df.agg({"A": ["abs", "sum"], "B": ["mean", "max"]}, axis=axis) - - def test_demo(self): - # demonstration tests - df = DataFrame({"A": range(5), "B": 5}) - - result = df.agg(["min", 
"max"]) - expected = DataFrame( - {"A": [0, 4], "B": [5, 5]}, columns=["A", "B"], index=["min", "max"] - ) - tm.assert_frame_equal(result, expected) - result = df.agg({"A": ["min", "max"], "B": ["sum", "max"]}) - expected = DataFrame( - {"A": [4.0, 0.0, np.nan], "B": [5.0, np.nan, 25.0]}, - columns=["A", "B"], - index=["max", "min", "sum"], - ) - tm.assert_frame_equal(result.reindex_like(expected), expected) - - def test_agg_with_name_as_column_name(self): - # GH 36212 - Column name is "name" - data = {"name": ["foo", "bar"]} - df = DataFrame(data) - - # result's name should be None - result = df.agg({"name": "count"}) - expected = Series({"name": 2}) - tm.assert_series_equal(result, expected) - - # Check if name is still preserved when aggregating series instead - result = df["name"].agg({"name": "count"}) - expected = Series({"name": 2}, name="name") - tm.assert_series_equal(result, expected) - - def test_agg_multiple_mixed_no_warning(self): - # GH 20909 - mdf = DataFrame( - { - "A": [1, 2, 3], - "B": [1.0, 2.0, 3.0], - "C": ["foo", "bar", "baz"], - "D": pd.date_range("20130101", periods=3), - } - ) - expected = DataFrame( - { - "A": [1, 6], - "B": [1.0, 6.0], - "C": ["bar", "foobarbaz"], - "D": [Timestamp("2013-01-01"), pd.NaT], - }, - index=["min", "sum"], - ) - # sorted index - with tm.assert_produces_warning(None): - result = mdf.agg(["min", "sum"]) +def test_with_dictlike_columns(): + # GH 17602 + df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) + result = df.apply(lambda x: {"s": x["a"] + x["b"]}, axis=1) + expected = Series([{"s": 3} for t in df.itertuples()]) + tm.assert_series_equal(result, expected) - tm.assert_frame_equal(result, expected) + df["tm"] = [ + Timestamp("2017-05-01 00:00:00"), + Timestamp("2017-05-02 00:00:00"), + ] + result = df.apply(lambda x: {"s": x["a"] + x["b"]}, axis=1) + tm.assert_series_equal(result, expected) - with tm.assert_produces_warning(None): - result = mdf[["D", "C", "B", "A"]].agg(["sum", "min"]) + # compose a series 
+ result = (df["a"] + df["b"]).apply(lambda x: {"s": x}) + expected = Series([{"s": 3}, {"s": 3}]) + tm.assert_series_equal(result, expected) - # For backwards compatibility, the result's index is - # still sorted by function name, so it's ['min', 'sum'] - # not ['sum', 'min']. - expected = expected[["D", "C", "B", "A"]] - tm.assert_frame_equal(result, expected) + # GH 18775 + df = DataFrame() + df["author"] = ["X", "Y", "Z"] + df["publisher"] = ["BBC", "NBC", "N24"] + df["date"] = pd.to_datetime( + ["17-10-2010 07:15:30", "13-05-2011 08:20:35", "15-01-2013 09:09:09"] + ) + result = df.apply(lambda x: {}, axis=1) + expected = Series([{}, {}, {}]) + tm.assert_series_equal(result, expected) - def test_agg_dict_nested_renaming_depr(self): - df = DataFrame({"A": range(5), "B": 5}) +def test_with_dictlike_columns_with_infer(): + # GH 17602 + df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) + result = df.apply(lambda x: {"s": x["a"] + x["b"]}, axis=1, result_type="expand") + expected = DataFrame({"s": [3, 3]}) + tm.assert_frame_equal(result, expected) - # nested renaming - msg = r"nested renamer is not supported" - with pytest.raises(SpecificationError, match=msg): - df.agg({"A": {"foo": "min"}, "B": {"bar": "max"}}) + df["tm"] = [ + Timestamp("2017-05-01 00:00:00"), + Timestamp("2017-05-02 00:00:00"), + ] + result = df.apply(lambda x: {"s": x["a"] + x["b"]}, axis=1, result_type="expand") + tm.assert_frame_equal(result, expected) - def test_agg_reduce(self, axis, float_frame): - other_axis = 1 if axis in {0, "index"} else 0 - name1, name2 = float_frame.axes[other_axis].unique()[:2].sort_values() - # all reducers - expected = pd.concat( - [ - float_frame.mean(axis=axis), - float_frame.max(axis=axis), - float_frame.sum(axis=axis), +def test_with_listlike_columns(): + # GH 17348 + df = DataFrame( + { + "a": Series(np.random.randn(4)), + "b": ["a", "list", "of", "words"], + "ts": date_range("2016-10-01", periods=4, freq="H"), + } + ) + + result = df[["a", 
"b"]].apply(tuple, axis=1) + expected = Series([t[1:] for t in df[["a", "b"]].itertuples()]) + tm.assert_series_equal(result, expected) + + result = df[["a", "ts"]].apply(tuple, axis=1) + expected = Series([t[1:] for t in df[["a", "ts"]].itertuples()]) + tm.assert_series_equal(result, expected) + + # GH 18919 + df = DataFrame({"x": Series([["a", "b"], ["q"]]), "y": Series([["z"], ["q", "t"]])}) + df.index = MultiIndex.from_tuples([("i0", "j0"), ("i1", "j1")]) + + result = df.apply(lambda row: [el for el in row["x"] if el in row["y"]], axis=1) + expected = Series([[], ["q"]], index=df.index) + tm.assert_series_equal(result, expected) + + +def test_infer_output_shape_columns(): + # GH 18573 + + df = DataFrame( + { + "number": [1.0, 2.0], + "string": ["foo", "bar"], + "datetime": [ + Timestamp("2017-11-29 03:30:00"), + Timestamp("2017-11-29 03:45:00"), ], - axis=1, - ) - expected.columns = ["mean", "max", "sum"] - expected = expected.T if axis in {0, "index"} else expected + } + ) + result = df.apply(lambda row: (row.number, row.string), axis=1) + expected = Series([(t.number, t.string) for t in df.itertuples()]) + tm.assert_series_equal(result, expected) - result = float_frame.agg(["mean", "max", "sum"], axis=axis) - tm.assert_frame_equal(result, expected) - # dict input with scalars - func = {name1: "mean", name2: "sum"} - result = float_frame.agg(func, axis=axis) - expected = Series( - [ - float_frame.loc(other_axis)[name1].mean(), - float_frame.loc(other_axis)[name2].sum(), +def test_infer_output_shape_listlike_columns(): + # GH 16353 + + df = DataFrame(np.random.randn(6, 3), columns=["A", "B", "C"]) + + result = df.apply(lambda x: [1, 2, 3], axis=1) + expected = Series([[1, 2, 3] for t in df.itertuples()]) + tm.assert_series_equal(result, expected) + + result = df.apply(lambda x: [1, 2], axis=1) + expected = Series([[1, 2] for t in df.itertuples()]) + tm.assert_series_equal(result, expected) + + # GH 17970 + df = DataFrame({"a": [1, 2, 3]}, index=list("abc")) + + 
result = df.apply(lambda row: np.ones(1), axis=1) + expected = Series([np.ones(1) for t in df.itertuples()], index=df.index) + tm.assert_series_equal(result, expected) + + result = df.apply(lambda row: np.ones(2), axis=1) + expected = Series([np.ones(2) for t in df.itertuples()], index=df.index) + tm.assert_series_equal(result, expected) + + # GH 17892 + df = DataFrame( + { + "a": [ + Timestamp("2010-02-01"), + Timestamp("2010-02-04"), + Timestamp("2010-02-05"), + Timestamp("2010-02-06"), ], - index=[name1, name2], - ) - tm.assert_series_equal(result, expected) - - # dict input with lists - func = {name1: ["mean"], name2: ["sum"]} - result = float_frame.agg(func, axis=axis) - expected = DataFrame( - { - name1: Series( - [float_frame.loc(other_axis)[name1].mean()], index=["mean"] - ), - name2: Series( - [float_frame.loc(other_axis)[name2].sum()], index=["sum"] - ), - } - ) - expected = expected.T if axis in {1, "columns"} else expected - tm.assert_frame_equal(result, expected) + "b": [9, 5, 4, 3], + "c": [5, 3, 4, 2], + "d": [1, 2, 3, 4], + } + ) - # dict input with lists with multiple - func = {name1: ["mean", "sum"], name2: ["sum", "max"]} - result = float_frame.agg(func, axis=axis) - expected = pd.concat( - { - name1: Series( - [ - float_frame.loc(other_axis)[name1].mean(), - float_frame.loc(other_axis)[name1].sum(), - ], - index=["mean", "sum"], - ), - name2: Series( - [ - float_frame.loc(other_axis)[name2].sum(), - float_frame.loc(other_axis)[name2].max(), - ], - index=["sum", "max"], - ), - }, - axis=1, - ) - expected = expected.T if axis in {1, "columns"} else expected - tm.assert_frame_equal(result, expected) + def fun(x): + return (1, 2) - def test_nuiscance_columns(self): + result = df.apply(fun, axis=1) + expected = Series([(1, 2) for t in df.itertuples()]) + tm.assert_series_equal(result, expected) - # GH 15015 - df = DataFrame( - { - "A": [1, 2, 3], - "B": [1.0, 2.0, 3.0], - "C": ["foo", "bar", "baz"], - "D": pd.date_range("20130101", periods=3), - } - 
) - result = df.agg("min") - expected = Series([1, 1.0, "bar", Timestamp("20130101")], index=df.columns) - tm.assert_series_equal(result, expected) +def test_consistent_coerce_for_shapes(): + # we want column names to NOT be propagated + # just because the shape matches the input shape + df = DataFrame(np.random.randn(4, 3), columns=["A", "B", "C"]) - result = df.agg(["min"]) - expected = DataFrame( - [[1, 1.0, "bar", Timestamp("20130101")]], - index=["min"], - columns=df.columns, - ) + result = df.apply(lambda x: [1, 2, 3], axis=1) + expected = Series([[1, 2, 3] for t in df.itertuples()]) + tm.assert_series_equal(result, expected) + + result = df.apply(lambda x: [1, 2], axis=1) + expected = Series([[1, 2] for t in df.itertuples()]) + tm.assert_series_equal(result, expected) + + +def test_consistent_names(int_frame_const_col): + # if a Series is returned, we should use the resulting index names + df = int_frame_const_col + + result = df.apply( + lambda x: Series([1, 2, 3], index=["test", "other", "cols"]), axis=1 + ) + expected = int_frame_const_col.rename( + columns={"A": "test", "B": "other", "C": "cols"} + ) + tm.assert_frame_equal(result, expected) + + result = df.apply(lambda x: Series([1, 2], index=["test", "other"]), axis=1) + expected = expected[["test", "other"]] + tm.assert_frame_equal(result, expected) + + +def test_result_type(int_frame_const_col): + # result_type should be consistent no matter which + # path we take in the code + df = int_frame_const_col + + result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="expand") + expected = df.copy() + expected.columns = [0, 1, 2] + tm.assert_frame_equal(result, expected) + + result = df.apply(lambda x: [1, 2], axis=1, result_type="expand") + expected = df[["A", "B"]].copy() + expected.columns = [0, 1] + tm.assert_frame_equal(result, expected) + + # broadcast result + result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="broadcast") + expected = df.copy() + tm.assert_frame_equal(result, expected) + 
+ columns = ["other", "col", "names"] + result = df.apply( + lambda x: Series([1, 2, 3], index=columns), axis=1, result_type="broadcast" + ) + expected = df.copy() + tm.assert_frame_equal(result, expected) + + # series result + result = df.apply(lambda x: Series([1, 2, 3], index=x.index), axis=1) + expected = df.copy() + tm.assert_frame_equal(result, expected) + + # series result with other index + columns = ["other", "col", "names"] + result = df.apply(lambda x: Series([1, 2, 3], index=columns), axis=1) + expected = df.copy() + expected.columns = columns + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("result_type", ["foo", 1]) +def test_result_type_error(result_type, int_frame_const_col): + # allowed result_type + df = int_frame_const_col + + msg = ( + "invalid value for result_type, must be one of " + "{None, 'reduce', 'broadcast', 'expand'}" + ) + with pytest.raises(ValueError, match=msg): + df.apply(lambda x: [1, 2, 3], axis=1, result_type=result_type) + + +@pytest.mark.parametrize( + "box", + [lambda x: list(x), lambda x: tuple(x), lambda x: np.array(x, dtype="int64")], + ids=["list", "tuple", "array"], +) +def test_consistency_for_boxed(box, int_frame_const_col): + # passing an array or list should not affect the output shape + df = int_frame_const_col + + result = df.apply(lambda x: box([1, 2]), axis=1) + expected = Series([box([1, 2]) for t in df.itertuples()]) + tm.assert_series_equal(result, expected) + + result = df.apply(lambda x: box([1, 2]), axis=1, result_type="expand") + expected = int_frame_const_col[["A", "B"]].rename(columns={"A": 0, "B": 1}) + tm.assert_frame_equal(result, expected) + + +def test_agg_transform(axis, float_frame): + other_axis = 1 if axis in {0, "index"} else 0 + + with np.errstate(all="ignore"): + + f_abs = np.abs(float_frame) + f_sqrt = np.sqrt(float_frame) + + # ufunc + expected = f_sqrt.copy() + result = float_frame.apply(np.sqrt, axis=axis) tm.assert_frame_equal(result, expected) - result = 
df.agg("sum") - expected = Series([6, 6.0, "foobarbaz"], index=["A", "B", "C"]) - tm.assert_series_equal(result, expected) + # list-like + result = float_frame.apply([np.sqrt], axis=axis) + expected = f_sqrt.copy() + if axis in {0, "index"}: + expected.columns = pd.MultiIndex.from_product( + [float_frame.columns, ["sqrt"]] + ) + else: + expected.index = pd.MultiIndex.from_product([float_frame.index, ["sqrt"]]) + tm.assert_frame_equal(result, expected) - result = df.agg(["sum"]) - expected = DataFrame( - [[6, 6.0, "foobarbaz"]], index=["sum"], columns=["A", "B", "C"] - ) + # multiple items in list + # these are in the order as if we are applying both + # functions per series and then concatting + result = float_frame.apply([np.abs, np.sqrt], axis=axis) + expected = zip_frames([f_abs, f_sqrt], axis=other_axis) + if axis in {0, "index"}: + expected.columns = pd.MultiIndex.from_product( + [float_frame.columns, ["absolute", "sqrt"]] + ) + else: + expected.index = pd.MultiIndex.from_product( + [float_frame.index, ["absolute", "sqrt"]] + ) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("how", ["agg", "apply"]) - def test_non_callable_aggregates(self, how): - # GH 16405 - # 'size' is a property of frame/series - # validate that this is working - # GH 39116 - expand to apply - df = DataFrame( - {"A": [None, 2, 3], "B": [1.0, np.nan, 3.0], "C": ["foo", None, "bar"]} - ) +def test_transform_and_agg_err(axis, float_frame): + # cannot both transform and agg + msg = "cannot combine transform and aggregation operations" + with pytest.raises(ValueError, match=msg): + with np.errstate(all="ignore"): + float_frame.agg(["max", "sqrt"], axis=axis) - # Function aggregate - result = getattr(df, how)({"A": "count"}) - expected = Series({"A": 2}) + df = DataFrame({"A": range(5), "B": 5}) - tm.assert_series_equal(result, expected) + def f(): + with np.errstate(all="ignore"): + df.agg({"A": ["abs", "sum"], "B": ["mean", "max"]}, axis=axis) - # Non-function aggregate - 
result = getattr(df, how)({"A": "size"}) - expected = Series({"A": 3}) - tm.assert_series_equal(result, expected) +def test_demo(): + # demonstration tests + df = DataFrame({"A": range(5), "B": 5}) - # Mix function and non-function aggs - result1 = getattr(df, how)(["count", "size"]) - result2 = getattr(df, how)( - {"A": ["count", "size"], "B": ["count", "size"], "C": ["count", "size"]} - ) - expected = DataFrame( - { - "A": {"count": 2, "size": 3}, - "B": {"count": 2, "size": 3}, - "C": {"count": 2, "size": 3}, - } - ) + result = df.agg(["min", "max"]) + expected = DataFrame( + {"A": [0, 4], "B": [5, 5]}, columns=["A", "B"], index=["min", "max"] + ) + tm.assert_frame_equal(result, expected) + + result = df.agg({"A": ["min", "max"], "B": ["sum", "max"]}) + expected = DataFrame( + {"A": [4.0, 0.0, np.nan], "B": [5.0, np.nan, 25.0]}, + columns=["A", "B"], + index=["max", "min", "sum"], + ) + tm.assert_frame_equal(result.reindex_like(expected), expected) - tm.assert_frame_equal(result1, result2, check_like=True) - tm.assert_frame_equal(result2, expected, check_like=True) - # Just functional string arg is same as calling df.arg() - result = getattr(df, how)("count") - expected = df.count() +def test_agg_with_name_as_column_name(): + # GH 36212 - Column name is "name" + data = {"name": ["foo", "bar"]} + df = DataFrame(data) - tm.assert_series_equal(result, expected) + # result's name should be None + result = df.agg({"name": "count"}) + expected = Series({"name": 2}) + tm.assert_series_equal(result, expected) - # Just a string attribute arg same as calling df.arg - result = getattr(df, how)("size") - expected = df.size + # Check if name is still preserved when aggregating series instead + result = df["name"].agg({"name": "count"}) + expected = Series({"name": 2}, name="name") + tm.assert_series_equal(result, expected) - assert result == expected - def test_agg_listlike_result(self): - # GH-29587 user defined function returning list-likes - df = DataFrame( - {"A": [2, 2, 
3], "B": [1.5, np.nan, 1.5], "C": ["foo", None, "bar"]} - ) +def test_agg_multiple_mixed_no_warning(): + # GH 20909 + mdf = DataFrame( + { + "A": [1, 2, 3], + "B": [1.0, 2.0, 3.0], + "C": ["foo", "bar", "baz"], + "D": pd.date_range("20130101", periods=3), + } + ) + expected = DataFrame( + { + "A": [1, 6], + "B": [1.0, 6.0], + "C": ["bar", "foobarbaz"], + "D": [Timestamp("2013-01-01"), pd.NaT], + }, + index=["min", "sum"], + ) + # sorted index + with tm.assert_produces_warning(None): + result = mdf.agg(["min", "sum"]) - def func(group_col): - return list(group_col.dropna().unique()) + tm.assert_frame_equal(result, expected) - result = df.agg(func) - expected = Series([[2, 3], [1.5], ["foo", "bar"]], index=["A", "B", "C"]) - tm.assert_series_equal(result, expected) + with tm.assert_produces_warning(None): + result = mdf[["D", "C", "B", "A"]].agg(["sum", "min"]) - result = df.agg([func]) - expected = expected.to_frame("func").T - tm.assert_frame_equal(result, expected) + # For backwards compatibility, the result's index is + # still sorted by function name, so it's ['min', 'sum'] + # not ['sum', 'min']. 
+ expected = expected[["D", "C", "B", "A"]] + tm.assert_frame_equal(result, expected) + + +def test_agg_dict_nested_renaming_depr(): + + df = DataFrame({"A": range(5), "B": 5}) + + # nested renaming + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + df.agg({"A": {"foo": "min"}, "B": {"bar": "max"}}) + + +def test_agg_reduce(axis, float_frame): + other_axis = 1 if axis in {0, "index"} else 0 + name1, name2 = float_frame.axes[other_axis].unique()[:2].sort_values() - @pytest.mark.parametrize( - "df, func, expected", - chain( - tm.get_cython_table_params( - DataFrame(), + # all reducers + expected = pd.concat( + [ + float_frame.mean(axis=axis), + float_frame.max(axis=axis), + float_frame.sum(axis=axis), + ], + axis=1, + ) + expected.columns = ["mean", "max", "sum"] + expected = expected.T if axis in {0, "index"} else expected + + result = float_frame.agg(["mean", "max", "sum"], axis=axis) + tm.assert_frame_equal(result, expected) + + # dict input with scalars + func = {name1: "mean", name2: "sum"} + result = float_frame.agg(func, axis=axis) + expected = Series( + [ + float_frame.loc(other_axis)[name1].mean(), + float_frame.loc(other_axis)[name2].sum(), + ], + index=[name1, name2], + ) + tm.assert_series_equal(result, expected) + + # dict input with lists + func = {name1: ["mean"], name2: ["sum"]} + result = float_frame.agg(func, axis=axis) + expected = DataFrame( + { + name1: Series([float_frame.loc(other_axis)[name1].mean()], index=["mean"]), + name2: Series([float_frame.loc(other_axis)[name2].sum()], index=["sum"]), + } + ) + expected = expected.T if axis in {1, "columns"} else expected + tm.assert_frame_equal(result, expected) + + # dict input with lists with multiple + func = {name1: ["mean", "sum"], name2: ["sum", "max"]} + result = float_frame.agg(func, axis=axis) + expected = pd.concat( + { + name1: Series( [ - ("sum", Series(dtype="float64")), - ("max", Series(dtype="float64")), - ("min", Series(dtype="float64")), 
- ("all", Series(dtype=bool)), - ("any", Series(dtype=bool)), - ("mean", Series(dtype="float64")), - ("prod", Series(dtype="float64")), - ("std", Series(dtype="float64")), - ("var", Series(dtype="float64")), - ("median", Series(dtype="float64")), + float_frame.loc(other_axis)[name1].mean(), + float_frame.loc(other_axis)[name1].sum(), ], + index=["mean", "sum"], ), - tm.get_cython_table_params( - DataFrame([[np.nan, 1], [1, 2]]), + name2: Series( [ - ("sum", Series([1.0, 3])), - ("max", Series([1.0, 2])), - ("min", Series([1.0, 1])), - ("all", Series([True, True])), - ("any", Series([True, True])), - ("mean", Series([1, 1.5])), - ("prod", Series([1.0, 2])), - ("std", Series([np.nan, 0.707107])), - ("var", Series([np.nan, 0.5])), - ("median", Series([1, 1.5])), + float_frame.loc(other_axis)[name2].sum(), + float_frame.loc(other_axis)[name2].max(), ], + index=["sum", "max"], ), - ), + }, + axis=1, ) - def test_agg_cython_table(self, df, func, expected, axis): - # GH 21224 - # test reducing functions in - # pandas.core.base.SelectionMixin._cython_table - result = df.agg(func, axis=axis) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize( - "df, func, expected", - chain( - tm.get_cython_table_params( - DataFrame(), [("cumprod", DataFrame()), ("cumsum", DataFrame())] - ), - tm.get_cython_table_params( - DataFrame([[np.nan, 1], [1, 2]]), - [ - ("cumprod", DataFrame([[np.nan, 1], [1, 2]])), - ("cumsum", DataFrame([[np.nan, 1], [1, 3]])), - ], - ), - ), + expected = expected.T if axis in {1, "columns"} else expected + tm.assert_frame_equal(result, expected) + + +def test_nuiscance_columns(): + + # GH 15015 + df = DataFrame( + { + "A": [1, 2, 3], + "B": [1.0, 2.0, 3.0], + "C": ["foo", "bar", "baz"], + "D": pd.date_range("20130101", periods=3), + } ) - def test_agg_cython_table_transform(self, df, func, expected, axis): - # GH 21224 - # test transforming functions in - # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum) - if axis == 
"columns" or axis == 1: - # operating blockwise doesn't let us preserve dtypes - expected = expected.astype("float64") - - result = df.agg(func, axis=axis) - tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( - "df, func, expected", - tm.get_cython_table_params( - DataFrame([["a", "b"], ["b", "a"]]), [["cumprod", TypeError]] - ), + result = df.agg("min") + expected = Series([1, 1.0, "bar", Timestamp("20130101")], index=df.columns) + tm.assert_series_equal(result, expected) + + result = df.agg(["min"]) + expected = DataFrame( + [[1, 1.0, "bar", Timestamp("20130101")]], + index=["min"], + columns=df.columns, ) - def test_agg_cython_table_raises(self, df, func, expected, axis): - # GH 21224 - msg = "can't multiply sequence by non-int of type 'str'" - with pytest.raises(expected, match=msg): - df.agg(func, axis=axis) - - @pytest.mark.parametrize("axis", [0, 1]) - @pytest.mark.parametrize( - "args, kwargs", - [ - ((1, 2, 3), {}), - ((8, 7, 15), {}), - ((1, 2), {}), - ((1,), {"b": 2}), - ((), {"a": 1, "b": 2}), - ((), {"a": 2, "b": 1}), - ((), {"a": 1, "b": 2, "c": 3}), - ], + tm.assert_frame_equal(result, expected) + + result = df.agg("sum") + expected = Series([6, 6.0, "foobarbaz"], index=["A", "B", "C"]) + tm.assert_series_equal(result, expected) + + result = df.agg(["sum"]) + expected = DataFrame( + [[6, 6.0, "foobarbaz"]], index=["sum"], columns=["A", "B", "C"] ) - def test_agg_args_kwargs(self, axis, args, kwargs): - def f(x, a, b, c=3): - return x.sum() + (a + b) / c + tm.assert_frame_equal(result, expected) - df = DataFrame([[1, 2], [3, 4]]) - if axis == 0: - expected = Series([5.0, 7.0]) - else: - expected = Series([4.0, 8.0]) - - result = df.agg(f, axis, *args, **kwargs) - - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize("num_cols", [2, 3, 5]) - def test_frequency_is_original(self, num_cols): - # GH 22150 - index = pd.DatetimeIndex(["1950-06-30", "1952-10-24", "1953-05-29"]) - original = index.copy() - df = DataFrame(1, 
index=index, columns=range(num_cols)) - df.apply(lambda x: x) - assert index.freq == original.freq - - def test_apply_datetime_tz_issue(self): - # GH 29052 - - timestamps = [ - Timestamp("2019-03-15 12:34:31.909000+0000", tz="UTC"), - Timestamp("2019-03-15 12:34:34.359000+0000", tz="UTC"), - Timestamp("2019-03-15 12:34:34.660000+0000", tz="UTC"), - ] - df = DataFrame(data=[0, 1, 2], index=timestamps) - result = df.apply(lambda x: x.name, axis=1) - expected = Series(index=timestamps, data=timestamps) - - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize("df", [DataFrame({"A": ["a", None], "B": ["c", "d"]})]) - @pytest.mark.parametrize("method", ["min", "max", "sum"]) - def test_consistency_of_aggregates_of_columns_with_missing_values(self, df, method): - # GH 16832 - none_in_first_column_result = getattr(df[["A", "B"]], method)() - none_in_second_column_result = getattr(df[["B", "A"]], method)() - - tm.assert_series_equal( - none_in_first_column_result, none_in_second_column_result - ) +@pytest.mark.parametrize("how", ["agg", "apply"]) +def test_non_callable_aggregates(how): + + # GH 16405 + # 'size' is a property of frame/series + # validate that this is working + # GH 39116 - expand to apply + df = DataFrame( + {"A": [None, 2, 3], "B": [1.0, np.nan, 3.0], "C": ["foo", None, "bar"]} + ) + + # Function aggregate + result = getattr(df, how)({"A": "count"}) + expected = Series({"A": 2}) + + tm.assert_series_equal(result, expected) + + # Non-function aggregate + result = getattr(df, how)({"A": "size"}) + expected = Series({"A": 3}) + + tm.assert_series_equal(result, expected) + + # Mix function and non-function aggs + result1 = getattr(df, how)(["count", "size"]) + result2 = getattr(df, how)( + {"A": ["count", "size"], "B": ["count", "size"], "C": ["count", "size"]} + ) + expected = DataFrame( + { + "A": {"count": 2, "size": 3}, + "B": {"count": 2, "size": 3}, + "C": {"count": 2, "size": 3}, + } + ) + + tm.assert_frame_equal(result1, result2, 
check_like=True) + tm.assert_frame_equal(result2, expected, check_like=True) + + # Just functional string arg is same as calling df.arg() + result = getattr(df, how)("count") + expected = df.count() + + tm.assert_series_equal(result, expected) + + # Just a string attribute arg same as calling df.arg + result = getattr(df, how)("size") + expected = df.size + + assert result == expected + + +def test_agg_listlike_result(): + # GH-29587 user defined function returning list-likes + df = DataFrame({"A": [2, 2, 3], "B": [1.5, np.nan, 1.5], "C": ["foo", None, "bar"]}) + + def func(group_col): + return list(group_col.dropna().unique()) + + result = df.agg(func) + expected = Series([[2, 3], [1.5], ["foo", "bar"]], index=["A", "B", "C"]) + tm.assert_series_equal(result, expected) + + result = df.agg([func]) + expected = expected.to_frame("func").T + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "df, func, expected", + chain( + tm.get_cython_table_params( + DataFrame(), + [ + ("sum", Series(dtype="float64")), + ("max", Series(dtype="float64")), + ("min", Series(dtype="float64")), + ("all", Series(dtype=bool)), + ("any", Series(dtype=bool)), + ("mean", Series(dtype="float64")), + ("prod", Series(dtype="float64")), + ("std", Series(dtype="float64")), + ("var", Series(dtype="float64")), + ("median", Series(dtype="float64")), + ], + ), + tm.get_cython_table_params( + DataFrame([[np.nan, 1], [1, 2]]), + [ + ("sum", Series([1.0, 3])), + ("max", Series([1.0, 2])), + ("min", Series([1.0, 1])), + ("all", Series([True, True])), + ("any", Series([True, True])), + ("mean", Series([1, 1.5])), + ("prod", Series([1.0, 2])), + ("std", Series([np.nan, 0.707107])), + ("var", Series([np.nan, 0.5])), + ("median", Series([1, 1.5])), + ], + ), + ), +) +def test_agg_cython_table(df, func, expected, axis): + # GH 21224 + # test reducing functions in + # pandas.core.base.SelectionMixin._cython_table + result = df.agg(func, axis=axis) + tm.assert_series_equal(result, 
expected) + + +@pytest.mark.parametrize( + "df, func, expected", + chain( + tm.get_cython_table_params( + DataFrame(), [("cumprod", DataFrame()), ("cumsum", DataFrame())] + ), + tm.get_cython_table_params( + DataFrame([[np.nan, 1], [1, 2]]), + [ + ("cumprod", DataFrame([[np.nan, 1], [1, 2]])), + ("cumsum", DataFrame([[np.nan, 1], [1, 3]])), + ], + ), + ), +) +def test_agg_cython_table_transform(df, func, expected, axis): + # GH 21224 + # test transforming functions in + # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum) + if axis == "columns" or axis == 1: + # operating blockwise doesn't let us preserve dtypes + expected = expected.astype("float64") + + result = df.agg(func, axis=axis) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "df, func, expected", + tm.get_cython_table_params( + DataFrame([["a", "b"], ["b", "a"]]), [["cumprod", TypeError]] + ), +) +def test_agg_cython_table_raises(df, func, expected, axis): + # GH 21224 + msg = "can't multiply sequence by non-int of type 'str'" + with pytest.raises(expected, match=msg): + df.agg(func, axis=axis) + + +@pytest.mark.parametrize("axis", [0, 1]) +@pytest.mark.parametrize( + "args, kwargs", + [ + ((1, 2, 3), {}), + ((8, 7, 15), {}), + ((1, 2), {}), + ((1,), {"b": 2}), + ((), {"a": 1, "b": 2}), + ((), {"a": 2, "b": 1}), + ((), {"a": 1, "b": 2, "c": 3}), + ], +) +def test_agg_args_kwargs(axis, args, kwargs): + def f(x, a, b, c=3): + return x.sum() + (a + b) / c + + df = DataFrame([[1, 2], [3, 4]]) + + if axis == 0: + expected = Series([5.0, 7.0]) + else: + expected = Series([4.0, 8.0]) - @pytest.mark.parametrize("col", [1, 1.0, True, "a", np.nan]) - def test_apply_dtype(self, col): - # GH 31466 - df = DataFrame([[1.0, col]], columns=["a", "b"]) - result = df.apply(lambda x: x.dtype) - expected = df.dtypes + result = df.agg(f, axis, *args, **kwargs) - tm.assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("num_cols", 
[2, 3, 5]) +def test_frequency_is_original(num_cols): + # GH 22150 + index = pd.DatetimeIndex(["1950-06-30", "1952-10-24", "1953-05-29"]) + original = index.copy() + df = DataFrame(1, index=index, columns=range(num_cols)) + df.apply(lambda x: x) + assert index.freq == original.freq + + +def test_apply_datetime_tz_issue(): + # GH 29052 + + timestamps = [ + Timestamp("2019-03-15 12:34:31.909000+0000", tz="UTC"), + Timestamp("2019-03-15 12:34:34.359000+0000", tz="UTC"), + Timestamp("2019-03-15 12:34:34.660000+0000", tz="UTC"), + ] + df = DataFrame(data=[0, 1, 2], index=timestamps) + result = df.apply(lambda x: x.name, axis=1) + expected = Series(index=timestamps, data=timestamps) + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("df", [DataFrame({"A": ["a", None], "B": ["c", "d"]})]) +@pytest.mark.parametrize("method", ["min", "max", "sum"]) +def test_consistency_of_aggregates_of_columns_with_missing_values(df, method): + # GH 16832 + none_in_first_column_result = getattr(df[["A", "B"]], method)() + none_in_second_column_result = getattr(df[["B", "A"]], method)() + + tm.assert_series_equal(none_in_first_column_result, none_in_second_column_result) + + +@pytest.mark.parametrize("col", [1, 1.0, True, "a", np.nan]) +def test_apply_dtype(col): + # GH 31466 + df = DataFrame([[1.0, col]], columns=["a", "b"]) + result = df.apply(lambda x: x.dtype) + expected = df.dtypes + + tm.assert_series_equal(result, expected) def test_apply_mutating(): diff --git a/pandas/tests/apply/test_frame_apply_relabeling.py b/pandas/tests/apply/test_frame_apply_relabeling.py index 965f69753bdc7..732aff24428ac 100644 --- a/pandas/tests/apply/test_frame_apply_relabeling.py +++ b/pandas/tests/apply/test_frame_apply_relabeling.py @@ -5,100 +5,103 @@ import pandas._testing as tm -class TestDataFrameNamedAggregate: - def test_agg_relabel(self): - # GH 26513 - df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}) - - # simplest case with one column, one 
func - result = df.agg(foo=("B", "sum")) - expected = pd.DataFrame({"B": [10]}, index=pd.Index(["foo"])) - tm.assert_frame_equal(result, expected) - - # test on same column with different methods - result = df.agg(foo=("B", "sum"), bar=("B", "min")) - expected = pd.DataFrame({"B": [10, 1]}, index=pd.Index(["foo", "bar"])) - - tm.assert_frame_equal(result, expected) - - def test_agg_relabel_multi_columns_multi_methods(self): - # GH 26513, test on multiple columns with multiple methods - df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}) - result = df.agg( - foo=("A", "sum"), - bar=("B", "mean"), - cat=("A", "min"), - dat=("B", "max"), - f=("A", "max"), - g=("C", "min"), - ) - expected = pd.DataFrame( - { - "A": [6.0, np.nan, 1.0, np.nan, 2.0, np.nan], - "B": [np.nan, 2.5, np.nan, 4.0, np.nan, np.nan], - "C": [np.nan, np.nan, np.nan, np.nan, np.nan, 3.0], - }, - index=pd.Index(["foo", "bar", "cat", "dat", "f", "g"]), - ) - tm.assert_frame_equal(result, expected) - - def test_agg_relabel_partial_functions(self): - # GH 26513, test on partial, functools or more complex cases - df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}) - result = df.agg(foo=("A", np.mean), bar=("A", "mean"), cat=("A", min)) - expected = pd.DataFrame( - {"A": [1.5, 1.5, 1.0]}, index=pd.Index(["foo", "bar", "cat"]) - ) - tm.assert_frame_equal(result, expected) - - result = df.agg( - foo=("A", min), - bar=("A", np.min), - cat=("B", max), - dat=("C", "min"), - f=("B", np.sum), - kk=("B", lambda x: min(x)), - ) - expected = pd.DataFrame( - { - "A": [1.0, 1.0, np.nan, np.nan, np.nan, np.nan], - "B": [np.nan, np.nan, 4.0, np.nan, 10.0, 1.0], - "C": [np.nan, np.nan, np.nan, 3.0, np.nan, np.nan], - }, - index=pd.Index(["foo", "bar", "cat", "dat", "f", "kk"]), - ) - tm.assert_frame_equal(result, expected) - - def test_agg_namedtuple(self): - # GH 26513 - df = pd.DataFrame({"A": [0, 1], "B": [1, 2]}) - result = df.agg( - foo=pd.NamedAgg("B", "sum"), - 
bar=pd.NamedAgg("B", min), - cat=pd.NamedAgg(column="B", aggfunc="count"), - fft=pd.NamedAgg("B", aggfunc="max"), - ) - - expected = pd.DataFrame( - {"B": [3, 1, 2, 2]}, index=pd.Index(["foo", "bar", "cat", "fft"]) - ) - tm.assert_frame_equal(result, expected) - - result = df.agg( - foo=pd.NamedAgg("A", "min"), - bar=pd.NamedAgg(column="B", aggfunc="max"), - cat=pd.NamedAgg(column="A", aggfunc="max"), - ) - expected = pd.DataFrame( - {"A": [0.0, np.nan, 1.0], "B": [np.nan, 2.0, np.nan]}, - index=pd.Index(["foo", "bar", "cat"]), - ) - tm.assert_frame_equal(result, expected) - - def test_agg_raises(self): - # GH 26513 - df = pd.DataFrame({"A": [0, 1], "B": [1, 2]}) - msg = "Must provide" - - with pytest.raises(TypeError, match=msg): - df.agg() +def test_agg_relabel(): + # GH 26513 + df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}) + + # simplest case with one column, one func + result = df.agg(foo=("B", "sum")) + expected = pd.DataFrame({"B": [10]}, index=pd.Index(["foo"])) + tm.assert_frame_equal(result, expected) + + # test on same column with different methods + result = df.agg(foo=("B", "sum"), bar=("B", "min")) + expected = pd.DataFrame({"B": [10, 1]}, index=pd.Index(["foo", "bar"])) + + tm.assert_frame_equal(result, expected) + + +def test_agg_relabel_multi_columns_multi_methods(): + # GH 26513, test on multiple columns with multiple methods + df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}) + result = df.agg( + foo=("A", "sum"), + bar=("B", "mean"), + cat=("A", "min"), + dat=("B", "max"), + f=("A", "max"), + g=("C", "min"), + ) + expected = pd.DataFrame( + { + "A": [6.0, np.nan, 1.0, np.nan, 2.0, np.nan], + "B": [np.nan, 2.5, np.nan, 4.0, np.nan, np.nan], + "C": [np.nan, np.nan, np.nan, np.nan, np.nan, 3.0], + }, + index=pd.Index(["foo", "bar", "cat", "dat", "f", "g"]), + ) + tm.assert_frame_equal(result, expected) + + +def test_agg_relabel_partial_functions(): + # GH 26513, test on partial, functools or 
more complex cases + df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}) + result = df.agg(foo=("A", np.mean), bar=("A", "mean"), cat=("A", min)) + expected = pd.DataFrame( + {"A": [1.5, 1.5, 1.0]}, index=pd.Index(["foo", "bar", "cat"]) + ) + tm.assert_frame_equal(result, expected) + + result = df.agg( + foo=("A", min), + bar=("A", np.min), + cat=("B", max), + dat=("C", "min"), + f=("B", np.sum), + kk=("B", lambda x: min(x)), + ) + expected = pd.DataFrame( + { + "A": [1.0, 1.0, np.nan, np.nan, np.nan, np.nan], + "B": [np.nan, np.nan, 4.0, np.nan, 10.0, 1.0], + "C": [np.nan, np.nan, np.nan, 3.0, np.nan, np.nan], + }, + index=pd.Index(["foo", "bar", "cat", "dat", "f", "kk"]), + ) + tm.assert_frame_equal(result, expected) + + +def test_agg_namedtuple(): + # GH 26513 + df = pd.DataFrame({"A": [0, 1], "B": [1, 2]}) + result = df.agg( + foo=pd.NamedAgg("B", "sum"), + bar=pd.NamedAgg("B", min), + cat=pd.NamedAgg(column="B", aggfunc="count"), + fft=pd.NamedAgg("B", aggfunc="max"), + ) + + expected = pd.DataFrame( + {"B": [3, 1, 2, 2]}, index=pd.Index(["foo", "bar", "cat", "fft"]) + ) + tm.assert_frame_equal(result, expected) + + result = df.agg( + foo=pd.NamedAgg("A", "min"), + bar=pd.NamedAgg(column="B", aggfunc="max"), + cat=pd.NamedAgg(column="A", aggfunc="max"), + ) + expected = pd.DataFrame( + {"A": [0.0, np.nan, 1.0], "B": [np.nan, 2.0, np.nan]}, + index=pd.Index(["foo", "bar", "cat"]), + ) + tm.assert_frame_equal(result, expected) + + +def test_agg_raises(): + # GH 26513 + df = pd.DataFrame({"A": [0, 1], "B": [1, 2]}) + msg = "Must provide" + + with pytest.raises(TypeError, match=msg): + df.agg() diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py index bf8311f992ea5..c450aa3a20f15 100644 --- a/pandas/tests/apply/test_series_apply.py +++ b/pandas/tests/apply/test_series_apply.py @@ -23,821 +23,857 @@ from pandas.core.base import SpecificationError -class TestSeriesApply: - def 
test_series_map_box_timedelta(self): - # GH#11349 - ser = Series(timedelta_range("1 day 1 s", periods=5, freq="h")) +def test_series_map_box_timedelta(): + # GH#11349 + ser = Series(timedelta_range("1 day 1 s", periods=5, freq="h")) - def f(x): - return x.total_seconds() + def f(x): + return x.total_seconds() - ser.map(f) - ser.apply(f) - DataFrame(ser).applymap(f) + ser.map(f) + ser.apply(f) + DataFrame(ser).applymap(f) - def test_apply(self, datetime_series): - with np.errstate(all="ignore"): - tm.assert_series_equal( - datetime_series.apply(np.sqrt), np.sqrt(datetime_series) - ) - # element-wise apply - import math +def test_apply(datetime_series): + with np.errstate(all="ignore"): + tm.assert_series_equal(datetime_series.apply(np.sqrt), np.sqrt(datetime_series)) - tm.assert_series_equal( - datetime_series.apply(math.exp), np.exp(datetime_series) - ) + # element-wise apply + import math - # empty series - s = Series(dtype=object, name="foo", index=Index([], name="bar")) - rs = s.apply(lambda x: x) - tm.assert_series_equal(s, rs) + tm.assert_series_equal(datetime_series.apply(math.exp), np.exp(datetime_series)) - # check all metadata (GH 9322) - assert s is not rs - assert s.index is rs.index - assert s.dtype == rs.dtype - assert s.name == rs.name + # empty series + s = Series(dtype=object, name="foo", index=Index([], name="bar")) + rs = s.apply(lambda x: x) + tm.assert_series_equal(s, rs) - # index but no data - s = Series(index=[1, 2, 3], dtype=np.float64) - rs = s.apply(lambda x: x) - tm.assert_series_equal(s, rs) + # check all metadata (GH 9322) + assert s is not rs + assert s.index is rs.index + assert s.dtype == rs.dtype + assert s.name == rs.name - def test_apply_same_length_inference_bug(self): - s = Series([1, 2]) + # index but no data + s = Series(index=[1, 2, 3], dtype=np.float64) + rs = s.apply(lambda x: x) + tm.assert_series_equal(s, rs) - def f(x): - return (x, x + 1) - result = s.apply(f) - expected = s.map(f) - tm.assert_series_equal(result, 
expected) +def test_apply_same_length_inference_bug(): + s = Series([1, 2]) - s = Series([1, 2, 3]) - result = s.apply(f) - expected = s.map(f) - tm.assert_series_equal(result, expected) + def f(x): + return (x, x + 1) - def test_apply_dont_convert_dtype(self): - s = Series(np.random.randn(10)) - - def f(x): - return x if x > 0 else np.nan - - result = s.apply(f, convert_dtype=False) - assert result.dtype == object - - def test_with_string_args(self, datetime_series): - - for arg in ["sum", "mean", "min", "max", "std"]: - result = datetime_series.apply(arg) - expected = getattr(datetime_series, arg)() - assert result == expected - - def test_apply_args(self): - s = Series(["foo,bar"]) - - result = s.apply(str.split, args=(",",)) - assert result[0] == ["foo", "bar"] - assert isinstance(result[0], list) - - def test_series_map_box_timestamps(self): - # GH#2689, GH#2627 - ser = Series(pd.date_range("1/1/2000", periods=10)) - - def func(x): - return (x.hour, x.day, x.month) - - # it works! - ser.map(func) - ser.apply(func) - - def test_apply_box(self): - # ufunc will not be boxed. 
Same test cases as the test_map_box - vals = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")] - s = Series(vals) - assert s.dtype == "datetime64[ns]" - # boxed value must be Timestamp instance - res = s.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}") - exp = Series(["Timestamp_1_None", "Timestamp_2_None"]) - tm.assert_series_equal(res, exp) - - vals = [ - pd.Timestamp("2011-01-01", tz="US/Eastern"), - pd.Timestamp("2011-01-02", tz="US/Eastern"), - ] - s = Series(vals) - assert s.dtype == "datetime64[ns, US/Eastern]" - res = s.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}") - exp = Series(["Timestamp_1_US/Eastern", "Timestamp_2_US/Eastern"]) - tm.assert_series_equal(res, exp) - - # timedelta - vals = [pd.Timedelta("1 days"), pd.Timedelta("2 days")] - s = Series(vals) - assert s.dtype == "timedelta64[ns]" - res = s.apply(lambda x: f"{type(x).__name__}_{x.days}") - exp = Series(["Timedelta_1", "Timedelta_2"]) - tm.assert_series_equal(res, exp) - - # period - vals = [pd.Period("2011-01-01", freq="M"), pd.Period("2011-01-02", freq="M")] - s = Series(vals) - assert s.dtype == "Period[M]" - res = s.apply(lambda x: f"{type(x).__name__}_{x.freqstr}") - exp = Series(["Period_M", "Period_M"]) - tm.assert_series_equal(res, exp) - - def test_apply_datetimetz(self): - values = pd.date_range("2011-01-01", "2011-01-02", freq="H").tz_localize( - "Asia/Tokyo" - ) - s = Series(values, name="XX") + result = s.apply(f) + expected = s.map(f) + tm.assert_series_equal(result, expected) - result = s.apply(lambda x: x + pd.offsets.Day()) - exp_values = pd.date_range("2011-01-02", "2011-01-03", freq="H").tz_localize( - "Asia/Tokyo" - ) - exp = Series(exp_values, name="XX") - tm.assert_series_equal(result, exp) - - # change dtype - # GH 14506 : Returned dtype changed from int32 to int64 - result = s.apply(lambda x: x.hour) - exp = Series(list(range(24)) + [0], name="XX", dtype=np.int64) - tm.assert_series_equal(result, exp) - - # not vectorized - def f(x): - if not 
isinstance(x, pd.Timestamp): - raise ValueError - return str(x.tz) - - result = s.map(f) - exp = Series(["Asia/Tokyo"] * 25, name="XX") - tm.assert_series_equal(result, exp) - - def test_apply_dict_depr(self): - - tsdf = DataFrame( - np.random.randn(10, 3), - columns=["A", "B", "C"], - index=pd.date_range("1/1/2000", periods=10), - ) - msg = "nested renamer is not supported" - with pytest.raises(SpecificationError, match=msg): - tsdf.A.agg({"foo": ["sum", "mean"]}) - - def test_apply_categorical(self): - values = pd.Categorical(list("ABBABCD"), categories=list("DCBA"), ordered=True) - ser = Series(values, name="XX", index=list("abcdefg")) - result = ser.apply(lambda x: x.lower()) - - # should be categorical dtype when the number of categories are - # the same - values = pd.Categorical(list("abbabcd"), categories=list("dcba"), ordered=True) - exp = Series(values, name="XX", index=list("abcdefg")) - tm.assert_series_equal(result, exp) - tm.assert_categorical_equal(result.values, exp.values) - - result = ser.apply(lambda x: "A") - exp = Series(["A"] * 7, name="XX", index=list("abcdefg")) - tm.assert_series_equal(result, exp) - assert result.dtype == object - - @pytest.mark.parametrize("series", [["1-1", "1-1", np.NaN], ["1-1", "1-2", np.NaN]]) - def test_apply_categorical_with_nan_values(self, series): - # GH 20714 bug fixed in: GH 24275 - s = Series(series, dtype="category") - result = s.apply(lambda x: x.split("-")[0]) - result = result.astype(object) - expected = Series(["1", "1", np.NaN], dtype="category") - expected = expected.astype(object) - tm.assert_series_equal(result, expected) + s = Series([1, 2, 3]) + result = s.apply(f) + expected = s.map(f) + tm.assert_series_equal(result, expected) - def test_apply_empty_integer_series_with_datetime_index(self): - # GH 21245 - s = Series([], index=pd.date_range(start="2018-01-01", periods=0), dtype=int) - result = s.apply(lambda x: x) - tm.assert_series_equal(result, s) +def test_apply_dont_convert_dtype(): + s = 
Series(np.random.randn(10)) -class TestSeriesAggregate: - def test_transform(self, string_series): - # transforming functions + def f(x): + return x if x > 0 else np.nan - with np.errstate(all="ignore"): + result = s.apply(f, convert_dtype=False) + assert result.dtype == object - f_sqrt = np.sqrt(string_series) - f_abs = np.abs(string_series) - - # ufunc - result = string_series.apply(np.sqrt) - expected = f_sqrt.copy() - tm.assert_series_equal(result, expected) - - # list-like - result = string_series.apply([np.sqrt]) - expected = f_sqrt.to_frame().copy() - expected.columns = ["sqrt"] - tm.assert_frame_equal(result, expected) - - result = string_series.apply(["sqrt"]) - tm.assert_frame_equal(result, expected) - - # multiple items in list - # these are in the order as if we are applying both functions per - # series and then concatting - expected = pd.concat([f_sqrt, f_abs], axis=1) - expected.columns = ["sqrt", "absolute"] - result = string_series.apply([np.sqrt, np.abs]) - tm.assert_frame_equal(result, expected) - - # dict, provide renaming - expected = pd.concat([f_sqrt, f_abs], axis=1) - expected.columns = ["foo", "bar"] - expected = expected.unstack().rename("series") - - result = string_series.apply({"foo": np.sqrt, "bar": np.abs}) - tm.assert_series_equal(result.reindex_like(expected), expected) - - def test_transform_and_agg_error(self, string_series): - # we are trying to transform with an aggregator - msg = "cannot combine transform and aggregation" - with pytest.raises(ValueError, match=msg): - with np.errstate(all="ignore"): - string_series.agg(["sqrt", "max"]) - - msg = "cannot perform both aggregation and transformation" - with pytest.raises(ValueError, match=msg): - with np.errstate(all="ignore"): - string_series.agg({"foo": np.sqrt, "bar": "sum"}) - - def test_demo(self): - # demonstration tests - s = Series(range(6), dtype="int64", name="series") - - result = s.agg(["min", "max"]) - expected = Series([0, 5], index=["min", "max"], name="series") - 
tm.assert_series_equal(result, expected) - result = s.agg({"foo": "min"}) - expected = Series([0], index=["foo"], name="series") - tm.assert_series_equal(result, expected) +def test_with_string_args(datetime_series): + + for arg in ["sum", "mean", "min", "max", "std"]: + result = datetime_series.apply(arg) + expected = getattr(datetime_series, arg)() + assert result == expected - # nested renaming - msg = "nested renamer is not supported" - with pytest.raises(SpecificationError, match=msg): - s.agg({"foo": ["min", "max"]}) - def test_multiple_aggregators_with_dict_api(self): +def test_apply_args(): + s = Series(["foo,bar"]) + + result = s.apply(str.split, args=(",",)) + assert result[0] == ["foo", "bar"] + assert isinstance(result[0], list) + + +def test_series_map_box_timestamps(): + # GH#2689, GH#2627 + ser = Series(pd.date_range("1/1/2000", periods=10)) + + def func(x): + return (x.hour, x.day, x.month) + + # it works! + ser.map(func) + ser.apply(func) + + +def test_apply_box(): + # ufunc will not be boxed. 
Same test cases as the test_map_box + vals = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")] + s = Series(vals) + assert s.dtype == "datetime64[ns]" + # boxed value must be Timestamp instance + res = s.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}") + exp = Series(["Timestamp_1_None", "Timestamp_2_None"]) + tm.assert_series_equal(res, exp) + + vals = [ + pd.Timestamp("2011-01-01", tz="US/Eastern"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + ] + s = Series(vals) + assert s.dtype == "datetime64[ns, US/Eastern]" + res = s.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}") + exp = Series(["Timestamp_1_US/Eastern", "Timestamp_2_US/Eastern"]) + tm.assert_series_equal(res, exp) + + # timedelta + vals = [pd.Timedelta("1 days"), pd.Timedelta("2 days")] + s = Series(vals) + assert s.dtype == "timedelta64[ns]" + res = s.apply(lambda x: f"{type(x).__name__}_{x.days}") + exp = Series(["Timedelta_1", "Timedelta_2"]) + tm.assert_series_equal(res, exp) + + # period + vals = [pd.Period("2011-01-01", freq="M"), pd.Period("2011-01-02", freq="M")] + s = Series(vals) + assert s.dtype == "Period[M]" + res = s.apply(lambda x: f"{type(x).__name__}_{x.freqstr}") + exp = Series(["Period_M", "Period_M"]) + tm.assert_series_equal(res, exp) + + +def test_apply_datetimetz(): + values = pd.date_range("2011-01-01", "2011-01-02", freq="H").tz_localize( + "Asia/Tokyo" + ) + s = Series(values, name="XX") - s = Series(range(6), dtype="int64", name="series") - # nested renaming - msg = "nested renamer is not supported" - with pytest.raises(SpecificationError, match=msg): - s.agg({"foo": ["min", "max"], "bar": ["sum", "mean"]}) + result = s.apply(lambda x: x + pd.offsets.Day()) + exp_values = pd.date_range("2011-01-02", "2011-01-03", freq="H").tz_localize( + "Asia/Tokyo" + ) + exp = Series(exp_values, name="XX") + tm.assert_series_equal(result, exp) - def test_agg_apply_evaluate_lambdas_the_same(self, string_series): - # test that we are evaluating row-by-row first - # before 
vectorized evaluation - result = string_series.apply(lambda x: str(x)) - expected = string_series.agg(lambda x: str(x)) - tm.assert_series_equal(result, expected) + # change dtype + # GH 14506 : Returned dtype changed from int32 to int64 + result = s.apply(lambda x: x.hour) + exp = Series(list(range(24)) + [0], name="XX", dtype=np.int64) + tm.assert_series_equal(result, exp) + + # not vectorized + def f(x): + if not isinstance(x, pd.Timestamp): + raise ValueError + return str(x.tz) + + result = s.map(f) + exp = Series(["Asia/Tokyo"] * 25, name="XX") + tm.assert_series_equal(result, exp) - result = string_series.apply(str) - expected = string_series.agg(str) + +def test_apply_dict_depr(): + + tsdf = DataFrame( + np.random.randn(10, 3), + columns=["A", "B", "C"], + index=pd.date_range("1/1/2000", periods=10), + ) + msg = "nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + tsdf.A.agg({"foo": ["sum", "mean"]}) + + +def test_apply_categorical(): + values = pd.Categorical(list("ABBABCD"), categories=list("DCBA"), ordered=True) + ser = Series(values, name="XX", index=list("abcdefg")) + result = ser.apply(lambda x: x.lower()) + + # should be categorical dtype when the number of categories are + # the same + values = pd.Categorical(list("abbabcd"), categories=list("dcba"), ordered=True) + exp = Series(values, name="XX", index=list("abcdefg")) + tm.assert_series_equal(result, exp) + tm.assert_categorical_equal(result.values, exp.values) + + result = ser.apply(lambda x: "A") + exp = Series(["A"] * 7, name="XX", index=list("abcdefg")) + tm.assert_series_equal(result, exp) + assert result.dtype == object + + +@pytest.mark.parametrize("series", [["1-1", "1-1", np.NaN], ["1-1", "1-2", np.NaN]]) +def test_apply_categorical_with_nan_values(series): + # GH 20714 bug fixed in: GH 24275 + s = Series(series, dtype="category") + result = s.apply(lambda x: x.split("-")[0]) + result = result.astype(object) + expected = Series(["1", "1", np.NaN], 
dtype="category") + expected = expected.astype(object) + tm.assert_series_equal(result, expected) + + +def test_apply_empty_integer_series_with_datetime_index(): + # GH 21245 + s = Series([], index=pd.date_range(start="2018-01-01", periods=0), dtype=int) + result = s.apply(lambda x: x) + tm.assert_series_equal(result, s) + + +def test_transform(string_series): + # transforming functions + + with np.errstate(all="ignore"): + + f_sqrt = np.sqrt(string_series) + f_abs = np.abs(string_series) + + # ufunc + result = string_series.apply(np.sqrt) + expected = f_sqrt.copy() tm.assert_series_equal(result, expected) - def test_with_nested_series(self, datetime_series): - # GH 2316 - # .agg with a reducer and a transform, what to do - result = datetime_series.apply( - lambda x: Series([x, x ** 2], index=["x", "x^2"]) - ) - expected = DataFrame({"x": datetime_series, "x^2": datetime_series ** 2}) + # list-like + result = string_series.apply([np.sqrt]) + expected = f_sqrt.to_frame().copy() + expected.columns = ["sqrt"] tm.assert_frame_equal(result, expected) - result = datetime_series.agg(lambda x: Series([x, x ** 2], index=["x", "x^2"])) + result = string_series.apply(["sqrt"]) tm.assert_frame_equal(result, expected) - def test_replicate_describe(self, string_series): - # this also tests a result set that is all scalars - expected = string_series.describe() - result = string_series.apply( - { - "count": "count", - "mean": "mean", - "std": "std", - "min": "min", - "25%": lambda x: x.quantile(0.25), - "50%": "median", - "75%": lambda x: x.quantile(0.75), - "max": "max", - } - ) - tm.assert_series_equal(result, expected) + # multiple items in list + # these are in the order as if we are applying both functions per + # series and then concatting + expected = pd.concat([f_sqrt, f_abs], axis=1) + expected.columns = ["sqrt", "absolute"] + result = string_series.apply([np.sqrt, np.abs]) + tm.assert_frame_equal(result, expected) - def test_reduce(self, string_series): - # reductions 
with named functions - result = string_series.agg(["sum", "mean"]) - expected = Series( - [string_series.sum(), string_series.mean()], - ["sum", "mean"], - name=string_series.name, - ) - tm.assert_series_equal(result, expected) + # dict, provide renaming + expected = pd.concat([f_sqrt, f_abs], axis=1) + expected.columns = ["foo", "bar"] + expected = expected.unstack().rename("series") - @pytest.mark.parametrize("how", ["agg", "apply"]) - def test_non_callable_aggregates(self, how): - # test agg using non-callable series attributes - # GH 39116 - expand to apply - s = Series([1, 2, None]) + result = string_series.apply({"foo": np.sqrt, "bar": np.abs}) + tm.assert_series_equal(result.reindex_like(expected), expected) - # Calling agg w/ just a string arg same as calling s.arg - result = getattr(s, how)("size") - expected = s.size + +def test_transform_and_agg_error(string_series): + # we are trying to transform with an aggregator + msg = "cannot combine transform and aggregation" + with pytest.raises(ValueError, match=msg): + with np.errstate(all="ignore"): + string_series.agg(["sqrt", "max"]) + + msg = "cannot perform both aggregation and transformation" + with pytest.raises(ValueError, match=msg): + with np.errstate(all="ignore"): + string_series.agg({"foo": np.sqrt, "bar": "sum"}) + + +def test_demo(): + # demonstration tests + s = Series(range(6), dtype="int64", name="series") + + result = s.agg(["min", "max"]) + expected = Series([0, 5], index=["min", "max"], name="series") + tm.assert_series_equal(result, expected) + + result = s.agg({"foo": "min"}) + expected = Series([0], index=["foo"], name="series") + tm.assert_series_equal(result, expected) + + # nested renaming + msg = "nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + s.agg({"foo": ["min", "max"]}) + + +def test_multiple_aggregators_with_dict_api(): + + s = Series(range(6), dtype="int64", name="series") + # nested renaming + msg = "nested renamer is not supported" + 
with pytest.raises(SpecificationError, match=msg): + s.agg({"foo": ["min", "max"], "bar": ["sum", "mean"]}) + + +def test_agg_apply_evaluate_lambdas_the_same(string_series): + # test that we are evaluating row-by-row first + # before vectorized evaluation + result = string_series.apply(lambda x: str(x)) + expected = string_series.agg(lambda x: str(x)) + tm.assert_series_equal(result, expected) + + result = string_series.apply(str) + expected = string_series.agg(str) + tm.assert_series_equal(result, expected) + + +def test_with_nested_series(datetime_series): + # GH 2316 + # .agg with a reducer and a transform, what to do + result = datetime_series.apply(lambda x: Series([x, x ** 2], index=["x", "x^2"])) + expected = DataFrame({"x": datetime_series, "x^2": datetime_series ** 2}) + tm.assert_frame_equal(result, expected) + + result = datetime_series.agg(lambda x: Series([x, x ** 2], index=["x", "x^2"])) + tm.assert_frame_equal(result, expected) + + +def test_replicate_describe(string_series): + # this also tests a result set that is all scalars + expected = string_series.describe() + result = string_series.apply( + { + "count": "count", + "mean": "mean", + "std": "std", + "min": "min", + "25%": lambda x: x.quantile(0.25), + "50%": "median", + "75%": lambda x: x.quantile(0.75), + "max": "max", + } + ) + tm.assert_series_equal(result, expected) + + +def test_reduce(string_series): + # reductions with named functions + result = string_series.agg(["sum", "mean"]) + expected = Series( + [string_series.sum(), string_series.mean()], + ["sum", "mean"], + name=string_series.name, + ) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("how", ["agg", "apply"]) +def test_non_callable_aggregates(how): + # test agg using non-callable series attributes + # GH 39116 - expand to apply + s = Series([1, 2, None]) + + # Calling agg w/ just a string arg same as calling s.arg + result = getattr(s, how)("size") + expected = s.size + assert result == expected + + # 
test when mixed w/ callable reducers + result = getattr(s, how)(["size", "count", "mean"]) + expected = Series({"size": 3.0, "count": 2.0, "mean": 1.5}) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "series, func, expected", + chain( + tm.get_cython_table_params( + Series(dtype=np.float64), + [ + ("sum", 0), + ("max", np.nan), + ("min", np.nan), + ("all", True), + ("any", False), + ("mean", np.nan), + ("prod", 1), + ("std", np.nan), + ("var", np.nan), + ("median", np.nan), + ], + ), + tm.get_cython_table_params( + Series([np.nan, 1, 2, 3]), + [ + ("sum", 6), + ("max", 3), + ("min", 1), + ("all", True), + ("any", True), + ("mean", 2), + ("prod", 6), + ("std", 1), + ("var", 1), + ("median", 2), + ], + ), + tm.get_cython_table_params( + Series("a b c".split()), + [ + ("sum", "abc"), + ("max", "c"), + ("min", "a"), + ("all", "c"), # see GH12863 + ("any", "a"), + ], + ), + ), +) +def test_agg_cython_table(series, func, expected): + # GH21224 + # test reducing functions in + # pandas.core.base.SelectionMixin._cython_table + result = series.agg(func) + if is_number(expected): + assert np.isclose(result, expected, equal_nan=True) + else: assert result == expected - # test when mixed w/ callable reducers - result = getattr(s, how)(["size", "count", "mean"]) - expected = Series({"size": 3.0, "count": 2.0, "mean": 1.5}) - tm.assert_series_equal(result, expected) - @pytest.mark.parametrize( - "series, func, expected", - chain( - tm.get_cython_table_params( - Series(dtype=np.float64), - [ - ("sum", 0), - ("max", np.nan), - ("min", np.nan), - ("all", True), - ("any", False), - ("mean", np.nan), - ("prod", 1), - ("std", np.nan), - ("var", np.nan), - ("median", np.nan), - ], - ), - tm.get_cython_table_params( - Series([np.nan, 1, 2, 3]), - [ - ("sum", 6), - ("max", 3), - ("min", 1), - ("all", True), - ("any", True), - ("mean", 2), - ("prod", 6), - ("std", 1), - ("var", 1), - ("median", 2), - ], - ), - tm.get_cython_table_params( - Series("a b 
c".split()), - [ - ("sum", "abc"), - ("max", "c"), - ("min", "a"), - ("all", "c"), # see GH12863 - ("any", "a"), - ], - ), +@pytest.mark.parametrize( + "series, func, expected", + chain( + tm.get_cython_table_params( + Series(dtype=np.float64), + [ + ("cumprod", Series([], Index([]), dtype=np.float64)), + ("cumsum", Series([], Index([]), dtype=np.float64)), + ], ), - ) - def test_agg_cython_table(self, series, func, expected): - # GH21224 - # test reducing functions in - # pandas.core.base.SelectionMixin._cython_table - result = series.agg(func) - if is_number(expected): - assert np.isclose(result, expected, equal_nan=True) - else: - assert result == expected - - @pytest.mark.parametrize( - "series, func, expected", - chain( - tm.get_cython_table_params( - Series(dtype=np.float64), - [ - ("cumprod", Series([], Index([]), dtype=np.float64)), - ("cumsum", Series([], Index([]), dtype=np.float64)), - ], - ), - tm.get_cython_table_params( - Series([np.nan, 1, 2, 3]), - [ - ("cumprod", Series([np.nan, 1, 2, 6])), - ("cumsum", Series([np.nan, 1, 3, 6])), - ], - ), - tm.get_cython_table_params( - Series("a b c".split()), [("cumsum", Series(["a", "ab", "abc"]))] - ), + tm.get_cython_table_params( + Series([np.nan, 1, 2, 3]), + [ + ("cumprod", Series([np.nan, 1, 2, 6])), + ("cumsum", Series([np.nan, 1, 3, 6])), + ], ), + tm.get_cython_table_params( + Series("a b c".split()), [("cumsum", Series(["a", "ab", "abc"]))] + ), + ), +) +def test_agg_cython_table_transform(series, func, expected): + # GH21224 + # test transforming functions in + # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum) + result = series.agg(func) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "series, func, expected", + chain( + tm.get_cython_table_params( + Series("a b c".split()), + [ + ("mean", TypeError), # mean raises TypeError + ("prod", TypeError), + ("std", TypeError), + ("var", TypeError), + ("median", TypeError), + ("cumprod", TypeError), + ], + ) + ), 
+) +def test_agg_cython_table_raises(series, func, expected): + # GH21224 + msg = r"[Cc]ould not convert|can't multiply sequence by non-int of type" + with pytest.raises(expected, match=msg): + # e.g. Series('a b'.split()).cumprod() will raise + series.agg(func) + + +def test_series_apply_no_suffix_index(): + # GH36189 + s = Series([4] * 3) + result = s.apply(["sum", lambda x: x.sum(), lambda x: x.sum()]) + expected = Series([12, 12, 12], index=["sum", "", ""]) + + tm.assert_series_equal(result, expected) + + +def test_map(datetime_series): + index, data = tm.getMixedTypeDict() + + source = Series(data["B"], index=data["C"]) + target = Series(data["C"][:4], index=data["D"][:4]) + + merged = target.map(source) + + for k, v in merged.items(): + assert v == source[target[k]] + + # input could be a dict + merged = target.map(source.to_dict()) + + for k, v in merged.items(): + assert v == source[target[k]] + + # function + result = datetime_series.map(lambda x: x * 2) + tm.assert_series_equal(result, datetime_series * 2) + + # GH 10324 + a = Series([1, 2, 3, 4]) + b = Series(["even", "odd", "even", "odd"], dtype="category") + c = Series(["even", "odd", "even", "odd"]) + + exp = Series(["odd", "even", "odd", np.nan], dtype="category") + tm.assert_series_equal(a.map(b), exp) + exp = Series(["odd", "even", "odd", np.nan]) + tm.assert_series_equal(a.map(c), exp) + + a = Series(["a", "b", "c", "d"]) + b = Series([1, 2, 3, 4], index=pd.CategoricalIndex(["b", "c", "d", "e"])) + c = Series([1, 2, 3, 4], index=Index(["b", "c", "d", "e"])) + + exp = Series([np.nan, 1, 2, 3]) + tm.assert_series_equal(a.map(b), exp) + exp = Series([np.nan, 1, 2, 3]) + tm.assert_series_equal(a.map(c), exp) + + a = Series(["a", "b", "c", "d"]) + b = Series( + ["B", "C", "D", "E"], + dtype="category", + index=pd.CategoricalIndex(["b", "c", "d", "e"]), ) - def test_agg_cython_table_transform(self, series, func, expected): - # GH21224 - # test transforming functions in - # 
pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum) - result = series.agg(func) - tm.assert_series_equal(result, expected) + c = Series(["B", "C", "D", "E"], index=Index(["b", "c", "d", "e"])) - @pytest.mark.parametrize( - "series, func, expected", - chain( - tm.get_cython_table_params( - Series("a b c".split()), - [ - ("mean", TypeError), # mean raises TypeError - ("prod", TypeError), - ("std", TypeError), - ("var", TypeError), - ("median", TypeError), - ("cumprod", TypeError), - ], - ) - ), + exp = Series( + pd.Categorical([np.nan, "B", "C", "D"], categories=["B", "C", "D", "E"]) ) - def test_agg_cython_table_raises(self, series, func, expected): - # GH21224 - msg = r"[Cc]ould not convert|can't multiply sequence by non-int of type" - with pytest.raises(expected, match=msg): - # e.g. Series('a b'.split()).cumprod() will raise - series.agg(func) - - def test_series_apply_no_suffix_index(self): - # GH36189 - s = Series([4] * 3) - result = s.apply(["sum", lambda x: x.sum(), lambda x: x.sum()]) - expected = Series([12, 12, 12], index=["sum", "", ""]) + tm.assert_series_equal(a.map(b), exp) + exp = Series([np.nan, "B", "C", "D"]) + tm.assert_series_equal(a.map(c), exp) - tm.assert_series_equal(result, expected) +def test_map_empty(index): + if isinstance(index, MultiIndex): + pytest.skip("Initializing a Series from a MultiIndex is not supported") -class TestSeriesMap: - def test_map(self, datetime_series): - index, data = tm.getMixedTypeDict() + s = Series(index) + result = s.map({}) - source = Series(data["B"], index=data["C"]) - target = Series(data["C"][:4], index=data["D"][:4]) + expected = Series(np.nan, index=s.index) + tm.assert_series_equal(result, expected) - merged = target.map(source) - for k, v in merged.items(): - assert v == source[target[k]] +def test_map_compat(): + # related GH 8024 + s = Series([True, True, False], index=[1, 2, 3]) + result = s.map({True: "foo", False: "bar"}) + expected = Series(["foo", "foo", "bar"], index=[1, 2, 3]) + 
tm.assert_series_equal(result, expected) - # input could be a dict - merged = target.map(source.to_dict()) - for k, v in merged.items(): - assert v == source[target[k]] +def test_map_int(): + left = Series({"a": 1.0, "b": 2.0, "c": 3.0, "d": 4}) + right = Series({1: 11, 2: 22, 3: 33}) - # function - result = datetime_series.map(lambda x: x * 2) - tm.assert_series_equal(result, datetime_series * 2) + assert left.dtype == np.float_ + assert issubclass(right.dtype.type, np.integer) - # GH 10324 - a = Series([1, 2, 3, 4]) - b = Series(["even", "odd", "even", "odd"], dtype="category") - c = Series(["even", "odd", "even", "odd"]) + merged = left.map(right) + assert merged.dtype == np.float_ + assert isna(merged["d"]) + assert not isna(merged["c"]) - exp = Series(["odd", "even", "odd", np.nan], dtype="category") - tm.assert_series_equal(a.map(b), exp) - exp = Series(["odd", "even", "odd", np.nan]) - tm.assert_series_equal(a.map(c), exp) - a = Series(["a", "b", "c", "d"]) - b = Series([1, 2, 3, 4], index=pd.CategoricalIndex(["b", "c", "d", "e"])) - c = Series([1, 2, 3, 4], index=Index(["b", "c", "d", "e"])) +def test_map_type_inference(): + s = Series(range(3)) + s2 = s.map(lambda x: np.where(x == 0, 0, 1)) + assert issubclass(s2.dtype.type, np.integer) - exp = Series([np.nan, 1, 2, 3]) - tm.assert_series_equal(a.map(b), exp) - exp = Series([np.nan, 1, 2, 3]) - tm.assert_series_equal(a.map(c), exp) - a = Series(["a", "b", "c", "d"]) - b = Series( - ["B", "C", "D", "E"], - dtype="category", - index=pd.CategoricalIndex(["b", "c", "d", "e"]), - ) - c = Series(["B", "C", "D", "E"], index=Index(["b", "c", "d", "e"])) +def test_map_decimal(string_series): + from decimal import Decimal - exp = Series( - pd.Categorical([np.nan, "B", "C", "D"], categories=["B", "C", "D", "E"]) - ) - tm.assert_series_equal(a.map(b), exp) - exp = Series([np.nan, "B", "C", "D"]) - tm.assert_series_equal(a.map(c), exp) + result = string_series.map(lambda x: Decimal(str(x))) + assert result.dtype == 
np.object_ + assert isinstance(result[0], Decimal) - def test_map_empty(self, index): - if isinstance(index, MultiIndex): - pytest.skip("Initializing a Series from a MultiIndex is not supported") - s = Series(index) - result = s.map({}) +def test_map_na_exclusion(): + s = Series([1.5, np.nan, 3, np.nan, 5]) - expected = Series(np.nan, index=s.index) - tm.assert_series_equal(result, expected) + result = s.map(lambda x: x * 2, na_action="ignore") + exp = s * 2 + tm.assert_series_equal(result, exp) - def test_map_compat(self): - # related GH 8024 - s = Series([True, True, False], index=[1, 2, 3]) - result = s.map({True: "foo", False: "bar"}) - expected = Series(["foo", "foo", "bar"], index=[1, 2, 3]) - tm.assert_series_equal(result, expected) - def test_map_int(self): - left = Series({"a": 1.0, "b": 2.0, "c": 3.0, "d": 4}) - right = Series({1: 11, 2: 22, 3: 33}) - - assert left.dtype == np.float_ - assert issubclass(right.dtype.type, np.integer) - - merged = left.map(right) - assert merged.dtype == np.float_ - assert isna(merged["d"]) - assert not isna(merged["c"]) - - def test_map_type_inference(self): - s = Series(range(3)) - s2 = s.map(lambda x: np.where(x == 0, 0, 1)) - assert issubclass(s2.dtype.type, np.integer) - - def test_map_decimal(self, string_series): - from decimal import Decimal - - result = string_series.map(lambda x: Decimal(str(x))) - assert result.dtype == np.object_ - assert isinstance(result[0], Decimal) - - def test_map_na_exclusion(self): - s = Series([1.5, np.nan, 3, np.nan, 5]) - - result = s.map(lambda x: x * 2, na_action="ignore") - exp = s * 2 - tm.assert_series_equal(result, exp) - - def test_map_dict_with_tuple_keys(self): - """ - Due to new MultiIndex-ing behaviour in v0.14.0, - dicts with tuple keys passed to map were being - converted to a multi-index, preventing tuple values - from being mapped properly. 
- """ - # GH 18496 - df = DataFrame({"a": [(1,), (2,), (3, 4), (5, 6)]}) - label_mappings = {(1,): "A", (2,): "B", (3, 4): "A", (5, 6): "B"} - - df["labels"] = df["a"].map(label_mappings) - df["expected_labels"] = Series(["A", "B", "A", "B"], index=df.index) - # All labels should be filled now - tm.assert_series_equal(df["labels"], df["expected_labels"], check_names=False) - - def test_map_counter(self): - s = Series(["a", "b", "c"], index=[1, 2, 3]) - counter = Counter() - counter["b"] = 5 - counter["c"] += 1 - result = s.map(counter) - expected = Series([0, 5, 1], index=[1, 2, 3]) - tm.assert_series_equal(result, expected) +def test_map_dict_with_tuple_keys(): + """ + Due to new MultiIndex-ing behaviour in v0.14.0, + dicts with tuple keys passed to map were being + converted to a multi-index, preventing tuple values + from being mapped properly. + """ + # GH 18496 + df = DataFrame({"a": [(1,), (2,), (3, 4), (5, 6)]}) + label_mappings = {(1,): "A", (2,): "B", (3, 4): "A", (5, 6): "B"} - def test_map_defaultdict(self): - s = Series([1, 2, 3], index=["a", "b", "c"]) - default_dict = defaultdict(lambda: "blank") - default_dict[1] = "stuff" - result = s.map(default_dict) - expected = Series(["stuff", "blank", "blank"], index=["a", "b", "c"]) - tm.assert_series_equal(result, expected) + df["labels"] = df["a"].map(label_mappings) + df["expected_labels"] = Series(["A", "B", "A", "B"], index=df.index) + # All labels should be filled now + tm.assert_series_equal(df["labels"], df["expected_labels"], check_names=False) - def test_map_dict_na_key(self): - # https://github.com/pandas-dev/pandas/issues/17648 - # Checks that np.nan key is appropriately mapped - s = Series([1, 2, np.nan]) - expected = Series(["a", "b", "c"]) - result = s.map({1: "a", 2: "b", np.nan: "c"}) - tm.assert_series_equal(result, expected) - def test_map_dict_subclass_with_missing(self): - """ - Test Series.map with a dictionary subclass that defines __missing__, - i.e. sets a default value (GH #15999). 
- """ +def test_map_counter(): + s = Series(["a", "b", "c"], index=[1, 2, 3]) + counter = Counter() + counter["b"] = 5 + counter["c"] += 1 + result = s.map(counter) + expected = Series([0, 5, 1], index=[1, 2, 3]) + tm.assert_series_equal(result, expected) - class DictWithMissing(dict): - def __missing__(self, key): - return "missing" - s = Series([1, 2, 3]) - dictionary = DictWithMissing({3: "three"}) - result = s.map(dictionary) - expected = Series(["missing", "missing", "three"]) - tm.assert_series_equal(result, expected) +def test_map_defaultdict(): + s = Series([1, 2, 3], index=["a", "b", "c"]) + default_dict = defaultdict(lambda: "blank") + default_dict[1] = "stuff" + result = s.map(default_dict) + expected = Series(["stuff", "blank", "blank"], index=["a", "b", "c"]) + tm.assert_series_equal(result, expected) - def test_map_dict_subclass_without_missing(self): - class DictWithoutMissing(dict): - pass - s = Series([1, 2, 3]) - dictionary = DictWithoutMissing({3: "three"}) - result = s.map(dictionary) - expected = Series([np.nan, np.nan, "three"]) - tm.assert_series_equal(result, expected) +def test_map_dict_na_key(): + # https://github.com/pandas-dev/pandas/issues/17648 + # Checks that np.nan key is appropriately mapped + s = Series([1, 2, np.nan]) + expected = Series(["a", "b", "c"]) + result = s.map({1: "a", 2: "b", np.nan: "c"}) + tm.assert_series_equal(result, expected) - def test_map_abc_mapping(self, non_dict_mapping_subclass): - # https://github.com/pandas-dev/pandas/issues/29733 - # Check collections.abc.Mapping support as mapper for Series.map - s = Series([1, 2, 3]) - not_a_dictionary = non_dict_mapping_subclass({3: "three"}) - result = s.map(not_a_dictionary) - expected = Series([np.nan, np.nan, "three"]) - tm.assert_series_equal(result, expected) - def test_map_abc_mapping_with_missing(self, non_dict_mapping_subclass): - # https://github.com/pandas-dev/pandas/issues/29733 - # Check collections.abc.Mapping support as mapper for Series.map - class 
def test_map_dict_subclass_with_missing():
    """
    Test Series.map with a dictionary subclass that defines __missing__,
    i.e. sets a default value (GH #15999).
    """

    class DictWithMissing(dict):
        def __missing__(self, key):
            return "missing"

    s = Series([1, 2, 3])
    dictionary = DictWithMissing({3: "three"})
    result = s.map(dictionary)
    # keys absent from the dict fall back to __missing__'s default
    expected = Series(["missing", "missing", "three"])
    tm.assert_series_equal(result, expected)


def test_map_dict_subclass_without_missing():
    # A plain dict subclass without __missing__ maps absent keys to NaN.
    class DictWithoutMissing(dict):
        pass

    s = Series([1, 2, 3])
    dictionary = DictWithoutMissing({3: "three"})
    result = s.map(dictionary)
    expected = Series([np.nan, np.nan, "three"])
    tm.assert_series_equal(result, expected)


def test_map_abc_mapping(non_dict_mapping_subclass):
    # https://github.com/pandas-dev/pandas/issues/29733
    # Check collections.abc.Mapping support as mapper for Series.map
    s = Series([1, 2, 3])
    not_a_dictionary = non_dict_mapping_subclass({3: "three"})
    result = s.map(not_a_dictionary)
    expected = Series([np.nan, np.nan, "three"])
    tm.assert_series_equal(result, expected)


def test_map_abc_mapping_with_missing(non_dict_mapping_subclass):
    # https://github.com/pandas-dev/pandas/issues/29733
    # Check collections.abc.Mapping support as mapper for Series.map
    class NonDictMappingWithMissing(non_dict_mapping_subclass):
        # BUGFIX: __missing__ must take (self, key); the instance-less
        # `def __missing__(key)` would raise TypeError if ever invoked.
        def __missing__(self, key):
            return "missing"

    s = Series([1, 2, 3])
    not_a_dictionary = NonDictMappingWithMissing({3: "three"})
    result = s.map(not_a_dictionary)
    # __missing__ is a dict concept, not a Mapping concept,
    # so it should not change the result!
    expected = Series([np.nan, np.nan, "three"])
    tm.assert_series_equal(result, expected)


def test_map_box():
    # boxed value passed to the applied function must be a Timestamp instance
    vals = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")]
    s = Series(vals)
    assert s.dtype == "datetime64[ns]"
    res = s.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}")
    exp = Series(["Timestamp_1_None", "Timestamp_2_None"])
    tm.assert_series_equal(res, exp)

    vals = [
        pd.Timestamp("2011-01-01", tz="US/Eastern"),
        pd.Timestamp("2011-01-02", tz="US/Eastern"),
    ]
    s = Series(vals)
    assert s.dtype == "datetime64[ns, US/Eastern]"
    res = s.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}")
    exp = Series(["Timestamp_1_US/Eastern", "Timestamp_2_US/Eastern"])
    tm.assert_series_equal(res, exp)

    # timedelta
    vals = [pd.Timedelta("1 days"), pd.Timedelta("2 days")]
    s = Series(vals)
    assert s.dtype == "timedelta64[ns]"
    res = s.apply(lambda x: f"{type(x).__name__}_{x.days}")
    exp = Series(["Timedelta_1", "Timedelta_2"])
    tm.assert_series_equal(res, exp)

    # period
    vals = [pd.Period("2011-01-01", freq="M"), pd.Period("2011-01-02", freq="M")]
    s = Series(vals)
    assert s.dtype == "Period[M]"
    res = s.apply(lambda x: f"{type(x).__name__}_{x.freqstr}")
    exp = Series(["Period_M", "Period_M"])
    tm.assert_series_equal(res, exp)


def test_map_categorical():
    # mapping over a categorical Series preserves categories/order for
    # category-to-category results, falls back to object otherwise
    values = pd.Categorical(list("ABBABCD"), categories=list("DCBA"), ordered=True)
    s = Series(values, name="XX", index=list("abcdefg"))

    result = s.map(lambda x: x.lower())
    exp_values = pd.Categorical(list("abbabcd"), categories=list("dcba"), ordered=True)
    exp = Series(exp_values, name="XX", index=list("abcdefg"))
    tm.assert_series_equal(result, exp)
    tm.assert_categorical_equal(result.values, exp_values)

    result = s.map(lambda x: "A")
    exp = Series(["A"] * 7, name="XX", index=list("abcdefg"))
    tm.assert_series_equal(result, exp)
    assert result.dtype == object

    # na_action="ignore" is not implemented for categorical dtype
    with pytest.raises(NotImplementedError, match=tm.EMPTY_STRING_PATTERN):
        s.map(lambda x: x, na_action="ignore")


def test_map_datetimetz():
    values = pd.date_range("2011-01-01", "2011-01-02", freq="H").tz_localize(
        "Asia/Tokyo"
    )
    s = Series(values, name="XX")

    # keep tz
    result = s.map(lambda x: x + pd.offsets.Day())
    exp_values = pd.date_range("2011-01-02", "2011-01-03", freq="H").tz_localize(
        "Asia/Tokyo"
    )
    exp = Series(exp_values, name="XX")
    tm.assert_series_equal(result, exp)

    # change dtype
    # GH 14506 : Returned dtype changed from int32 to int64
    result = s.map(lambda x: x.hour)
    exp = Series(list(range(24)) + [0], name="XX", dtype=np.int64)
    tm.assert_series_equal(result, exp)

    with pytest.raises(NotImplementedError, match=tm.EMPTY_STRING_PATTERN):
        s.map(lambda x: x, na_action="ignore")

    # not vectorized
    def f(x):
        if not isinstance(x, pd.Timestamp):
            raise ValueError
        return str(x.tz)

    result = s.map(f)
    exp = Series(["Asia/Tokyo"] * 25, name="XX")
    tm.assert_series_equal(result, exp)


@pytest.mark.parametrize(
    "vals,mapping,exp",
    [
        (list("abc"), {np.nan: "not NaN"}, [np.nan] * 3 + ["not NaN"]),
        (list("abc"), {"a": "a letter"}, ["a letter"] + [np.nan] * 3),
        (list(range(3)), {0: 42}, [42] + [np.nan] * 3),
    ],
)
def test_map_missing_mixed(vals, mapping, exp):
    # GH20495
    s = Series(vals + [np.nan])
    result = s.map(mapping)

    tm.assert_series_equal(result, Series(exp))


@pytest.mark.parametrize(
    "dti,exp",
    [
        (
            Series([1, 2], index=pd.DatetimeIndex([0, 31536000000])),
            DataFrame(np.repeat([[1, 2]], 2, axis=0), dtype="int64"),
        ),
        (
            tm.makeTimeSeries(nper=30),
            DataFrame(np.repeat([[1, 2]], 30, axis=0), dtype="int64"),
        ),
    ],
)
@pytest.mark.parametrize("aware", [True, False])
def test_apply_series_on_date_time_index_aware_series(dti, exp, aware):
    # GH 25959
    # Calling apply on a localized time series should not cause an error
    if aware:
        index = dti.tz_localize("UTC").index
    else:
        index = dti.index
    result = Series(index).apply(lambda x: Series([1, 2]))
    tm.assert_frame_equal(result, exp)


def test_apply_scaler_on_date_time_index_aware_series():
    # GH 25959
    # Calling apply on a localized time series should not cause an error
    series = tm.makeTimeSeries(nper=30).tz_localize("UTC")
    result = Series(series.index).apply(lambda x: 1)
    tm.assert_series_equal(result, Series(np.ones(30), dtype="int64"))


def test_map_float_to_string_precision():
    # GH 13228
    ser = Series(1 / 3)
    result = ser.map(lambda val: str(val)).to_dict()
    expected = {0: "0.3333333333333333"}
    assert result == expected


def test_map_with_invalid_na_action_raises():
    # https://github.com/pandas-dev/pandas/issues/32815
    s = Series([1, 2, 3])
    msg = "na_action must either be 'ignore' or None"
    with pytest.raises(ValueError, match=msg):
        s.map(lambda x: x, na_action="____")


def test_apply_to_timedelta():
    list_of_valid_strings = ["00:00:01", "00:00:02"]
    a = pd.to_timedelta(list_of_valid_strings)
    b = Series(list_of_valid_strings).apply(pd.to_timedelta)
    # FIXME: dont leave commented-out
    # Can't compare until apply on a Series gives the correct dtype
    # assert_series_equal(a, b)

    list_of_strings = ["00:00:01", np.nan, pd.NaT, pd.NaT]

    a = pd.to_timedelta(list_of_strings)  # noqa
    b = Series(list_of_strings).apply(pd.to_timedelta)  # noqa
    # Can't compare until apply on a Series gives the correct dtype
    # assert_series_equal(a, b)
def test_relabel_no_duplicated_method():
    # Named aggregation via keywords must match the equivalent dict form
    # when every method appears only once.
    df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4]})

    # (column, keyword-aggregations) pairs; kwargs order doubles as the
    # dict order for the expected result.
    cases = [
        ("A", {"foo": "sum"}),
        ("B", {"foo": "min", "bar": "max"}),
        ("B", {"foo": sum, "bar": min, "cat": "max"}),
    ]
    for column, aggs in cases:
        result = df[column].agg(**aggs)
        expected = df[column].agg(dict(aggs))
        tm.assert_series_equal(result, expected)


def test_relabel_duplicated_method():
    # With nested renaming, the same method may be used twice as long as
    # each use is assigned a distinct new name.
    df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4]})

    result = df["A"].agg(foo="sum", bar="sum")
    expected = pd.Series([6, 6], index=["foo", "bar"], name="A")
    tm.assert_series_equal(result, expected)

    result = df["B"].agg(foo=min, bar="min")
    expected = pd.Series([1, 1], index=["foo", "bar"], name="B")
    tm.assert_series_equal(result, expected)