BUG/REF: use rank_sorted_1d for rank_2d (#41931)

mzeitlin11 · web-flow · commit 7a38d63baaf9 · 2021-06-25T13:36:29.000-04:00
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -137,7 +137,8 @@ Timezones
 
 Numeric
 ^^^^^^^
--
+- Bug in :meth:`DataFrame.rank` raising ``ValueError`` with ``object`` columns and ``method="first"`` (:issue:`41931`)
+- Bug in :meth:`DataFrame.rank` treating missing values and extreme values as equal (for example ``np.nan`` and ``np.inf``), causing incorrect results when ``na_option="bottom"`` or ``na_option="top"`` used (:issue:`41931`)
 -
 
 Conversion
diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
@@ -1372,26 +1372,29 @@ def rank_2d(
     Fast NaN-friendly version of ``scipy.stats.rankdata``.
     """
     cdef:
-        Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0
-        Py_ssize_t infs
-        ndarray[float64_t, ndim=2] ranks
+        Py_ssize_t k, n, col
+        float64_t[::1, :] out  # Column-major so columns are contiguous
+        int64_t[::1, :] grp_sizes
+        const intp_t[:] labels
         ndarray[rank_t, ndim=2] values
-        ndarray[intp_t, ndim=2] argsort_indexer
-        ndarray[uint8_t, ndim=2] mask
-        rank_t val, nan_fill_val
-        float64_t count, sum_ranks = 0.0
-        int tiebreak = 0
-        int64_t idx
-        bint check_mask, condition, keep_na, nans_rank_highest
+        rank_t[:, :] masked_vals
+        intp_t[:, :] sort_indexer
+        uint8_t[:, :] mask
+        TiebreakEnumType tiebreak
+        bint check_mask, keep_na, nans_rank_highest
+        rank_t nan_fill_val
 
     tiebreak = tiebreakers[ties_method]
+    if tiebreak == TIEBREAK_FIRST:
+        if not ascending:
+            tiebreak = TIEBREAK_FIRST_DESCENDING
 
     keep_na = na_option == 'keep'
 
     # For cases where a mask is not possible, we can avoid mask checks
     check_mask = not (rank_t is uint64_t or (rank_t is int64_t and not is_datetimelike))
 
-    if axis == 0:
+    if axis == 1:
         values = np.asarray(in_arr).T.copy()
     else:
         values = np.asarray(in_arr).copy()
@@ -1403,99 +1406,62 @@ def rank_2d(
     nans_rank_highest = ascending ^ (na_option == 'top')
     if check_mask:
         nan_fill_val = get_rank_nan_fill_val[rank_t](nans_rank_highest)
+
         if rank_t is object:
-            mask = missing.isnaobj2d(values)
+            mask = missing.isnaobj2d(values).view(np.uint8)
         elif rank_t is float64_t:
-            mask = np.isnan(values)
+            mask = np.isnan(values).view(np.uint8)
 
         # int64 and datetimelike
         else:
-            mask = values == NPY_NAT
-
+            mask = (values == NPY_NAT).view(np.uint8)
         np.putmask(values, mask, nan_fill_val)
     else:
-        mask = np.zeros_like(values, dtype=bool)
+        mask = np.zeros_like(values, dtype=np.uint8)
+
+    if nans_rank_highest:
+        order = (values, mask)
+    else:
+        order = (values, ~np.asarray(mask))
 
     n, k = (<object>values).shape
-    ranks = np.empty((n, k), dtype='f8')
+    out = np.empty((n, k), dtype='f8', order='F')
+    grp_sizes = np.ones((n, k), dtype='i8', order='F')
+    labels = np.zeros(n, dtype=np.intp)
 
-    if tiebreak == TIEBREAK_FIRST:
-        # need to use a stable sort here
-        argsort_indexer = values.argsort(axis=1, kind='mergesort')
-        if not ascending:
-            tiebreak = TIEBREAK_FIRST_DESCENDING
+    # lexsort is slower, so only use if we need to worry about the mask
+    if check_mask:
+        sort_indexer = np.lexsort(order, axis=0).astype(np.intp, copy=False)
     else:
-        argsort_indexer = values.argsort(1)
+        kind = "stable" if ties_method == "first" else None
+        sort_indexer = values.argsort(axis=0, kind=kind).astype(np.intp, copy=False)
 
     if not ascending:
-        argsort_indexer = argsort_indexer[:, ::-1]
-
-    values = _take_2d(values, argsort_indexer)
+        sort_indexer = sort_indexer[::-1, :]
 
-    for i in range(n):
-        dups = sum_ranks = infs = 0
-
-        total_tie_count = 0
-        count = 0.0
-        for j in range(k):
-            val = values[i, j]
-            idx = argsort_indexer[i, j]
-            if keep_na and check_mask and mask[i, idx]:
-                ranks[i, idx] = NaN
-                infs += 1
-                continue
-
-            count += 1.0
-
-            sum_ranks += (j - infs) + 1
-            dups += 1
-
-            if rank_t is object:
-                condition = (
-                    j == k - 1 or
-                    are_diff(values[i, j + 1], val) or
-                    (keep_na and check_mask and mask[i, argsort_indexer[i, j + 1]])
-                )
-            else:
-                condition = (
-                    j == k - 1 or
-                    values[i, j + 1] != val or
-                    (keep_na and check_mask and mask[i, argsort_indexer[i, j + 1]])
-                )
-
-            if condition:
-                if tiebreak == TIEBREAK_AVERAGE:
-                    for z in range(j - dups + 1, j + 1):
-                        ranks[i, argsort_indexer[i, z]] = sum_ranks / dups
-                elif tiebreak == TIEBREAK_MIN:
-                    for z in range(j - dups + 1, j + 1):
-                        ranks[i, argsort_indexer[i, z]] = j - dups + 2
-                elif tiebreak == TIEBREAK_MAX:
-                    for z in range(j - dups + 1, j + 1):
-                        ranks[i, argsort_indexer[i, z]] = j + 1
-                elif tiebreak == TIEBREAK_FIRST:
-                    if rank_t is object:
-                        raise ValueError('first not supported for non-numeric data')
-                    else:
-                        for z in range(j - dups + 1, j + 1):
-                            ranks[i, argsort_indexer[i, z]] = z + 1
-                elif tiebreak == TIEBREAK_FIRST_DESCENDING:
-                    for z in range(j - dups + 1, j + 1):
-                        ranks[i, argsort_indexer[i, z]] = 2 * j - z - dups + 2
-                elif tiebreak == TIEBREAK_DENSE:
-                    total_tie_count += 1
-                    for z in range(j - dups + 1, j + 1):
-                        ranks[i, argsort_indexer[i, z]] = total_tie_count
-                sum_ranks = dups = 0
-        if pct:
-            if tiebreak == TIEBREAK_DENSE:
-                ranks[i, :] /= total_tie_count
-            else:
-                ranks[i, :] /= count
-    if axis == 0:
-        return ranks.T
+    # putmask doesn't accept a memoryview, so we assign in a separate step
+    masked_vals = values
+    with nogil:
+        for col in range(k):
+            rank_sorted_1d(
+                out[:, col],
+                grp_sizes[:, col],
+                labels,
+                sort_indexer[:, col],
+                masked_vals[:, col],
+                mask[:, col],
+                tiebreak,
+                check_mask,
+                False,
+                keep_na,
+                pct,
+                n,
+            )
+
+    if axis == 1:
+        return np.asarray(out.T)
     else:
-        return ranks
+        return np.asarray(out)
 
 
 ctypedef fused diff_t:
diff --git a/pandas/_libs/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in
@@ -219,33 +219,3 @@ def take_2d_multi_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values,
                     {{endif}}
 
 {{endfor}}
-
-# ----------------------------------------------------------------------
-# take_2d internal function
-# ----------------------------------------------------------------------
-
-ctypedef fused take_t:
-    float64_t
-    uint64_t
-    int64_t
-    object
-
-
-cdef _take_2d(ndarray[take_t, ndim=2] values, ndarray[intp_t, ndim=2] idx):
-    cdef:
-        Py_ssize_t i, j, N, K
-        ndarray[intp_t, ndim=2, cast=True] indexer = idx
-        ndarray[take_t, ndim=2] result
-
-    N, K = (<object>values).shape
-
-    if take_t is object:
-        # evaluated at compile-time
-        result = values.copy()
-    else:
-        result = np.empty_like(values)
-
-    for i in range(N):
-        for j in range(K):
-            result[i, j] = values[i, indexer[i, j]]
-    return result
diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py
@@ -246,23 +246,18 @@ def test_rank_methods_frame(self):
                     expected = DataFrame(sprank, columns=cols).astype("float64")
                     tm.assert_frame_equal(result, expected)
 
-    @td.skip_array_manager_not_yet_implemented
     @pytest.mark.parametrize("dtype", ["O", "f8", "i8"])
     @pytest.mark.filterwarnings("ignore:.*Select only valid:FutureWarning")
     def test_rank_descending(self, method, dtype):
-
         if "i" in dtype:
-            df = self.df.dropna()
+            df = self.df.dropna().astype(dtype)
         else:
             df = self.df.astype(dtype)
 
         res = df.rank(ascending=False)
         expected = (df.max() - df).rank()
         tm.assert_frame_equal(res, expected)
 
-        if method == "first" and dtype == "O":
-            return
-
         expected = (df.max() - df).rank(method=method)
 
         if dtype != "O":
@@ -287,9 +282,6 @@ def _check2d(df, expected, method="average", axis=0):
             result = df.rank(method=method, axis=axis)
             tm.assert_frame_equal(result, exp_df)
 
-        disabled = {(object, "first")}
-        if (dtype, method) in disabled:
-            return
         frame = df if dtype is None else df.astype(dtype)
         _check2d(frame, self.results[method], method=method, axis=axis)
 
@@ -456,6 +448,38 @@ def test_rank_both_inf(self):
         result = df.rank()
         tm.assert_frame_equal(result, expected)
 
+    @pytest.mark.parametrize(
+        "na_option,ascending,expected",
+        [
+            ("top", True, [3.0, 1.0, 2.0]),
+            ("top", False, [2.0, 1.0, 3.0]),
+            ("bottom", True, [2.0, 3.0, 1.0]),
+            ("bottom", False, [1.0, 3.0, 2.0]),
+        ],
+    )
+    def test_rank_inf_nans_na_option(
+        self, frame_or_series, method, na_option, ascending, expected
+    ):
+        obj = frame_or_series([np.inf, np.nan, -np.inf])
+        result = obj.rank(method=method, na_option=na_option, ascending=ascending)
+        expected = frame_or_series(expected)
+        tm.assert_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "na_option,ascending,expected",
+        [
+            ("bottom", True, [1.0, 2.0, 4.0, 3.0]),
+            ("bottom", False, [1.0, 2.0, 4.0, 3.0]),
+            ("top", True, [2.0, 3.0, 1.0, 4.0]),
+            ("top", False, [2.0, 3.0, 1.0, 4.0]),
+        ],
+    )
+    def test_rank_object_first(self, frame_or_series, na_option, ascending, expected):
+        obj = frame_or_series(["foo", "foo", None, "foo"])
+        result = obj.rank(method="first", na_option=na_option, ascending=ascending)
+        expected = frame_or_series(expected)
+        tm.assert_equal(result, expected)
+
     @pytest.mark.parametrize(
         "data,expected",
         [

Original file line number	Diff line number	Diff line change
`@@ -137,7 +137,8 @@ Timezones`
`137`	`137`
`138`	`138`	`Numeric`
`139`	`139`	`^^^^^^^`
`140`		`--`
	`140`	+- Bug in :meth:`DataFrame.rank` raising ``ValueError`` with ``object`` columns and ``method="first"`` (:issue:`41931`)
	`141`	+- Bug in :meth:`DataFrame.rank` treating missing values and extreme values as equal (for example ``np.nan`` and ``np.inf``), causing incorrect results when ``na_option="bottom"`` or ``na_option="top"`` used (:issue:`41931`)
`141`	`142`	`-`
`142`	`143`
`143`	`144`	`Conversion`