@@ -1372,26 +1372,29 @@ def rank_2d(
13721372 Fast NaN-friendly version of ``scipy.stats.rankdata``.
13731373 """
13741374 cdef:
1375- Py_ssize_t i, j, z, k, n, dups = 0 , total_tie_count = 0
1376- Py_ssize_t infs
1377- ndarray[float64_t, ndim= 2 ] ranks
1375+ Py_ssize_t k, n, col
1376+ float64_t[::1 , :] out # Column-major so columns are contiguous
1377+ int64_t[::1 , :] grp_sizes
1378+ const intp_t[:] labels
13781379 ndarray[rank_t, ndim= 2 ] values
1379- ndarray[intp_t, ndim= 2 ] argsort_indexer
1380- ndarray[uint8_t, ndim= 2 ] mask
1381- rank_t val, nan_fill_val
1382- float64_t count, sum_ranks = 0.0
1383- int tiebreak = 0
1384- int64_t idx
1385- bint check_mask, condition, keep_na, nans_rank_highest
1380+ rank_t[:, :] masked_vals
1381+ intp_t[:, :] sort_indexer
1382+ uint8_t[:, :] mask
1383+ TiebreakEnumType tiebreak
1384+ bint check_mask, keep_na, nans_rank_highest
1385+ rank_t nan_fill_val
13861386
13871387 tiebreak = tiebreakers[ties_method]
1388+ if tiebreak == TIEBREAK_FIRST:
1389+ if not ascending:
1390+ tiebreak = TIEBREAK_FIRST_DESCENDING
13881391
13891392 keep_na = na_option == ' keep'
13901393
13911394 # For cases where a mask is not possible, we can avoid mask checks
13921395 check_mask = not (rank_t is uint64_t or (rank_t is int64_t and not is_datetimelike))
13931396
1394- if axis == 0 :
1397+ if axis == 1 :
13951398 values = np.asarray(in_arr).T.copy()
13961399 else :
13971400 values = np.asarray(in_arr).copy()
@@ -1403,99 +1406,62 @@ def rank_2d(
14031406 nans_rank_highest = ascending ^ (na_option == ' top' )
14041407 if check_mask:
14051408 nan_fill_val = get_rank_nan_fill_val[rank_t](nans_rank_highest)
1409+
14061410 if rank_t is object :
1407- mask = missing.isnaobj2d(values)
1411+ mask = missing.isnaobj2d(values).view(np.uint8)
14081412 elif rank_t is float64_t:
1409- mask = np.isnan(values)
1413+ mask = np.isnan(values).view(np.uint8)
14101414
14111415 # int64 and datetimelike
14121416 else :
1413- mask = values == NPY_NAT
1414-
1417+ mask = (values == NPY_NAT).view(np.uint8)
14151418 np.putmask(values, mask, nan_fill_val)
14161419 else :
1417- mask = np.zeros_like(values, dtype = bool )
1420+ mask = np.zeros_like(values, dtype = np.uint8)
1421+
1422+ if nans_rank_highest:
1423+ order = (values, mask)
1424+ else :
1425+ order = (values, ~ np.asarray(mask))
14181426
14191427 n, k = (< object > values).shape
1420- ranks = np.empty((n, k), dtype = ' f8' )
1428+ out = np.empty((n, k), dtype = ' f8' , order = ' F' )
1429+ grp_sizes = np.ones((n, k), dtype = ' i8' , order = ' F' )
1430+ labels = np.zeros(n, dtype = np.intp)
14211431
1422- if tiebreak == TIEBREAK_FIRST:
1423- # need to use a stable sort here
1424- argsort_indexer = values.argsort(axis = 1 , kind = ' mergesort' )
1425- if not ascending:
1426- tiebreak = TIEBREAK_FIRST_DESCENDING
1432+ # lexsort is slower, so only use if we need to worry about the mask
1433+ if check_mask:
1434+ sort_indexer = np.lexsort(order, axis = 0 ).astype(np.intp, copy = False )
14271435 else :
1428- argsort_indexer = values.argsort(1 )
1436+ kind = " stable" if ties_method == " first" else None
1437+ sort_indexer = values.argsort(axis = 0 , kind = kind).astype(np.intp, copy = False )
14291438
14301439 if not ascending:
1431- argsort_indexer = argsort_indexer[:, ::- 1 ]
1432-
1433- values = _take_2d(values, argsort_indexer)
1440+ sort_indexer = sort_indexer[::- 1 , :]
14341441
1435- for i in range (n):
1436- dups = sum_ranks = infs = 0
1437-
1438- total_tie_count = 0
1439- count = 0.0
1440- for j in range (k):
1441- val = values[i, j]
1442- idx = argsort_indexer[i, j]
1443- if keep_na and check_mask and mask[i, idx]:
1444- ranks[i, idx] = NaN
1445- infs += 1
1446- continue
1447-
1448- count += 1.0
1449-
1450- sum_ranks += (j - infs) + 1
1451- dups += 1
1452-
1453- if rank_t is object :
1454- condition = (
1455- j == k - 1 or
1456- are_diff(values[i, j + 1 ], val) or
1457- (keep_na and check_mask and mask[i, argsort_indexer[i, j + 1 ]])
1458- )
1459- else :
1460- condition = (
1461- j == k - 1 or
1462- values[i, j + 1 ] != val or
1463- (keep_na and check_mask and mask[i, argsort_indexer[i, j + 1 ]])
1464- )
1465-
1466- if condition:
1467- if tiebreak == TIEBREAK_AVERAGE:
1468- for z in range (j - dups + 1 , j + 1 ):
1469- ranks[i, argsort_indexer[i, z]] = sum_ranks / dups
1470- elif tiebreak == TIEBREAK_MIN:
1471- for z in range (j - dups + 1 , j + 1 ):
1472- ranks[i, argsort_indexer[i, z]] = j - dups + 2
1473- elif tiebreak == TIEBREAK_MAX:
1474- for z in range (j - dups + 1 , j + 1 ):
1475- ranks[i, argsort_indexer[i, z]] = j + 1
1476- elif tiebreak == TIEBREAK_FIRST:
1477- if rank_t is object :
1478- raise ValueError (' first not supported for non-numeric data' )
1479- else :
1480- for z in range (j - dups + 1 , j + 1 ):
1481- ranks[i, argsort_indexer[i, z]] = z + 1
1482- elif tiebreak == TIEBREAK_FIRST_DESCENDING:
1483- for z in range (j - dups + 1 , j + 1 ):
1484- ranks[i, argsort_indexer[i, z]] = 2 * j - z - dups + 2
1485- elif tiebreak == TIEBREAK_DENSE:
1486- total_tie_count += 1
1487- for z in range (j - dups + 1 , j + 1 ):
1488- ranks[i, argsort_indexer[i, z]] = total_tie_count
1489- sum_ranks = dups = 0
1490- if pct:
1491- if tiebreak == TIEBREAK_DENSE:
1492- ranks[i, :] /= total_tie_count
1493- else :
1494- ranks[i, :] /= count
1495- if axis == 0 :
1496- return ranks.T
1442+ # putmask doesn't accept a memoryview, so we assign in a separate step
1443+ masked_vals = values
1444+ with nogil:
1445+ for col in range (k):
1446+ rank_sorted_1d(
1447+ out[:, col],
1448+ grp_sizes[:, col],
1449+ labels,
1450+ sort_indexer[:, col],
1451+ masked_vals[:, col],
1452+ mask[:, col],
1453+ tiebreak,
1454+ check_mask,
1455+ False ,
1456+ keep_na,
1457+ pct,
1458+ n,
1459+ )
1460+
1461+ if axis == 1 :
1462+ return np.asarray(out.T)
14971463 else :
1498- return ranks
1464+ return np.asarray(out)
14991465
15001466
15011467ctypedef fused diff_t:
0 commit comments