From 230c67be0fd5a6781ec4f8d71f8374dd37e3868c Mon Sep 17 00:00:00 2001 From: Siddhesh Thakur Date: Tue, 10 Jun 2025 13:51:12 -0400 Subject: [PATCH] Update ranker.py fixing bug for extremes --- pyranker/ranker.py | 44 +++++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/pyranker/ranker.py b/pyranker/ranker.py index 355ce4d..7b28354 100644 --- a/pyranker/ranker.py +++ b/pyranker/ranker.py @@ -136,45 +136,47 @@ def perform_permutation_test(self) -> None: ) for i in tqdm(range(n_methods), desc="Permutation test"): - # calculate for unique pairs (i < j) for j in range(i + 1, n_methods): # get the ranks for the two methods - method_i = ranks_per_metric_sanitized.iloc[i] - method_j = ranks_per_metric_sanitized.iloc[j] - arr_i = method_i.to_numpy() - arr_j = method_j.to_numpy() - # # calculate the difference in ranks - diff_ranks = arr_i.sum() - arr_j.sum() - # initialize an array to store the differences - diff_greater = np.zeros(self.n_iterations) + arr_i = ranks_per_metric_sanitized.iloc[i].to_numpy() + arr_j = ranks_per_metric_sanitized.iloc[j].to_numpy() + + # BUG FIX: Use the absolute difference for a two-sided test + observed_diff = abs(arr_i.sum() - arr_j.sum()) + + count_extreme = 0 # perform the permutation test for it in range(self.n_iterations): - # generate a random permutation - r = np.random.randint(0, 2, arr_i.shape) + # generate a random permutation mask + r = np.random.randint(0, 2, size=arr_i.shape, dtype=bool) # create a copy of the ranks arr1_rand = arr_i.copy() arr2_rand = arr_j.copy() # swap the ranks based on the random permutation - arr1_rand[r == 1], arr2_rand[r == 1] = ( - arr_j[r == 1], - arr_i[r == 1], - ) + # Note: Using boolean indexing is cleaner and often faster + arr1_rand[r], arr2_rand[r] = arr_j[r], arr_i[r] # calculate the difference in ranks for the random permutation - diff_ranks_rand = arr1_rand.sum() - arr2_rand.sum() - # store the difference if it is greater than the actual difference - if diff_ranks_rand < diff_ranks: - diff_greater[it] = 1 + permuted_diff = abs(arr1_rand.sum() - arr2_rand.sum()) + + # BUG FIX: Check if the permuted difference is as or more extreme + if permuted_diff >= observed_diff: + count_extreme += 1 # calculate the p-value - self.pvals[i, j] = diff_greater.sum() / self.n_iterations + pval = count_extreme / self.n_iterations + self.pvals[i, j] = pval + # The p-value is symmetric + self.pvals[j, i] = pval # create a dataframe from the pvals self.pvals_df = pd.DataFrame( - self.pvals, columns=self.ranks_per_metric["method"] + self.pvals, + columns=self.ranks_per_metric["method"], + index=self.ranks_per_metric["method"], ) self.pvals_df["method"] = self.ranks_per_metric["method"].tolist() self.pvals_df = self.pvals_df.set_index("method")