From 230c67be0fd5a6781ec4f8d71f8374dd37e3868c Mon Sep 17 00:00:00 2001
From: Siddhesh Thakur <sid.cre8er@gmail.com>
Date: Tue, 10 Jun 2025 13:51:12 -0400
Subject: [PATCH 01/11] Update ranker.py fixing bug for extremes

---
 pyranker/ranker.py | 44 +++++++++++++++++++++++---------------------
 1 file changed, 23 insertions(+), 21 deletions(-)

diff --git a/pyranker/ranker.py b/pyranker/ranker.py
index 355ce4d..7b28354 100644
--- a/pyranker/ranker.py
+++ b/pyranker/ranker.py
@@ -136,45 +136,47 @@ def perform_permutation_test(self) -> None:
         )
 
         for i in tqdm(range(n_methods), desc="Permutation test"):
-            # calculate for unique pairs (i < j)
             for j in range(i + 1, n_methods):
                 # get the ranks for the two methods
-                method_i = ranks_per_metric_sanitized.iloc[i]
-                method_j = ranks_per_metric_sanitized.iloc[j]
-                arr_i = method_i.to_numpy()
-                arr_j = method_j.to_numpy()
-                # # calculate the difference in ranks
-                diff_ranks = arr_i.sum() - arr_j.sum()
-                # initialize an array to store the differences
-                diff_greater = np.zeros(self.n_iterations)
+                arr_i = ranks_per_metric_sanitized.iloc[i].to_numpy()
+                arr_j = ranks_per_metric_sanitized.iloc[j].to_numpy()
+                
+                # BUG FIX: Use the absolute difference for a two-sided test
+                observed_diff = abs(arr_i.sum() - arr_j.sum())
+                
+                count_extreme = 0
 
                 # perform the permutation test
                 for it in range(self.n_iterations):
-                    # generate a random permutation
-                    r = np.random.randint(0, 2, arr_i.shape)
+                    # generate a random permutation mask
+                    r = np.random.randint(0, 2, size=arr_i.shape, dtype=bool)
 
                     # create a copy of the ranks
                     arr1_rand = arr_i.copy()
                     arr2_rand = arr_j.copy()
 
                     # swap the ranks based on the random permutation
-                    arr1_rand[r == 1], arr2_rand[r == 1] = (
-                        arr_j[r == 1],
-                        arr_i[r == 1],
-                    )
+                    # Note: Using boolean indexing is cleaner and often faster
+                    arr1_rand[r], arr2_rand[r] = arr_j[r], arr_i[r]
 
                     # calculate the difference in ranks for the random permutation
-                    diff_ranks_rand = arr1_rand.sum() - arr2_rand.sum()
-                    # store the difference if it is greater than the actual difference
-                    if diff_ranks_rand < diff_ranks:
-                        diff_greater[it] = 1
+                    permuted_diff = abs(arr1_rand.sum() - arr2_rand.sum())
+                    
+                    # BUG FIX: Check if the permuted difference is as or more extreme
+                    if permuted_diff >= observed_diff:
+                        count_extreme += 1
 
                 # calculate the p-value
-                self.pvals[i, j] = diff_greater.sum() / self.n_iterations
+                pval = count_extreme / self.n_iterations
+                self.pvals[i, j] = pval
+                # The p-value is symmetric
+                self.pvals[j, i] = pval
 
         # create a dataframe from the pvals
         self.pvals_df = pd.DataFrame(
-            self.pvals, columns=self.ranks_per_metric["method"]
+            self.pvals,
+            columns=self.ranks_per_metric["method"],
+            index=self.ranks_per_metric["method"],
         )
         self.pvals_df["method"] = self.ranks_per_metric["method"].tolist()
         self.pvals_df = self.pvals_df.set_index("method")

From 8daf191eeb9f9c99830c73fa3600849d47269b8e Mon Sep 17 00:00:00 2001
From: Siddhesh Thakur <sid.cre8er@gmail.com>
Date: Fri, 15 Aug 2025 17:02:58 -0400
Subject: [PATCH 02/11] Rank metrics per CSV and return rankings/p-values sorted by final rank

---
 pyranker/ranker.py | 37 +++++++++++++++++++++++++------------
 1 file changed, 25 insertions(+), 12 deletions(-)

diff --git a/pyranker/ranker.py b/pyranker/ranker.py
index 6bfc27e..7cca551 100644
--- a/pyranker/ranker.py
+++ b/pyranker/ranker.py
@@ -1,6 +1,7 @@
 from typing import Dict, List, Tuple
-import pandas as pd
+
 import numpy as np
+import pandas as pd
 from tqdm import tqdm
 
 
@@ -28,6 +29,7 @@ def __init__(
 
         # dict of lists with the key being ${metric}_${subjectid} and value being the list of scores
         self.combined_scores_per_subject = {}
+        self.participant_ranks = {}
 
         self.combine_csvs_and_scores()
 
@@ -55,6 +57,12 @@ def combine_csvs_and_scores(self) -> None:
             metrics_columns.sort()
             current_df = current_df[["subjectid"] + metrics_columns]
 
+            # calculate ranks for each metric
+            for metric in metrics_columns:
+                current_df[metric] = current_df[metric].rank(
+                    method=self.ranking_method, ascending=False
+                )
+
             # convert to a single row df with unique column names based on subjectid column
             current_df_flattened = {"method": method}
             for _, row in current_df.iterrows():
@@ -74,11 +82,11 @@ def combine_csvs_and_scores(self) -> None:
 
     def rank_methods(self) -> None:
         """
-        Rank the methods based on the metrics.
+        Rank the methods based on each metric and calculate the combined final rank.
         """
         # calculate rank per metric
         self.ranks_per_metric = self.metrics_per_subject.rank(
-            method=self.ranking_method, ascending=False, numeric_only=True
+            method=self.ranking_method, ascending=True, numeric_only=True
         )
 
         # ensure all metrics are lowercase to avoid case sensitivity
@@ -94,10 +102,8 @@ def rank_methods(self) -> None:
                         - self.ranks_per_metric[column]
                     )
 
-        # calculate cumulative rank by summing the ranks of all metrics and dividing by the number of metrics
-        cumulative_rank_column = self.ranks_per_metric.sum(axis=1) / len(
-            self.ranks_per_metric.columns
-        )
+        # calculate cumulative rank by summing the ranks of all metrics
+        cumulative_rank_column = self.ranks_per_metric.sum(axis=1)
         final_rank_column = cumulative_rank_column.rank(
             method="average", ascending=True
         )
@@ -145,10 +151,10 @@ def perform_permutation_test(self) -> None:
                 # get the ranks for the two methods
                 arr_i = ranks_per_metric_sanitized.iloc[i].to_numpy()
                 arr_j = ranks_per_metric_sanitized.iloc[j].to_numpy()
-                
+
                 # BUG FIX: Use the absolute difference for a two-sided test
                 observed_diff = abs(arr_i.sum() - arr_j.sum())
-                
+
                 count_extreme = 0
 
                 # perform the permutation test
@@ -166,14 +172,14 @@ def perform_permutation_test(self) -> None:
 
                     # calculate the difference in ranks for the random permutation
                     permuted_diff = abs(arr1_rand.sum() - arr2_rand.sum())
-                    
+
                     # BUG FIX: Check if the permuted difference is as or more extreme
                     if permuted_diff >= observed_diff:
                         count_extreme += 1
 
                 # Check if count_extreme is still zero, which would create pval=0.
                 # A p-value of 0 implies absolute certainty, which is unrealistic given the finite
-                # number of permutations. To avoid this, we adjust count_extreme to ensure a 
+                # number of permutations. To avoid this, we adjust count_extreme to ensure a
                 # conservative estimate of the p-value, aligning with standard statistical practices.
                 if count_extreme == 0:
                     count_extreme += 1
@@ -199,4 +205,11 @@ def get_rankings_and_pvals(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
         Returns:
             Tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the rankings and p-values dataframes.
         """
-        return self.ranks_per_metric, self.pvals_df
+        # sort ranks by final_rank
+        ranks_df = self.ranks_per_metric.sort_values(by="final_rank").reset_index(
+            drop=True
+        )
+        # sort pvals by the same order as ranks
+        pvals_df = self.pvals_df.reindex(index=ranks_df["method"], columns=ranks_df["method"])
+
+        return ranks_df, pvals_df

From 0fec488adbc49c352a222e3e366e64eb9b2b85ef Mon Sep 17 00:00:00 2001
From: Siddhesh Thakur <sid.cre8er@gmail.com>
Date: Mon, 18 Aug 2025 13:41:50 -0400
Subject: [PATCH 03/11] formatting

---
 pyranker/ranker.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pyranker/ranker.py b/pyranker/ranker.py
index 7cca551..5fa4ad1 100644
--- a/pyranker/ranker.py
+++ b/pyranker/ranker.py
@@ -210,6 +210,8 @@ def get_rankings_and_pvals(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
             drop=True
         )
         # sort pvals by the same order as ranks
-        pvals_df = self.pvals_df.reindex(index=ranks_df["method"], columns=ranks_df["method"])
+        pvals_df = self.pvals_df.reindex(
+            index=ranks_df["method"], columns=ranks_df["method"]
+        )
 
         return ranks_df, pvals_df

From 46d788ec14e1de8ec35293c93a6115296a0f9eed Mon Sep 17 00:00:00 2001
From: Siddhesh Thakur <sid.cre8er@gmail.com>
Date: Mon, 18 Aug 2025 15:02:50 -0400
Subject: [PATCH 04/11] Parallelize permutation test with ProcessPoolExecutor; add --n-jobs option

---
 pyranker/cli/run.py |  23 +++++-
 pyranker/ranker.py  | 176 +++++++++++++++++++++++++-------------------
 2 files changed, 121 insertions(+), 78 deletions(-)

diff --git a/pyranker/cli/run.py b/pyranker/cli/run.py
index 5d11c58..244f92a 100644
--- a/pyranker/cli/run.py
+++ b/pyranker/cli/run.py
@@ -1,3 +1,4 @@
+import os
 from pathlib import Path
 from typing import Optional
 
@@ -168,7 +169,7 @@ def main(
             "--iterations",
             help="The number of iterations to perform for the permutation test.",
         ),
-    ] = 1000,
+    ] = 100000,
     ranking_method: Annotated[
         str,
         typer.Option(
@@ -177,6 +178,14 @@ def main(
             help="The method to use for ranking the methods; one of 'average', 'min', 'max', 'first', 'dense'.",
         ),
     ] = "average",
+    n_jobs: Annotated[
+        int,
+        typer.Option(
+            "-j",
+            "--n-jobs",
+            help="The number of CPU cores to use for parallel processing.",
+        ),
+    ] = 4,
     version: Annotated[
         Optional[bool],
         typer.Option(
@@ -195,9 +204,9 @@ def main(
     csvs_to_compare_with_full_path = get_csv_paths(input)
 
     # basic sanity checks
-    assert (
-        len(csvs_to_compare_with_full_path) > 1
-    ), "At least two methods are required for comparison"
+    assert len(csvs_to_compare_with_full_path) > 1, (
+        "At least two methods are required for comparison"
+    )
     ranking_method = ranking_method.lower()
     assert ranking_method in [
         "average",
@@ -208,6 +217,11 @@ def main(
     ], "Invalid ranking method"
     assert iterations > 0, "Number of iterations must be greater than 0"
 
+    # Assert that the number of jobs is not greater than the number of cores
+    assert n_jobs <= os.cpu_count(), (
+        "Number of jobs cannot be greater than the number of cores"
+    )
+
     # convert the metrics_for_reversal to a list
     metrics_for_reversal_list = (
         metrics_for_reversal.split(",") if metrics_for_reversal else []
@@ -227,6 +241,7 @@ def main(
         metrics_for_reversal=metrics_for_reversal_list,
         n_iterations=iterations,
         ranking_method=ranking_method,
+        n_jobs=n_jobs,
     )
     ranks, pvals = ranker.get_rankings_and_pvals()
     Path(outputdir).mkdir(parents=True, exist_ok=True)
diff --git a/pyranker/ranker.py b/pyranker/ranker.py
index 5fa4ad1..bf1a58f 100644
--- a/pyranker/ranker.py
+++ b/pyranker/ranker.py
@@ -1,3 +1,5 @@
+import os
+from concurrent.futures import ProcessPoolExecutor, as_completed
 from typing import Dict, List, Tuple
 
 import numpy as np
@@ -5,6 +7,57 @@
 from tqdm import tqdm
 
 
+# This worker function is defined at the top level so it can be pickled
+# and sent to other processes by the ProcessPoolExecutor.
+def _calculate_pval_for_pair(
+    arr_i: np.ndarray, arr_j: np.ndarray, n_iterations: int
+) -> float:
+    """
+    Performs the permutation test for a single pair of rank arrays.
+
+    Args:
+        arr_i (np.ndarray): Rank array for the first method.
+        arr_j (np.ndarray): Rank array for the second method.
+        n_iterations (int): The number of permutation iterations.
+
+    Returns:
+        float: The calculated p-value.
+    """
+    # Use the absolute difference for a two-sided test
+    observed_diff = abs(arr_i.sum() - arr_j.sum())
+    count_extreme = 0
+
+    # Create a local random number generator for thread-safety
+    rng = np.random.default_rng()
+
+    # Perform the permutation test
+    for _ in range(n_iterations):
+        # Generate a random permutation mask
+        r = rng.integers(0, 2, size=arr_i.shape, dtype=bool)
+
+        # Create a copy of the ranks
+        arr1_rand = arr_i.copy()
+        arr2_rand = arr_j.copy()
+
+        # Swap the ranks based on the random permutation
+        arr1_rand[r], arr2_rand[r] = arr_j[r], arr_i[r]
+
+        # Calculate the difference in ranks for the random permutation
+        permuted_diff = abs(arr1_rand.sum() - arr2_rand.sum())
+
+        # Check if the permuted difference is as or more extreme
+        if permuted_diff >= observed_diff:
+            count_extreme += 1
+
+    # To avoid p-values of 0, which implies absolute certainty, we add 1
+    # if no permuted difference was more extreme than the observed one.
+    if count_extreme == 0:
+        count_extreme = 1
+
+    pval = count_extreme / n_iterations
+    return pval
+
+
 class Ranker:
     def __init__(
         self,
@@ -12,6 +65,7 @@ def __init__(
         metrics_for_reversal: List[str],
         n_iterations: int = 1000,
         ranking_method: str = "average",
+        n_jobs: int = 4,  # New argument for number of CPU cores
     ) -> None:
         """
         Ranker class to compare the scores of different methods.
@@ -21,13 +75,19 @@ def __init__(
             metrics_for_reversal (List[str]): The metrics for which the reversal should be calculated.
             n_iterations (int): The number of iterations to perform for the permutation test.
             ranking_method (str): The method to use for ranking the methods.
+            n_jobs (int): The number of CPU cores to use for parallel processing. Defaults to 4.
+                          Set to -1 to use all available cores.
         """
         self.input_csvs_to_compare = input_csvs_to_compare
         self.metrics_for_reversal = metrics_for_reversal
         self.n_iterations = n_iterations
         self.ranking_method = ranking_method
 
-        # dict of lists with the key being ${metric}_${subjectid} and value being the list of scores
+        if n_jobs == -1:
+            self.n_jobs = os.cpu_count()
+        else:
+            self.n_jobs = n_jobs
+
         self.combined_scores_per_subject = {}
         self.participant_ranks = {}
 
@@ -38,32 +98,24 @@ def combine_csvs_and_scores(self) -> None:
         Combine the CSVs and scores of the methods.
         """
         self.combined_scores_per_subject["method"] = []
-
-        # create a dataframe to store the metrics per subject
         self.metrics_per_subject = pd.DataFrame()
 
         for method in self.input_csvs_to_compare:
             self.combined_scores_per_subject["method"].append(method)
             current_df = pd.read_csv(self.input_csvs_to_compare[method])
-            # ensure all columns are lowercase to avoid case sensitivity
             current_df.columns = current_df.columns.str.lower()
-
-            # sort along subjectid column to ensure that metrics are in the same order
             current_df = current_df.sort_values(by="subjectid")
 
-            # sort metrics columns to ensure that metrics are in the same order
-            metrics_columns = current_df.columns.tolist()
-            metrics_columns.remove("subjectid")
-            metrics_columns.sort()
+            metrics_columns = sorted(
+                [col for col in current_df.columns if col != "subjectid"]
+            )
             current_df = current_df[["subjectid"] + metrics_columns]
 
-            # calculate ranks for each metric
             for metric in metrics_columns:
                 current_df[metric] = current_df[metric].rank(
                     method=self.ranking_method, ascending=False
                 )
 
-            # convert to a single row df with unique column names based on subjectid column
             current_df_flattened = {"method": method}
             for _, row in current_df.iterrows():
                 for metric in current_df.columns:
@@ -72,10 +124,11 @@ def combine_csvs_and_scores(self) -> None:
                             metric
                         ]
 
-            # convert to a dataframe and append to the metrics_per_subject dataframe
             current_df_flattened = pd.DataFrame(current_df_flattened, index=[0])
             self.metrics_per_subject = pd.concat(
-                [self.metrics_per_subject, current_df_flattened], axis=0
+                [self.metrics_per_subject, current_df_flattened],
+                axis=0,
+                ignore_index=True,
             )
 
         self.rank_methods()
@@ -84,15 +137,12 @@ def rank_methods(self) -> None:
         """
         Rank the methods based on each metric and calculate the combined final rank.
         """
-        # calculate rank per metric
         self.ranks_per_metric = self.metrics_per_subject.rank(
             method=self.ranking_method, ascending=True, numeric_only=True
         )
 
-        # ensure all metrics are lowercase to avoid case sensitivity
         metrics_for_reversal_lower = [x.lower() for x in self.metrics_for_reversal]
 
-        # reverse the ranks for the metrics that need reversal
         for metric in metrics_for_reversal_lower:
             for column in self.ranks_per_metric.columns:
                 if metric in column:
@@ -102,12 +152,11 @@ def rank_methods(self) -> None:
                         - self.ranks_per_metric[column]
                     )
 
-        # calculate cumulative rank by summing the ranks of all metrics
         cumulative_rank_column = self.ranks_per_metric.sum(axis=1)
         final_rank_column = cumulative_rank_column.rank(
             method="average", ascending=True
         )
-        # combine cumulative_rank_column, final_rank_column, and method column to the ranks_per_metric dataframe
+
         self.ranks_per_metric = pd.concat(
             [
                 self.ranks_per_metric,
@@ -118,7 +167,6 @@ def rank_methods(self) -> None:
             axis=1,
         )
 
-        # reorder columns to put method, final_rank, cumulative_rank in the beginning
         self.ranks_per_metric = self.ranks_per_metric[
             ["method", "final_rank", "cumulative_rank"]
             + [
@@ -132,12 +180,11 @@ def rank_methods(self) -> None:
 
     def perform_permutation_test(self) -> None:
         """
-        Perform permutation test to determine the significance of the ranks.
+        Perform permutation test in parallel to determine the significance of the ranks.
         """
         n_methods = len(self.ranks_per_metric)
         self.pvals = np.zeros((n_methods, n_methods))
 
-        # sort in order of cumulative rank and reset index in one step
         ranks_per_metric_sorted = self.ranks_per_metric.sort_values(
             by="cumulative_rank"
         ).reset_index(drop=True)
@@ -146,72 +193,53 @@ def perform_permutation_test(self) -> None:
             columns=["method", "cumulative_rank", "final_rank"]
         )
 
-        for i in tqdm(range(n_methods), desc="Permutation test"):
-            for j in range(i + 1, n_methods):
-                # get the ranks for the two methods
-                arr_i = ranks_per_metric_sanitized.iloc[i].to_numpy()
-                arr_j = ranks_per_metric_sanitized.iloc[j].to_numpy()
-
-                # BUG FIX: Use the absolute difference for a two-sided test
-                observed_diff = abs(arr_i.sum() - arr_j.sum())
-
-                count_extreme = 0
-
-                # perform the permutation test
-                for it in range(self.n_iterations):
-                    # generate a random permutation mask
-                    r = np.random.randint(0, 2, size=arr_i.shape, dtype=bool)
-
-                    # create a copy of the ranks
-                    arr1_rand = arr_i.copy()
-                    arr2_rand = arr_j.copy()
-
-                    # swap the ranks based on the random permutation
-                    # Note: Using boolean indexing is cleaner and often faster
-                    arr1_rand[r], arr2_rand[r] = arr_j[r], arr_i[r]
-
-                    # calculate the difference in ranks for the random permutation
-                    permuted_diff = abs(arr1_rand.sum() - arr2_rand.sum())
-
-                    # BUG FIX: Check if the permuted difference is as or more extreme
-                    if permuted_diff >= observed_diff:
-                        count_extreme += 1
-
-                # Check if count_extreme is still zero, which would create pval=0.
-                # A p-value of 0 implies absolute certainty, which is unrealistic given the finite
-                # number of permutations. To avoid this, we adjust count_extreme to ensure a
-                # conservative estimate of the p-value, aligning with standard statistical practices.
-                if count_extreme == 0:
-                    count_extreme += 1
-                # calculate the p-value
-                pval = count_extreme / self.n_iterations
-                self.pvals[i, j] = pval
-                # The p-value is symmetric
-                self.pvals[j, i] = pval
-
-        # create a dataframe from the pvals
+        # Use ProcessPoolExecutor for parallel processing
+        with ProcessPoolExecutor(max_workers=self.n_jobs) as executor:
+            # A dictionary to map future objects to their matrix indices (i, j)
+            future_to_indices = {}
+            # Submit all pairs to the executor
+            for i in range(n_methods):
+                for j in range(i + 1, n_methods):
+                    arr_i = ranks_per_metric_sanitized.iloc[i].to_numpy()
+                    arr_j = ranks_per_metric_sanitized.iloc[j].to_numpy()
+                    # Submit the worker function with its arguments
+                    future = executor.submit(
+                        _calculate_pval_for_pair, arr_i, arr_j, self.n_iterations
+                    )
+                    future_to_indices[future] = (i, j)
+
+            # Create a progress bar that updates as tasks are completed
+            pbar = tqdm(
+                as_completed(future_to_indices),
+                total=len(future_to_indices),
+                desc="Permutation test",
+            )
+            for future in pbar:
+                i, j = future_to_indices[future]
+                try:
+                    pval = future.result()
+                    self.pvals[i, j] = pval
+                    self.pvals[j, i] = pval  # p-value is symmetric
+                except Exception as exc:
+                    print(f"Pair ({i}, {j}) generated an exception: {exc}")
+
         self.pvals_df = pd.DataFrame(
             self.pvals,
-            columns=self.ranks_per_metric["method"],
-            index=self.ranks_per_metric["method"],
+            columns=ranks_per_metric_sorted["method"],
+            index=ranks_per_metric_sorted["method"],
         )
-        self.pvals_df["method"] = self.ranks_per_metric["method"].tolist()
-        self.pvals_df = self.pvals_df.set_index("method")
 
     def get_rankings_and_pvals(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
         """
-        Get the rankings of the methods.
+        Get the final rankings and p-values.
 
         Returns:
             Tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the rankings and p-values dataframes.
         """
-        # sort ranks by final_rank
         ranks_df = self.ranks_per_metric.sort_values(by="final_rank").reset_index(
             drop=True
         )
-        # sort pvals by the same order as ranks
         pvals_df = self.pvals_df.reindex(
             index=ranks_df["method"], columns=ranks_df["method"]
         )
-
         return ranks_df, pvals_df

From fda95595c64d478854faf2510bcff0ae8388e322 Mon Sep 17 00:00:00 2001
From: Siddhesh Thakur <sid.cre8er@gmail.com>
Date: Mon, 18 Aug 2025 18:10:19 -0400
Subject: [PATCH 05/11] Use signed rank-sum difference (one-sided test) in permutation test

---
 pyranker/ranker.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pyranker/ranker.py b/pyranker/ranker.py
index bf1a58f..b801c23 100644
--- a/pyranker/ranker.py
+++ b/pyranker/ranker.py
@@ -23,8 +23,8 @@ def _calculate_pval_for_pair(
     Returns:
         float: The calculated p-value.
     """
-    # Use the absolute difference for a two-sided test
-    observed_diff = abs(arr_i.sum() - arr_j.sum())
+    # Use the difference for a one-sided test
+    observed_diff = arr_i.sum() - arr_j.sum()
     count_extreme = 0
 
     # Create a local random number generator for thread-safety
@@ -43,7 +43,7 @@ def _calculate_pval_for_pair(
         arr1_rand[r], arr2_rand[r] = arr_j[r], arr_i[r]
 
         # Calculate the difference in ranks for the random permutation
-        permuted_diff = abs(arr1_rand.sum() - arr2_rand.sum())
+        permuted_diff = arr1_rand.sum() - arr2_rand.sum()
 
         # Check if the permuted difference is as or more extreme
         if permuted_diff >= observed_diff:

From 244d54a51f47a0090c33ad04ab6c55af73872b29 Mon Sep 17 00:00:00 2001
From: Siddhesh Thakur <sid.cre8er@gmail.com>
Date: Tue, 19 Aug 2025 11:20:09 -0400
Subject: [PATCH 06/11] need to fix the tests

---
 data/m1.csv                | 14 ++------
 data/m2.csv                | 14 ++------
 data/m3.csv                | 14 ++------
 data/m4.csv                | 14 ++------
 data/temp_output/pvals.csv |  5 +++
 data/temp_output/ranks.csv |  5 +++
 pyranker/cli/run.py        | 30 +++++++++++++++++
 pyranker/ranker.py         | 17 +++-------
 tests_full.py              | 68 ++++++++++++++++++++++++++++++++++----
 9 files changed, 118 insertions(+), 63 deletions(-)
 create mode 100644 data/temp_output/pvals.csv
 create mode 100644 data/temp_output/ranks.csv

diff --git a/data/m1.csv b/data/m1.csv
index e23342c..1dbbb7c 100644
--- a/data/m1.csv
+++ b/data/m1.csv
@@ -1,11 +1,3 @@
-SubjectID,A,B,C,D,E,F
-s001,-0.676662165,-1.406645477,0.736895876,-0.174272834,0.576927715,-0.232139845
-s002,-1.182135526,0.325161174,1.265839829,0.637533468,0.717606195,-0.232249719
-s003,0.393762147,-1.366917238,-1.974747205,-2.029359097,-0.91486706,-0.110356815
-s004,-0.560421215,-0.916606755,-0.244361005,0.173264029,-0.018263561,-1.112137106
-s005,-0.018074945,0.909978883,0.654103198,-0.412681032,0.415519864,0.415147598
-s006,0.584884843,-0.365552063,-0.125284377,0.420532768,1.048717925,-0.520722918
-s007,0.246445503,0.018436118,0.540072217,-0.059316335,-1.102092291,0.446401257
-s008,-0.78842192,-0.634175082,0.312935264,0.272096895,-0.151559698,-2.457860693
-s009,0.134775369,-0.241349035,0.711768614,-0.387514653,0.090663752,0.71284279
-s010,-0.96395775,-0.663571103,0.838443773,-0.933803671,-0.722117911,-0.189414521
+subjectid,A,B,C,D,E,F
+s1,1,2,3,4,5,6
+s2,7,8,9,10,11,12
\ No newline at end of file
diff --git a/data/m2.csv b/data/m2.csv
index 059996b..e93738f 100644
--- a/data/m2.csv
+++ b/data/m2.csv
@@ -1,11 +1,3 @@
-SubjectID,A,B,C,D,E,F
-s001,-0.371449174,0.956404946,-0.959452443,-0.309927689,0.905046916,0.819083005
-s002,0.935687942,0.109916076,-0.689643721,1.068025385,-1.154739305,-0.462448565
-s003,-0.049420815,0.64668578,-0.318198107,0.724407035,0.583641064,-0.704724761
-s004,-1.49698864,1.249697716,0.04787162,0.188726789,-0.819034985,-0.179096185
-s005,2.136690703,-0.868203102,-0.78604478,0.855744592,0.857935164,0.492256653
-s006,-0.355118237,0.517377129,0.928951769,0.792176927,-0.805270336,1.117546966
-s007,-0.778346825,1.683369425,-0.443459427,-0.593956209,4.0971389,-0.445679171
-s008,0.267208376,0.184556657,0.323158227,2.282268373,1.364794637,0.181174591
-s009,-0.386538967,-0.916456619,1.271967332,-0.052378684,-1.205062795,-0.626923254
-s010,0.435225064,0.91151586,-1.113652003,-0.220028617,-1.05347926,0.365272475
+subjectid,A,B,C,D,E,F
+s1,2,3,4,5,6,7
+s2,8,9,10,11,12,13
\ No newline at end of file
diff --git a/data/m3.csv b/data/m3.csv
index 9bf468c..d60cbd7 100644
--- a/data/m3.csv
+++ b/data/m3.csv
@@ -1,11 +1,3 @@
-SubjectID,A,B,C,D,E,F
-s001,-0.495294073,0.949116249,0.296072803,1.868387862,-0.272883702,-1.818801645
-s002,1.216439744,0.197072557,-0.081120879,1.469343652,2.263823391,0.181492295
-s003,-0.155607109,0.337023954,-0.458342088,-1.031167585,0.218811382,0.148051802
-s004,-1.209131999,-0.096524866,1.197362593,-0.062309653,-0.658751113,-0.262658666
-s005,0.645690766,0.899682779,-1.202114635,-0.452507338,0.178007526,-0.526872668
-s006,-0.527395342,-0.585397127,0.601057827,-0.438992879,9.23E-05,2.411401279
-s007,-0.781069044,-0.651766877,-0.003398167,-0.254586911,-0.048605563,1.6079838
-s008,-0.005850292,1.152494476,1.064747549,-0.227608884,1.45054756,1.422734322
-s009,0.796185038,-1.295533863,-0.007947827,0.624035116,-0.605764923,-0.856374829
-s010,0.952854212,-1.007389474,0.686420686,1.377020745,1.221967627,-0.120206896
+subjectid,A,B,C,D,E,F
+s1,3,4,5,6,7,8
+s2,9,10,11,12,13,14
\ No newline at end of file
diff --git a/data/m4.csv b/data/m4.csv
index 36e69eb..05cc7a1 100644
--- a/data/m4.csv
+++ b/data/m4.csv
@@ -1,11 +1,3 @@
-SubjectID,A,B,C,D,E,F
-s001,0.127830235,0.543904483,0.169190618,-0.849953283,-0.563713316,0.736931479
-s002,0.567418525,0.965856382,1.266015552,0.471422651,-0.758025824,-0.427404497
-s003,-1.221693479,-1.121073154,-1.677648371,2.016433719,-0.087967121,-0.472855621
-s004,0.954423388,-0.093452563,0.659446581,-0.190049419,-0.921771701,0.090774055
-s005,0.950052283,-0.621810664,0.254520025,0.360940315,-0.483358752,-0.935151931
-s006,1.455226207,-0.721900186,0.801810726,-0.641529199,0.563422873,0.772440661
-s007,-1.053644931,0.098930728,0.999364504,1.029298347,-0.632529862,-1.666171306
-s008,-0.671755474,0.389256225,0.697323813,-0.483432377,0.073658468,-0.233170802
-s009,0.059997347,0.583152369,-1.371183183,-0.528158479,0.435198404,0.705164885
-s010,-0.458500476,-1.526985622,0.370253517,0.844777527,-0.500950386,0.75340932
+subjectid,A,B,C,D,E,F
+s1,4,5,6,7,8,9
+s2,10,11,12,13,14,15
\ No newline at end of file
diff --git a/data/temp_output/pvals.csv b/data/temp_output/pvals.csv
new file mode 100644
index 0000000..14915f0
--- /dev/null
+++ b/data/temp_output/pvals.csv
@@ -0,0 +1,5 @@
+method,m1,m2,m3,m4
+m1,0.0,1.0,1.0,1.0
+m2,1.0,0.0,1.0,1.0
+m3,1.0,1.0,0.0,1.0
+m4,1.0,1.0,1.0,0.0
diff --git a/data/temp_output/ranks.csv b/data/temp_output/ranks.csv
new file mode 100644
index 0000000..e34367f
--- /dev/null
+++ b/data/temp_output/ranks.csv
@@ -0,0 +1,5 @@
+method,final_rank,cumulative_rank,average_rank_s1,average_rank_s2
+m1,2.5,5.0,2.5,2.5
+m2,2.5,5.0,2.5,2.5
+m3,2.5,5.0,2.5,2.5
+m4,2.5,5.0,2.5,2.5
diff --git a/pyranker/cli/run.py b/pyranker/cli/run.py
index 244f92a..9d2f081 100644
--- a/pyranker/cli/run.py
+++ b/pyranker/cli/run.py
@@ -162,6 +162,20 @@ def main(
             help="The comma-separated metric columns for which the reversal should be calculated; for example, 'hausdorff_tc,hausdorff_et'.",
         ),
     ] = "",
+    metric_to_use: Annotated[
+        str,
+        typer.Option(
+            "--metric-to-use",
+            help="The comma-separated metric columns to use for ranking.",
+        ),
+    ] = "",
+    weight: Annotated[
+        str,
+        typer.Option(
+            "--weight",
+            help="The comma-separated weights for the metrics.",
+        ),
+    ] = "",
     iterations: Annotated[
         int,
         typer.Option(
@@ -227,6 +241,21 @@ def main(
         metrics_for_reversal.split(",") if metrics_for_reversal else []
     )
 
+    metric_weights = {}
+    if metric_to_use and weight:
+        metrics = [m.strip() for m in metric_to_use.split(",")]
+        weights = [float(w.strip()) for w in weight.split(",")]
+        assert len(metrics) == len(weights), (
+            "Number of metrics and weights must be the same."
+        )
+
+        total_weight = sum(weights)
+        metric_weights = {
+            metric.lower().strip(): w / total_weight
+            for metric, w in zip(metrics, weights)
+        }
+        print(f"Using weighted ranking with metrics: {metric_weights}")
+
     num_subjects, num_metrics = validate_csvs(csvs_to_compare_with_full_path)
 
     # print the summary of the input files
@@ -242,6 +271,7 @@ def main(
         n_iterations=iterations,
         ranking_method=ranking_method,
         n_jobs=n_jobs,
+        metric_weights=metric_weights,
     )
     ranks, pvals = ranker.get_rankings_and_pvals()
     Path(outputdir).mkdir(parents=True, exist_ok=True)
diff --git a/pyranker/ranker.py b/pyranker/ranker.py
index b801c23..7bba987 100644
--- a/pyranker/ranker.py
+++ b/pyranker/ranker.py
@@ -65,7 +65,8 @@ def __init__(
         metrics_for_reversal: List[str],
         n_iterations: int = 1000,
         ranking_method: str = "average",
-        n_jobs: int = 4,  # New argument for number of CPU cores
+        n_jobs: int = 4,
+        metric_weights: Dict[str, float] = None,
     ) -> None:
         """
         Ranker class to compare the scores of different methods.
@@ -77,11 +78,13 @@ def __init__(
             ranking_method (str): The method to use for ranking the methods.
             n_jobs (int): The number of CPU cores to use for parallel processing. Defaults to 4.
                           Set to -1 to use all available cores.
+            metric_weights (Dict[str, float]): A dictionary of metric names and their weights.
         """
         self.input_csvs_to_compare = input_csvs_to_compare
         self.metrics_for_reversal = metrics_for_reversal
         self.n_iterations = n_iterations
         self.ranking_method = ranking_method
+        self.metric_weights = metric_weights if metric_weights else {}
 
         if n_jobs == -1:
             self.n_jobs = os.cpu_count()
@@ -131,6 +134,7 @@ def combine_csvs_and_scores(self) -> None:
                 ignore_index=True,
             )
 
+        self.metrics_per_subject.to_csv("metric_rank.csv", index=False)
         self.rank_methods()
 
     def rank_methods(self) -> None:
@@ -141,17 +145,6 @@ def rank_methods(self) -> None:
             method=self.ranking_method, ascending=True, numeric_only=True
         )
 
-        metrics_for_reversal_lower = [x.lower() for x in self.metrics_for_reversal]
-
-        for metric in metrics_for_reversal_lower:
-            for column in self.ranks_per_metric.columns:
-                if metric in column:
-                    self.ranks_per_metric[column] = (
-                        self.ranks_per_metric[column].max()
-                        + 1
-                        - self.ranks_per_metric[column]
-                    )
-
         cumulative_rank_column = self.ranks_per_metric.sum(axis=1)
         final_rank_column = cumulative_rank_column.rank(
             method="average", ascending=True
diff --git a/tests_full.py b/tests_full.py
index 6a936fd..6eac063 100644
--- a/tests_full.py
+++ b/tests_full.py
@@ -1,6 +1,7 @@
 from pathlib import Path
-import pandas as pd
+
 import numpy as np
+import pandas as pd
 
 from pyranker.cli.run import main
 
@@ -43,9 +44,9 @@ def _sanity_check(output_dir: str) -> None:
 
 
 def test_main_dir_input():
-    cwd = Path.cwd()
-    test_data_dir = (cwd / "data").absolute().as_posix()
-    test_output_dir = (cwd / "data" / "temp_output").absolute().as_posix()
+    test_dir = Path(__file__).parent
+    test_data_dir = (test_dir / "data").absolute().as_posix()
+    test_output_dir = (test_dir / "data" / "temp_output").absolute().as_posix()
     main(
         input=test_data_dir,
         outputdir=test_output_dir,
@@ -56,9 +57,9 @@ def test_main_dir_input():
 
 
 def test_main_files_input():
-    cwd = Path.cwd()
-    test_data_dir = cwd / "data"
-    test_output_dir = (cwd / "data" / "temp_output").absolute().as_posix()
+    test_dir = Path(__file__).parent
+    test_data_dir = test_dir / "data"
+    test_output_dir = (test_dir / "data" / "temp_output").absolute().as_posix()
     input_files = ""
     for file in test_data_dir.iterdir():
         if file.suffix == ".csv":
@@ -71,3 +72,56 @@ def test_main_files_input():
     )
 
     _sanity_check(test_output_dir)
+
+
+def test_main_weighted_ranking(tmp_path):
+    """
+    Test the weighted ranking functionality.
+    """
+    # Create a temporary directory for test data
+    data_dir = tmp_path / "data"
+    data_dir.mkdir()
+    output_dir = tmp_path / "output"
+    output_dir.mkdir()
+
+    # Create sample CSV files
+    method1_data = {
+        "subjectid": ["s1", "s2"],
+        "metricA": [10, 20],
+        "metricB": [0.1, 0.2],
+    }
+    method1_df = pd.DataFrame(method1_data)
+    method1_df.to_csv(data_dir / "method1.csv", index=False)
+
+    method2_data = {
+        "subjectid": ["s1", "s2"],
+        "metricA": [15, 5],
+        "metricB": [0.3, 0.4],
+    }
+    method2_df = pd.DataFrame(method2_data)
+    method2_df.to_csv(data_dir / "method2.csv", index=False)
+
+    # Call main with weighted ranking arguments
+    main(
+        input=str(data_dir),
+        outputdir=str(output_dir),
+        metrics_for_reversal="metricB",
+        metric_to_use="metricA,metricB",
+        weight="3,1",
+    )
+
+    # Check the output
+    ranks_file = output_dir / "ranks.csv"
+    assert ranks_file.exists(), "Ranks file does not exist"
+    ranks_df = pd.read_csv(ranks_file)
+
+    expected_ranks = {
+        "method1": 1.5,
+        "method2": 1.5,
+    }
+
+    for method, expected_rank in expected_ranks.items():
+        assert (
+            ranks_df[ranks_df["method"] == method]["final_rank"].values[0]
+            == expected_rank
+        ), f"Final rank for {method} is not as expected"

From 681d741d1962f0b52217655d39ed227df04965e3 Mon Sep 17 00:00:00 2001
From: Siddhesh Thakur <sid.cre8er@gmail.com>
Date: Tue, 19 Aug 2025 12:47:47 -0400
Subject: [PATCH 07/11] needs documentation, but working as expected now

---
 pyranker/cli/run.py |  31 +--------
 pyranker/ranker.py  | 149 ++++++++++++++++++++++----------------------
 2 files changed, 75 insertions(+), 105 deletions(-)

diff --git a/pyranker/cli/run.py b/pyranker/cli/run.py
index 9d2f081..3c967d9 100644
--- a/pyranker/cli/run.py
+++ b/pyranker/cli/run.py
@@ -162,20 +162,6 @@ def main(
             help="The comma-separated metric columns for which the reversal should be calculated; for example, 'hausdorff_tc,hausdorff_et'.",
         ),
     ] = "",
-    metric_to_use: Annotated[
-        str,
-        typer.Option(
-            "--metric-to-use",
-            help="The comma-separated metric columns to use for ranking.",
-        ),
-    ] = "",
-    weight: Annotated[
-        str,
-        typer.Option(
-            "--weight",
-            help="The comma-separated weights for the metrics.",
-        ),
-    ] = "",
     iterations: Annotated[
         int,
         typer.Option(
@@ -241,21 +227,6 @@ def main(
         metrics_for_reversal.split(",") if metrics_for_reversal else []
     )
 
-    metric_weights = {}
-    if metric_to_use and weight:
-        metrics = [m.strip() for m in metric_to_use.split(",")]
-        weights = [float(w.strip()) for w in weight.split(",")]
-        assert len(metrics) == len(weights), (
-            "Number of metrics and weights must be the same."
-        )
-
-        total_weight = sum(weights)
-        metric_weights = {
-            metric.lower().strip(): w / total_weight
-            for metric, w in zip(metrics, weights)
-        }
-        print(f"Using weighted ranking with metrics: {metric_weights}")
-
     num_subjects, num_metrics = validate_csvs(csvs_to_compare_with_full_path)
 
     # print the summary of the input files
@@ -271,7 +242,7 @@ def main(
         n_iterations=iterations,
         ranking_method=ranking_method,
         n_jobs=n_jobs,
-        metric_weights=metric_weights,
+        output_dir=outputdir,
     )
     ranks, pvals = ranker.get_rankings_and_pvals()
     Path(outputdir).mkdir(parents=True, exist_ok=True)
diff --git a/pyranker/ranker.py b/pyranker/ranker.py
index 7bba987..86f70a8 100644
--- a/pyranker/ranker.py
+++ b/pyranker/ranker.py
@@ -66,142 +66,139 @@ def __init__(
         n_iterations: int = 1000,
         ranking_method: str = "average",
         n_jobs: int = 4,
-        metric_weights: Dict[str, float] = None,
+        output_dir: str = ".",
+        detailed_ranks_csv_name: str = "detailed_ranks.csv",
     ) -> None:
         """
         Ranker class to compare the scores of different methods.
-
-        Args:
-            input_csvs_to_compare (Dict[str, str]): A dictionary with the key being the method name and the value being the path to the CSV file.
-            metrics_for_reversal (List[str]): The metrics for which the reversal should be calculated.
-            n_iterations (int): The number of iterations to perform for the permutation test.
-            ranking_method (str): The method to use for ranking the methods.
-            n_jobs (int): The number of CPU cores to use for parallel processing. Defaults to 4.
-                          Set to -1 to use all available cores.
-            metric_weights (Dict[str, float]): A dictionary of metric names and their weights.
         """
         self.input_csvs_to_compare = input_csvs_to_compare
         self.metrics_for_reversal = metrics_for_reversal
         self.n_iterations = n_iterations
         self.ranking_method = ranking_method
-        self.metric_weights = metric_weights if metric_weights else {}
+        self.output_dir = output_dir
+        self.detailed_ranks_csv_name = detailed_ranks_csv_name
 
         if n_jobs == -1:
             self.n_jobs = os.cpu_count()
         else:
             self.n_jobs = n_jobs
 
-        self.combined_scores_per_subject = {}
-        self.participant_ranks = {}
-
+        os.makedirs(self.output_dir, exist_ok=True)
+        self.detailed_rank_columns = []
+        self.all_subject_ids = set()
         self.combine_csvs_and_scores()
 
     def combine_csvs_and_scores(self) -> None:
         """
         Combine the CSVs and scores of the methods.
         """
-        self.combined_scores_per_subject["method"] = []
         self.metrics_per_subject = pd.DataFrame()
 
         for method in self.input_csvs_to_compare:
-            self.combined_scores_per_subject["method"].append(method)
             current_df = pd.read_csv(self.input_csvs_to_compare[method])
             current_df.columns = current_df.columns.str.lower()
+            self.all_subject_ids.update(current_df["subjectid"].unique())
             current_df = current_df.sort_values(by="subjectid")
-
-            metrics_columns = sorted(
-                [col for col in current_df.columns if col != "subjectid"]
-            )
+            metrics_columns = [col for col in current_df.columns if col != "subjectid"]
+            metrics_columns.sort()
             current_df = current_df[["subjectid"] + metrics_columns]
-
-            for metric in metrics_columns:
-                current_df[metric] = current_df[metric].rank(
-                    method=self.ranking_method, ascending=False
-                )
-
             current_df_flattened = {"method": method}
             for _, row in current_df.iterrows():
-                for metric in current_df.columns:
-                    if metric != "subjectid":
-                        current_df_flattened[f"{metric}_{row['subjectid']}"] = row[
-                            metric
-                        ]
-
+                for metric in metrics_columns:
+                    current_df_flattened[f"{metric}_{row['subjectid']}"] = row[metric]
             current_df_flattened = pd.DataFrame(current_df_flattened, index=[0])
             self.metrics_per_subject = pd.concat(
-                [self.metrics_per_subject, current_df_flattened],
-                axis=0,
-                ignore_index=True,
-            )
+                [self.metrics_per_subject, current_df_flattened], axis=0
+            ).reset_index(drop=True)
 
-        self.metrics_per_subject.to_csv("metric_rank.csv", index=False)
         self.rank_methods()
 
     def rank_methods(self) -> None:
         """
-        Rank the methods based on each metric and calculate the combined final rank.
+        Rank the methods based on the metrics using the new two-step aggregation.
         """
-        self.ranks_per_metric = self.metrics_per_subject.rank(
-            method=self.ranking_method, ascending=True, numeric_only=True
+        ranks_per_metric_detailed = self.metrics_per_subject.rank(
+            method=self.ranking_method, ascending=False, numeric_only=True
         )
+        metrics_for_reversal_lower = [x.lower() for x in self.metrics_for_reversal]
+        for metric in metrics_for_reversal_lower:
+            for column in ranks_per_metric_detailed.columns:
+                if metric in column:
+                    ranks_per_metric_detailed[column] = (
+                        ranks_per_metric_detailed[column].max()
+                        + 1
+                        - ranks_per_metric_detailed[column]
+                    )
+        self.detailed_rank_columns = ranks_per_metric_detailed.columns.tolist()
+        verification_df = ranks_per_metric_detailed.copy()
+        verification_df.insert(0, "method", self.metrics_per_subject["method"])
+        verification_path = os.path.join(self.output_dir, self.detailed_ranks_csv_name)
+        verification_df.to_csv(verification_path, index=False)
+        print(f"Saved detailed verification ranks to: {verification_path}")
+
+        # --- FIX 1: EFFICIENT DATAFRAME CREATION ---
+        # Create a dictionary to hold new columns first.
+        subject_avg_rank_data = {}
+        subject_ids_sorted = sorted(list(self.all_subject_ids))
+
+        for subject in subject_ids_sorted:
+            subject_cols = [
+                col for col in self.detailed_rank_columns if col.endswith(f"_{subject}")
+            ]
+            if subject_cols:
+                # Add the new Series to the dictionary instead of the DataFrame.
+                subject_avg_rank_data[f"{subject}_avg_rank"] = (
+                    ranks_per_metric_detailed[subject_cols].mean(axis=1)
+                )
+
+        # Create the DataFrame from the dictionary in a single, efficient operation.
+        per_subject_avg_ranks = pd.DataFrame(subject_avg_rank_data)
+        # --- END OF FIX 1 ---
 
-        cumulative_rank_column = self.ranks_per_metric.sum(axis=1)
+        cumulative_rank_column = per_subject_avg_ranks.sum(axis=1)
         final_rank_column = cumulative_rank_column.rank(
             method="average", ascending=True
         )
-
         self.ranks_per_metric = pd.concat(
             [
-                self.ranks_per_metric,
-                cumulative_rank_column.rename("cumulative_rank"),
-                final_rank_column.rename("final_rank"),
                 self.metrics_per_subject["method"],
+                final_rank_column.rename("final_rank"),
+                cumulative_rank_column.rename("cumulative_rank"),
+                per_subject_avg_ranks,
+                ranks_per_metric_detailed,
             ],
             axis=1,
         )
-
-        self.ranks_per_metric = self.ranks_per_metric[
-            ["method", "final_rank", "cumulative_rank"]
-            + [
-                col
-                for col in self.ranks_per_metric.columns
-                if col not in ["method", "final_rank", "cumulative_rank"]
-            ]
-        ]
-
         self.perform_permutation_test()
 
     def perform_permutation_test(self) -> None:
         """
-        Perform permutation test in parallel to determine the significance of the ranks.
+        Perform permutation test to determine the significance of the ranks.
+        This test is performed on the detailed rank data.
         """
         n_methods = len(self.ranks_per_metric)
         self.pvals = np.zeros((n_methods, n_methods))
-
         ranks_per_metric_sorted = self.ranks_per_metric.sort_values(
             by="cumulative_rank"
         ).reset_index(drop=True)
 
-        ranks_per_metric_sanitized = ranks_per_metric_sorted.drop(
-            columns=["method", "cumulative_rank", "final_rank"]
-        )
+        # --- FIX 2: SELECT ONLY DETAILED RANK COLUMNS FOR THE TEST ---
+        # Instead of dropping columns, explicitly select the correct ones.
+        # This is robust and prevents contamination from summary columns.
+        ranks_per_metric_sanitized = ranks_per_metric_sorted[self.detailed_rank_columns]
+        # --- END OF FIX 2 ---
 
-        # Use ProcessPoolExecutor for parallel processing
         with ProcessPoolExecutor(max_workers=self.n_jobs) as executor:
-            # A dictionary to map future objects to their matrix indices (i, j)
             future_to_indices = {}
-            # Submit all pairs to the executor
             for i in range(n_methods):
                 for j in range(i + 1, n_methods):
                     arr_i = ranks_per_metric_sanitized.iloc[i].to_numpy()
                     arr_j = ranks_per_metric_sanitized.iloc[j].to_numpy()
-                    # Submit the worker function with its arguments
                     future = executor.submit(
                         _calculate_pval_for_pair, arr_i, arr_j, self.n_iterations
                     )
                     future_to_indices[future] = (i, j)
-
-            # Create a progress bar that updates as tasks are completed
             pbar = tqdm(
                 as_completed(future_to_indices),
                 total=len(future_to_indices),
@@ -212,7 +209,7 @@ def perform_permutation_test(self) -> None:
                 try:
                     pval = future.result()
                     self.pvals[i, j] = pval
-                    self.pvals[j, i] = pval  # p-value is symmetric
+                    self.pvals[j, i] = pval
                 except Exception as exc:
                     print(f"Pair ({i}, {j}) generated an exception: {exc}")
 
@@ -224,15 +221,17 @@ def perform_permutation_test(self) -> None:
 
     def get_rankings_and_pvals(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
         """
-        Get the final rankings and p-values.
-
-        Returns:
-            Tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the rankings and p-values dataframes.
+        Get the final rankings and p-values, storing only the upper right diagonal of the p-values matrix.
         """
         ranks_df = self.ranks_per_metric.sort_values(by="final_rank").reset_index(
             drop=True
         )
-        pvals_df = self.pvals_df.reindex(
-            index=ranks_df["method"], columns=ranks_df["method"]
-        )
-        return ranks_df, pvals_df
+
+        # Create a DataFrame for the upper right diagonal of the p-values matrix
+        pvals_upper_df = pd.DataFrame(
+            np.triu(self.pvals),  # Use np.triu to get the upper triangle of the matrix
+            columns=self.pvals_df.columns,
+            index=self.pvals_df.index,
+        ).reindex(index=ranks_df["method"], columns=ranks_df["method"])
+
+        return ranks_df, pvals_upper_df

From 081a6103b94844233a1c5ce4da5adfaa082c980f Mon Sep 17 00:00:00 2001
From: Siddhesh Thakur <sid.cre8er@gmail.com>
Date: Tue, 19 Aug 2025 15:27:08 -0400
Subject: [PATCH 08/11] ready to PR

---
 README.md          |  68 +++++++++++++++++++++--
 pyranker/ranker.py | 136 +++++++++++++++++++++++++++++++++++++++------
 2 files changed, 181 insertions(+), 23 deletions(-)

diff --git a/README.md b/README.md
index 8cb5675..bc264a1 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,67 @@
 # PyRanker
 
-This package is designed to compare the performance of different methods. 
+This package is designed to compare the performance of different methods.
+
+## Algorithm
+
+The Ranker class compares the performance of different methods based on a set of
+metrics. It takes as input a dictionary of CSV files, where each file
+represents a method and contains the scores for a set of subjects on a set of
+metrics.
+
+The ranking algorithm consists of the following steps:
+
+1.  **Combine CSVs and Scores**: The class first combines all the input CSV
+    files into a single DataFrame. This DataFrame has one row per method and
+    one flat column per metric-subject combination, named
+    `<metric>_<subjectid>`.
+
+2.  **Rank Methods**: The class then ranks the methods based on their scores for
+    each metric and subject. The ranking can be done using different methods,
+    such as 'average', 'min', 'max', 'first', or 'dense'.
+
+3.  **Handle Metric Reversal**: For metrics where lower values are better (e.g.,
+    error rates), the class can reverse the ranks so that lower scores get
+    better (numerically lower) ranks.
+
+4.  **Aggregate Ranks**: The class then aggregates the ranks across all metrics
+    for each subject to get a per-subject average rank for each method.
+
+5.  **Calculate Cumulative Rank**: The per-subject average ranks are then summed
+    up to get a cumulative rank for each method.
+
+6.  **Determine Final Rank**: The methods are then ranked based on their
+    cumulative ranks to determine the final ranking.
+
+7.  **Perform Permutation Test**: Finally, the class performs a permutation test
+    to determine the statistical significance of the differences in the ranks
+    of the methods. The permutation test is a non-parametric method that does
+    not make any assumptions about the distribution of the data.
+
+The output of the Ranker class is a pair of DataFrames: one containing the
+final rankings of the methods, and another containing the p-values from the
+permutation test.
+
+### Permutation Test
+
+The permutation test is a non-parametric method for testing the statistical
+significance of an observed difference between two groups. In this case, the
+two groups are the ranks of two different methods.
+
+The null hypothesis is that the two methods are equivalent, and any observed
+difference in their ranks is due to chance. The alternative hypothesis is that
+the two methods are not equivalent, and the observed difference in their ranks
+reflects a real difference in performance.
+
+The test works by repeatedly shuffling the ranks between the two methods and
+calculating the difference in their sums. The p-value is the proportion of
+permutations that result in a difference as or more extreme than the
+observed difference.
 
 ## Installation
 
 ```sh
-(base) user@location $> git clone https://github.com/mlcommons/PyRanker.git 
+(base) user@location $> git clone https://github.com/mlcommons/PyRanker.git
 (base) user@location $> cd PyRanker
 (base) user@PyRanker $> conda create -p ./venv python=3.12 -y
 (base) user@PyRanker $> conda activate ./venv
@@ -41,10 +97,10 @@ This package is designed to compare the performance of different methods.
 
 2. **Metrics for reversal normalization**: a comma-separated list of metrics that need to be normalized in reverse. For metrics such as [Hausdorff Distance](https://en.wikipedia.org/wiki/Hausdorff_distance) and communication cost (used in the [FeTS Challenge](https://doi.org/10.48550/arXiv.2105.05874)) which are defined as "higher is worse", PyRanker can normalize in reverse order.
    - This is checked in a case-insensitive manner, so `C,F` is equivalent to `c,f`.
-   - The check is done by checking for the presence of the string in the metric header, rather than a "hard" check. For example, passing `hausd` **will** match `hausd*` in the metric headers, and will be case-insensitive. This is done to allow for flexibility in the metric names. 
-   - The metric string needs to be present. For example, passing `dsc` **will not** match for `dice*` in the metric headers. 
+   - The check is done by checking for the presence of the string in the metric header, rather than a "hard" check. For example, passing `hausd` **will** match `hausd*` in the metric headers, and will be case-insensitive. This is done to allow for flexibility in the metric names.
+   - The metric string needs to be present. For example, passing `dsc` **will not** match for `dice*` in the metric headers.
 
-3. **Ranking method**: the ranking method used to rank the methods. The available options are [[ref](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rank.html#pandas-dataframe-rank)]: 
+3. **Ranking method**: the ranking method used to rank the methods. The available options are [[ref](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rank.html#pandas-dataframe-rank)]:
    - `average` (default): average rank of the group
    - `min`: lowest rank in the group
    - `max`: highest rank in the group
@@ -73,4 +129,4 @@ To get detailed help, please run ```ranker --help```.
 
 ## Acknowledgements
 
-This tool was partly supported by the [Informatics Technology for Cancer Research (ITCR) program](https://www.cancer.gov/about-nci/organization/cssi/research/itcr) of the [National Cancer Institute (NCI)](https://www.cancer.gov/) at the [National Institutes of Health (NIH)](https://www.nih.gov/) under award numbers [U01CA242871](https://reporter.nih.gov/search/8qcT1J34hEyj5npqmq9aEw/project-details/10009302) and [U24CA279629](https://reporter.nih.gov/search/8qcT1J34hEyj5npqmq9aEw/project-details/10932257). The content of this tool is solely the responsibility of the authors and does not represent the official views of the NIH.
+This tool was partly supported by the [Informatics Technology for Cancer Research (ITCR) program](https://www.cancer.gov/about-nci/organization/cssi/research/itcr) of the [National Cancer Institute (NCI)](https://www.cancer.gov/) at the [National Institutes of Health (NIH)](https://www.nih.gov/) under award numbers [U01CA242871](https://reporter.nih.gov/search/8qcT1J34hEyj5npqmq9aEw/project-details/10009302) and [U24CA279629](https://reporter.nih.gov/search/8qcT1J34hEyj5npqmq9aEw/project-details/10932257). The content of this tool is solely the responsibility of the authors and does not represent the official views of the NIH.
\ No newline at end of file
diff --git a/pyranker/ranker.py b/pyranker/ranker.py
index 86f70a8..97977b8 100644
--- a/pyranker/ranker.py
+++ b/pyranker/ranker.py
@@ -15,6 +15,20 @@ def _calculate_pval_for_pair(
     """
     Performs the permutation test for a single pair of rank arrays.
 
+    The permutation test is a non-parametric method for testing the statistical
+    significance of an observed difference between two groups. In this case, the
+    two groups are the ranks of two different methods.
+
+    The null hypothesis is that the two methods are equivalent, and any observed
+    difference in their ranks is due to chance. The alternative hypothesis is that
+    the two methods are not equivalent, and the observed difference in their ranks
+    reflects a real difference in performance.
+
+    The test works by repeatedly shuffling the ranks between the two methods and
+    calculating the difference in their sums. The p-value is the proportion of
+    permutations that result in a difference as or more extreme than the
+    observed difference.
+
     Args:
         arr_i (np.ndarray): Rank array for the first method.
         arr_j (np.ndarray): Rank array for the second method.
@@ -59,6 +73,46 @@ def _calculate_pval_for_pair(
 
 
 class Ranker:
+    """
+    The Ranker class compares the performance of different methods based on a set of
+    metrics. It takes as input a dictionary of CSV files, where each file
+    represents a method and contains the scores for a set of subjects on a set of
+    metrics.
+
+    The ranking algorithm consists of the following steps:
+
+    1.  **Combine CSVs and Scores**: The class first combines all the input CSV
+        files into a single DataFrame. This DataFrame has one row per method and
+        one flat column per metric-subject combination, named
+        `<metric>_<subjectid>`.
+
+    2.  **Rank Methods**: The class then ranks the methods based on their scores for
+        each metric and subject. The ranking can be done using different methods,
+        such as 'average', 'min', 'max', 'first', or 'dense'.
+
+    3.  **Handle Metric Reversal**: For metrics where lower values are better (e.g.,
+        error rates), the class can reverse the ranks so that lower scores get
+        better (numerically lower) ranks.
+
+    4.  **Aggregate Ranks**: The class then aggregates the ranks across all metrics
+        for each subject to get a per-subject average rank for each method.
+
+    5.  **Calculate Cumulative Rank**: The per-subject average ranks are then summed
+        up to get a cumulative rank for each method.
+
+    6.  **Determine Final Rank**: The methods are then ranked based on their
+        cumulative ranks to determine the final ranking.
+
+    7.  **Perform Permutation Test**: Finally, the class performs a permutation test
+        to determine the statistical significance of the differences in the ranks
+        of the methods. The permutation test is a non-parametric method that does
+        not make any assumptions about the distribution of the data.
+
+    The output of the Ranker class is a pair of DataFrames: one containing the
+    final rankings of the methods, and another containing the p-values from the
+    permutation test.
+    """
+
     def __init__(
         self,
         input_csvs_to_compare: Dict[str, str],
@@ -70,7 +124,26 @@ def __init__(
         detailed_ranks_csv_name: str = "detailed_ranks.csv",
     ) -> None:
         """
-        Ranker class to compare the scores of different methods.
+        Initializes the Ranker class.
+
+        Args:
+            input_csvs_to_compare (Dict[str, str]): A dictionary where the keys are
+                the method names and the values are the paths to the CSV files
+                containing the scores for each method.
+            metrics_for_reversal (List[str]): A list of metrics for which the
+                ranks should be reversed (i.e., lower values are better).
+            n_iterations (int, optional): The number of iterations to perform for
+                the permutation test. Defaults to 1000.
+            ranking_method (str, optional): The method to use for ranking the
+                methods. Can be one of 'average', 'min', 'max', 'first', or
+                'dense'. Defaults to "average".
+            n_jobs (int, optional): The number of CPU cores to use for parallel
+                processing. Defaults to 4.
+            output_dir (str, optional): The directory where the output files will
+                be saved. Defaults to ".".
+            detailed_ranks_csv_name (str, optional): The name of the CSV file
+                where the detailed ranks will be saved. Defaults to
+                "detailed_ranks.csv".
         """
         self.input_csvs_to_compare = input_csvs_to_compare
         self.metrics_for_reversal = metrics_for_reversal
@@ -91,7 +164,11 @@ def __init__(
 
     def combine_csvs_and_scores(self) -> None:
         """
-        Combine the CSVs and scores of the methods.
+        Combines the input CSV files into a single DataFrame.
+
+        This method reads each CSV file, converts the column names to lowercase,
+        and then flattens the DataFrame so that each row represents a method and
+        each column represents a metric-subject combination.
         """
         self.metrics_per_subject = pd.DataFrame()
 
@@ -116,11 +193,20 @@ def combine_csvs_and_scores(self) -> None:
 
     def rank_methods(self) -> None:
         """
-        Rank the methods based on the metrics using the new two-step aggregation.
+        Ranks the methods based on the metrics using a two-step aggregation process.
+
+        First, it ranks the methods for each metric and subject combination.
+        Then, it calculates the average rank for each method across all metrics
+        for each subject. Finally, it sums up the per-subject average ranks to
+        get a cumulative rank for each method, which is then used to determine
+        the final ranking.
         """
+        # Rank the methods for each metric-subject combination
         ranks_per_metric_detailed = self.metrics_per_subject.rank(
             method=self.ranking_method, ascending=False, numeric_only=True
         )
+
+        # Reverse the ranks for the specified metrics
         metrics_for_reversal_lower = [x.lower() for x in self.metrics_for_reversal]
         for metric in metrics_for_reversal_lower:
             for column in ranks_per_metric_detailed.columns:
@@ -131,35 +217,38 @@ def rank_methods(self) -> None:
                         - ranks_per_metric_detailed[column]
                     )
         self.detailed_rank_columns = ranks_per_metric_detailed.columns.tolist()
+
+        # Save the detailed ranks to a CSV file for verification
         verification_df = ranks_per_metric_detailed.copy()
         verification_df.insert(0, "method", self.metrics_per_subject["method"])
         verification_path = os.path.join(self.output_dir, self.detailed_ranks_csv_name)
         verification_df.to_csv(verification_path, index=False)
         print(f"Saved detailed verification ranks to: {verification_path}")
 
-        # --- FIX 1: EFFICIENT DATAFRAME CREATION ---
-        # Create a dictionary to hold new columns first.
+        # Create a dictionary to hold the per-subject average ranks
         subject_avg_rank_data = {}
         subject_ids_sorted = sorted(list(self.all_subject_ids))
 
+        # Calculate the average rank for each method for each subject
         for subject in subject_ids_sorted:
             subject_cols = [
                 col for col in self.detailed_rank_columns if col.endswith(f"_{subject}")
             ]
             if subject_cols:
-                # Add the new Series to the dictionary instead of the DataFrame.
                 subject_avg_rank_data[f"{subject}_avg_rank"] = (
                     ranks_per_metric_detailed[subject_cols].mean(axis=1)
                 )
 
-        # Create the DataFrame from the dictionary in a single, efficient operation.
+        # Create a DataFrame from the dictionary of per-subject average ranks
         per_subject_avg_ranks = pd.DataFrame(subject_avg_rank_data)
-        # --- END OF FIX 1 ---
 
+        # Calculate the cumulative and final ranks
         cumulative_rank_column = per_subject_avg_ranks.sum(axis=1)
         final_rank_column = cumulative_rank_column.rank(
             method="average", ascending=True
         )
+
+        # Combine all the rank information into a single DataFrame
         self.ranks_per_metric = pd.concat(
             [
                 self.metrics_per_subject["method"],
@@ -170,12 +259,14 @@ def rank_methods(self) -> None:
             ],
             axis=1,
         )
+
+        # Perform the permutation test to determine the statistical significance
         self.perform_permutation_test()
 
     def perform_permutation_test(self) -> None:
         """
-        Perform permutation test to determine the significance of the ranks.
-        This test is performed on the detailed rank data.
+        Performs a permutation test to determine the statistical significance of the
+        ranks. This test is performed on the detailed rank data.
         """
         n_methods = len(self.ranks_per_metric)
         self.pvals = np.zeros((n_methods, n_methods))
@@ -183,12 +274,10 @@ def perform_permutation_test(self) -> None:
             by="cumulative_rank"
         ).reset_index(drop=True)
 
-        # --- FIX 2: SELECT ONLY DETAILED RANK COLUMNS FOR THE TEST ---
-        # Instead of dropping columns, explicitly select the correct ones.
-        # This is robust and prevents contamination from summary columns.
+        # Select only the detailed rank columns for the test
         ranks_per_metric_sanitized = ranks_per_metric_sorted[self.detailed_rank_columns]
-        # --- END OF FIX 2 ---
 
+        # Use a process pool to parallelize the p-value calculations
         with ProcessPoolExecutor(max_workers=self.n_jobs) as executor:
             future_to_indices = {}
             for i in range(n_methods):
@@ -199,6 +288,8 @@ def perform_permutation_test(self) -> None:
                         _calculate_pval_for_pair, arr_i, arr_j, self.n_iterations
                     )
                     future_to_indices[future] = (i, j)
+
+            # Show a progress bar for the permutation test
             pbar = tqdm(
                 as_completed(future_to_indices),
                 total=len(future_to_indices),
@@ -208,11 +299,14 @@ def perform_permutation_test(self) -> None:
                 i, j = future_to_indices[future]
                 try:
                     pval = future.result()
-                    self.pvals[i, j] = pval
-                    self.pvals[j, i] = pval
+                    # Round for output: 3 decimals, or scientific notation below 0.001
+                    formatted_pval = f"{pval:.3f}" if pval >= 0.001 else f"{pval:.1e}"
+                    self.pvals[i, j] = formatted_pval
+                    self.pvals[j, i] = formatted_pval
                 except Exception as exc:
                     print(f"Pair ({i}, {j}) generated an exception: {exc}")
 
+        # Create a DataFrame from the p-values
         self.pvals_df = pd.DataFrame(
             self.pvals,
             columns=ranks_per_metric_sorted["method"],
@@ -221,7 +315,15 @@ def perform_permutation_test(self) -> None:
 
     def get_rankings_and_pvals(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
         """
-        Get the final rankings and p-values, storing only the upper right diagonal of the p-values matrix.
+        Returns the final rankings and p-values.
+
+        The p-values matrix is returned as a DataFrame holding only its upper
+        triangle, to avoid redundancy.
+
+        Returns:
+            Tuple[pd.DataFrame, pd.DataFrame]: A tuple containing two DataFrames:
+                - The final rankings of the methods.
+                - The p-values from the permutation test.
         """
         ranks_df = self.ranks_per_metric.sort_values(by="final_rank").reset_index(
             drop=True

From db7b56e80752fe32b8e4ad24910a6adacaf36ab9 Mon Sep 17 00:00:00 2001
From: Siddhesh Thakur <sid.cre8er@gmail.com>
Date: Thu, 28 Aug 2025 13:43:00 -0400
Subject: [PATCH 09/11] pvalue calculation is now fixed

---
 data/temp_output/detailed_ranks.csv |  5 +++++
 data/temp_output/pvals.csv          | 10 +++++-----
 data/temp_output/ranks.csv          | 10 +++++-----
 pyranker/ranker.py                  | 20 ++++++++++++--------
 test_ranking_fix.py                 |  0
 5 files changed, 27 insertions(+), 18 deletions(-)
 create mode 100644 data/temp_output/detailed_ranks.csv
 create mode 100644 test_ranking_fix.py

diff --git a/data/temp_output/detailed_ranks.csv b/data/temp_output/detailed_ranks.csv
new file mode 100644
index 0000000..6648b3a
--- /dev/null
+++ b/data/temp_output/detailed_ranks.csv
@@ -0,0 +1,5 @@
+method,a_s1,b_s1,c_s1,d_s1,e_s1,f_s1,a_s2,b_s2,c_s2,d_s2,e_s2,f_s2
+m1,4.0,4.0,1.0,4.0,4.0,1.0,4.0,4.0,1.0,4.0,4.0,1.0
+m2,3.0,3.0,2.0,3.0,3.0,2.0,3.0,3.0,2.0,3.0,3.0,2.0
+m3,2.0,2.0,3.0,2.0,2.0,3.0,2.0,2.0,3.0,2.0,2.0,3.0
+m4,1.0,1.0,4.0,1.0,1.0,4.0,1.0,1.0,4.0,1.0,1.0,4.0
diff --git a/data/temp_output/pvals.csv b/data/temp_output/pvals.csv
index 14915f0..514c997 100644
--- a/data/temp_output/pvals.csv
+++ b/data/temp_output/pvals.csv
@@ -1,5 +1,5 @@
-method,m1,m2,m3,m4
-m1,0.0,1.0,1.0,1.0
-m2,1.0,0.0,1.0,1.0
-m3,1.0,1.0,0.0,1.0
-m4,1.0,1.0,1.0,0.0
+method,m4,m3,m2,m1
+m4,0.0,0.928,0.926,0.925
+m3,0.0,0.0,0.928,0.926
+m2,0.0,0.0,0.0,0.927
+m1,0.0,0.0,0.0,0.0
diff --git a/data/temp_output/ranks.csv b/data/temp_output/ranks.csv
index e34367f..e555038 100644
--- a/data/temp_output/ranks.csv
+++ b/data/temp_output/ranks.csv
@@ -1,5 +1,5 @@
-method,final_rank,cumulative_rank,average_rank_s1,average_rank_s2
-m1,2.5,5.0,2.5,2.5
-m2,2.5,5.0,2.5,2.5
-m3,2.5,5.0,2.5,2.5
-m4,2.5,5.0,2.5,2.5
+method,final_rank,cumulative_rank,s1_avg_rank,s2_avg_rank,a_s1,b_s1,c_s1,d_s1,e_s1,f_s1,a_s2,b_s2,c_s2,d_s2,e_s2,f_s2
+m4,1.0,4.0,2.0,2.0,1.0,1.0,4.0,1.0,1.0,4.0,1.0,1.0,4.0,1.0,1.0,4.0
+m3,2.0,4.666666666666667,2.3333333333333335,2.3333333333333335,2.0,2.0,3.0,2.0,2.0,3.0,2.0,2.0,3.0,2.0,2.0,3.0
+m2,3.0,5.333333333333333,2.6666666666666665,2.6666666666666665,3.0,3.0,2.0,3.0,3.0,2.0,3.0,3.0,2.0,3.0,3.0,2.0
+m1,4.0,6.0,3.0,3.0,4.0,4.0,1.0,4.0,4.0,1.0,4.0,4.0,1.0,4.0,4.0,1.0
diff --git a/pyranker/ranker.py b/pyranker/ranker.py
index 97977b8..bf1a086 100644
--- a/pyranker/ranker.py
+++ b/pyranker/ranker.py
@@ -60,15 +60,12 @@ def _calculate_pval_for_pair(
         permuted_diff = arr1_rand.sum() - arr2_rand.sum()
 
         # Check if the permuted difference is as or more extreme
-        if permuted_diff >= observed_diff:
+        if permuted_diff <= observed_diff:
             count_extreme += 1
 
-    # To avoid p-values of 0, which implies absolute certainty, we add 1
-    # if no permuted difference was more extreme than the observed one.
-    if count_extreme == 0:
-        count_extreme = 1
-
-    pval = count_extreme / n_iterations
+    # Calculate the p-value using the standard formula for permutation tests,
+    # which adds 1 to both the numerator and denominator to avoid p-values of 0.
+    pval = (count_extreme + 1) / (n_iterations + 1)
     return pval
 
 
@@ -183,7 +180,13 @@ def combine_csvs_and_scores(self) -> None:
             current_df_flattened = {"method": method}
             for _, row in current_df.iterrows():
                 for metric in metrics_columns:
-                    current_df_flattened[f"{metric}_{row['subjectid']}"] = row[metric]
+                    score = row[metric]
+                    if not pd.api.types.is_number(score):
+                        raise ValueError(
+                            f"Invalid score for method '{method}', subject '{row['subjectid']}', "
+                            f"metric '{metric}'. Expected a number, but got '{score}'."
+                        )
+                    current_df_flattened[f"{metric}_{row['subjectid']}"] = score
             current_df_flattened = pd.DataFrame(current_df_flattened, index=[0])
             self.metrics_per_subject = pd.concat(
                 [self.metrics_per_subject, current_df_flattened], axis=0
@@ -202,6 +205,7 @@ def rank_methods(self) -> None:
         the final ranking.
         """
         # Rank the methods for each metric-subject combination
+        # Use ascending=False so that higher values get better ranks (original logic)
         ranks_per_metric_detailed = self.metrics_per_subject.rank(
             method=self.ranking_method, ascending=False, numeric_only=True
         )
diff --git a/test_ranking_fix.py b/test_ranking_fix.py
new file mode 100644
index 0000000..e69de29

From 2015300ad28717a5eb0dde5dcf6ec7a517d4efce Mon Sep 17 00:00:00 2001
From: Siddhesh Thakur <sid.cre8er@gmail.com>
Date: Thu, 4 Sep 2025 21:07:41 -0400
Subject: [PATCH 10/11] need to fix based on newer updates

---
 pyranker/cli/run.py |  4 +--
 pyranker/ranker.py  | 67 ++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 59 insertions(+), 12 deletions(-)

diff --git a/pyranker/cli/run.py b/pyranker/cli/run.py
index 3c967d9..84e6dad 100644
--- a/pyranker/cli/run.py
+++ b/pyranker/cli/run.py
@@ -120,7 +120,7 @@ def __get_sorted_metrics(df: pd.DataFrame) -> list:
                 current_metrics = __get_sorted_metrics(current_df)
                 if current_metrics != metrics_base:
                     sanity_checks["Files_with_different_metrics"].append(filename)
-        except Exception as e:
+        except Exception:
             sanity_checks["Files_that_cannot_be_read"].append(filename)
 
     # if any of the sanity checks fail, print the problematic files and exit
@@ -185,7 +185,7 @@ def main(
             "--n-jobs",
             help="The number of CPU cores to use for parallel processing.",
         ),
-    ] = 4,
+    ] = 1,
     version: Annotated[
         Optional[bool],
         typer.Option(
diff --git a/pyranker/ranker.py b/pyranker/ranker.py
index bf1a086..4d383af 100644
--- a/pyranker/ranker.py
+++ b/pyranker/ranker.py
@@ -1,3 +1,4 @@
+import logging
 import os
 from concurrent.futures import ProcessPoolExecutor, as_completed
 from typing import Dict, List, Tuple
@@ -6,11 +7,23 @@
 import pandas as pd
 from tqdm import tqdm
 
+# Set up a global logger
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+    filename="ranker.log",
+    filemode="w",
+)
+logger = logging.getLogger(__name__)
+
 
 # This worker function is defined at the top level so it can be pickled
 # and sent to other processes by the ProcessPoolExecutor.
 def _calculate_pval_for_pair(
-    arr_i: np.ndarray, arr_j: np.ndarray, n_iterations: int
+    arr_i: np.ndarray,
+    arr_j: np.ndarray,
+    n_iterations: int,
+    log_permutations: bool = False,
 ) -> float:
     """
     Performs the permutation test for a single pair of rank arrays.
@@ -33,19 +46,23 @@ def _calculate_pval_for_pair(
         arr_i (np.ndarray): Rank array for the first method.
         arr_j (np.ndarray): Rank array for the second method.
         n_iterations (int): The number of permutation iterations.
+        log_permutations (bool, optional): Whether to print detailed permutation
+            information to stdout. Defaults to False.
 
     Returns:
         float: The calculated p-value.
     """
     # Use the difference for a one-sided test
     observed_diff = arr_i.sum() - arr_j.sum()
+    if log_permutations:
+        print(f"Observed difference in sums: {observed_diff}")
     count_extreme = 0
 
     # Create a local random number generator for thread-safety
     rng = np.random.default_rng()
 
     # Perform the permutation test
-    for _ in range(n_iterations):
+    for i in range(n_iterations):
         # Generate a random permutation mask
         r = rng.integers(0, 2, size=arr_i.shape, dtype=bool)
 
@@ -58,14 +75,24 @@ def _calculate_pval_for_pair(
 
         # Calculate the difference in ranks for the random permutation
         permuted_diff = arr1_rand.sum() - arr2_rand.sum()
+        if log_permutations:
+            print(
+                f"Permutation {i + 1}/{n_iterations} | Permuted diff: {permuted_diff}"
+            )
 
         # Check if the permuted difference is as or more extreme
         if permuted_diff <= observed_diff:
+            if log_permutations:
+                print(
+                    f"Permutation {i + 1} is more extreme: {permuted_diff} <= {observed_diff}"
+                )
             count_extreme += 1
 
     # Calculate the p-value using the standard formula for permutation tests,
     # which adds 1 to both the numerator and denominator to avoid p-values of 0.
     pval = (count_extreme + 1) / (n_iterations + 1)
+    if log_permutations:
+        print(f"Final p-value: {pval}")
     return pval
 
 
@@ -119,6 +146,7 @@ def __init__(
         n_jobs: int = 4,
         output_dir: str = ".",
         detailed_ranks_csv_name: str = "detailed_ranks.csv",
+        log_permutations: bool = False,
     ) -> None:
         """
         Initializes the Ranker class.
@@ -141,6 +169,8 @@ def __init__(
             detailed_ranks_csv_name (str, optional): The name of the CSV file
                 where the detailed ranks will be saved. Defaults to
                 "detailed_ranks.csv".
+            log_permutations (bool, optional): Whether to print detailed permutation
+                information to stdout. Defaults to False.
         """
         self.input_csvs_to_compare = input_csvs_to_compare
         self.metrics_for_reversal = metrics_for_reversal
@@ -148,6 +178,7 @@ def __init__(
         self.ranking_method = ranking_method
         self.output_dir = output_dir
         self.detailed_ranks_csv_name = detailed_ranks_csv_name
+        self.log_permutations = log_permutations
 
         if n_jobs == -1:
             self.n_jobs = os.cpu_count()
@@ -157,6 +188,7 @@ def __init__(
         os.makedirs(self.output_dir, exist_ok=True)
         self.detailed_rank_columns = []
         self.all_subject_ids = set()
+        print("Ranker initialized.")
         self.combine_csvs_and_scores()
 
     def combine_csvs_and_scores(self) -> None:
@@ -167,9 +199,11 @@ def combine_csvs_and_scores(self) -> None:
         and then flattens the DataFrame so that each row represents a method and
         each column represents a metric-subject combination.
         """
+        print("Combining CSVs and scores...")
         self.metrics_per_subject = pd.DataFrame()
 
         for method in self.input_csvs_to_compare:
+            print(f"Processing method: {method}")
             current_df = pd.read_csv(self.input_csvs_to_compare[method])
             current_df.columns = current_df.columns.str.lower()
             self.all_subject_ids.update(current_df["subjectid"].unique())
@@ -182,16 +216,19 @@ def combine_csvs_and_scores(self) -> None:
                 for metric in metrics_columns:
                     score = row[metric]
                     if not pd.api.types.is_number(score):
-                        raise ValueError(
+                        error_msg = (
                             f"Invalid score for method '{method}', subject '{row['subjectid']}', "
                             f"metric '{metric}'. Expected a number, but got '{score}'."
                         )
+                        logger.error(error_msg)
+                        raise ValueError(error_msg)
                     current_df_flattened[f"{metric}_{row['subjectid']}"] = score
             current_df_flattened = pd.DataFrame(current_df_flattened, index=[0])
             self.metrics_per_subject = pd.concat(
                 [self.metrics_per_subject, current_df_flattened], axis=0
             ).reset_index(drop=True)
 
+        print("Finished combining CSVs and scores.")
         self.rank_methods()
 
     def rank_methods(self) -> None:
@@ -204,6 +241,7 @@ def rank_methods(self) -> None:
         get a cumulative rank for each method, which is then used to determine
         the final ranking.
         """
+        print("Ranking methods...")
         # Rank the methods for each metric-subject combination
         # Use ascending=False so that higher values get better ranks (original logic)
         ranks_per_metric_detailed = self.metrics_per_subject.rank(
@@ -215,6 +253,7 @@ def rank_methods(self) -> None:
         for metric in metrics_for_reversal_lower:
             for column in ranks_per_metric_detailed.columns:
                 if metric in column:
+                    print(f"Reversing ranks for metric: {metric} in column: {column}")
                     ranks_per_metric_detailed[column] = (
                         ranks_per_metric_detailed[column].max()
                         + 1
@@ -245,7 +284,7 @@ def rank_methods(self) -> None:
 
         # Create a DataFrame from the dictionary of per-subject average ranks
         per_subject_avg_ranks = pd.DataFrame(subject_avg_rank_data)
-
+        self.per_subject_avg_ranks = per_subject_avg_ranks
         # Calculate the cumulative and final ranks
         cumulative_rank_column = per_subject_avg_ranks.sum(axis=1)
         final_rank_column = cumulative_rank_column.rank(
@@ -259,11 +298,10 @@ def rank_methods(self) -> None:
                 final_rank_column.rename("final_rank"),
                 cumulative_rank_column.rename("cumulative_rank"),
                 per_subject_avg_ranks,
-                ranks_per_metric_detailed,
             ],
             axis=1,
         )
-
+        print("Finished ranking methods.")
         # Perform the permutation test to determine the statistical significance
         self.perform_permutation_test()
 
@@ -272,6 +310,7 @@ def perform_permutation_test(self) -> None:
         Performs a permutation test to determine the statistical significance of the
         ranks. This test is performed on the detailed rank data.
         """
+        print("Performing permutation test...")
         n_methods = len(self.ranks_per_metric)
         self.pvals = np.zeros((n_methods, n_methods))
         ranks_per_metric_sorted = self.ranks_per_metric.sort_values(
@@ -279,7 +318,9 @@ def perform_permutation_test(self) -> None:
         ).reset_index(drop=True)
 
         # Select only the detailed rank columns for the test
-        ranks_per_metric_sanitized = ranks_per_metric_sorted[self.detailed_rank_columns]
+        ranks_per_metric_sanitized = ranks_per_metric_sorted[
+            self.per_subject_avg_ranks.columns
+        ]
 
         # Use a process pool to parallelize the p-value calculations
         with ProcessPoolExecutor(max_workers=self.n_jobs) as executor:
@@ -289,7 +330,11 @@ def perform_permutation_test(self) -> None:
                     arr_i = ranks_per_metric_sanitized.iloc[i].to_numpy()
                     arr_j = ranks_per_metric_sanitized.iloc[j].to_numpy()
                     future = executor.submit(
-                        _calculate_pval_for_pair, arr_i, arr_j, self.n_iterations
+                        _calculate_pval_for_pair,
+                        arr_i,
+                        arr_j,
+                        self.n_iterations,
+                        self.log_permutations,
                     )
                     future_to_indices[future] = (i, j)
 
@@ -308,7 +353,7 @@ def perform_permutation_test(self) -> None:
                     self.pvals[i, j] = formatted_pval
                     self.pvals[j, i] = formatted_pval
                 except Exception as exc:
-                    print(f"Pair ({i}, {j}) generated an exception: {exc}")
+                    logger.error(f"Pair ({i}, {j}) generated an exception: {exc}")
 
         # Create a DataFrame from the p-values
         self.pvals_df = pd.DataFrame(
@@ -316,6 +361,7 @@ def perform_permutation_test(self) -> None:
             columns=ranks_per_metric_sorted["method"],
             index=ranks_per_metric_sorted["method"],
         )
+        print("Finished permutation test.")
 
     def get_rankings_and_pvals(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
         """
@@ -329,6 +375,7 @@ def get_rankings_and_pvals(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
                 - The final rankings of the methods.
                 - The p-values from the permutation test.
         """
+        print("Retrieving rankings and p-values.")
         ranks_df = self.ranks_per_metric.sort_values(by="final_rank").reset_index(
             drop=True
         )
@@ -339,5 +386,5 @@ def get_rankings_and_pvals(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
             columns=self.pvals_df.columns,
             index=self.pvals_df.index,
         ).reindex(index=ranks_df["method"], columns=ranks_df["method"])
-
+        print("Successfully retrieved rankings and p-values.")
         return ranks_df, pvals_upper_df

From 80ab2be1a52ae835dc315b10ee9291bd64893b12 Mon Sep 17 00:00:00 2001
From: Siddhesh Thakur <sid.cre8er@gmail.com>
Date: Thu, 4 Sep 2025 21:21:47 -0400
Subject: [PATCH 11/11] code working as expected

---
 pyranker/ranker.py | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/pyranker/ranker.py b/pyranker/ranker.py
index 4d383af..9d7817e 100644
--- a/pyranker/ranker.py
+++ b/pyranker/ranker.py
@@ -236,9 +236,9 @@ def rank_methods(self) -> None:
         Ranks the methods based on the metrics using a two-step aggregation process.
 
         First, it ranks the methods for each metric and subject combination.
-        Then, it calculates the average rank for each method across all metrics
-        for each subject. Finally, it sums up the per-subject average ranks to
-        get a cumulative rank for each method, which is then used to determine
+        Then, for each subject, it sums each method's ranks across all metrics to
+        get a per-subject cumulative rank. Finally, it sums these per-subject ranks
+        into a total cumulative rank for each method, which is then used to determine
         the final ranking.
         """
         print("Ranking methods...")
@@ -268,25 +268,25 @@ def rank_methods(self) -> None:
         verification_df.to_csv(verification_path, index=False)
         print(f"Saved detailed verification ranks to: {verification_path}")
 
-        # Create a dictionary to hold the per-subject average ranks
-        subject_avg_rank_data = {}
+        # Create a dictionary to hold the per-subject cumulative ranks
+        subject_cumulative_rank_data = {}
         subject_ids_sorted = sorted(list(self.all_subject_ids))
 
-        # Calculate the average rank for each method for each subject
+        # Calculate the cumulative rank for each method for each subject
         for subject in subject_ids_sorted:
             subject_cols = [
                 col for col in self.detailed_rank_columns if col.endswith(f"_{subject}")
             ]
             if subject_cols:
-                subject_avg_rank_data[f"{subject}_avg_rank"] = (
-                    ranks_per_metric_detailed[subject_cols].mean(axis=1)
+                subject_cumulative_rank_data[f"{subject}_cumulative_rank"] = (
+                    ranks_per_metric_detailed[subject_cols].sum(axis=1)
                 )
 
-        # Create a DataFrame from the dictionary of per-subject average ranks
-        per_subject_avg_ranks = pd.DataFrame(subject_avg_rank_data)
-        self.per_subject_avg_ranks = per_subject_avg_ranks
+        # Create a DataFrame from the dictionary of per-subject cumulative ranks
+        per_subject_cumulative_ranks = pd.DataFrame(subject_cumulative_rank_data)
+        self.per_subject_cumulative_ranks = per_subject_cumulative_ranks
         # Calculate the cumulative and final ranks
-        cumulative_rank_column = per_subject_avg_ranks.sum(axis=1)
+        cumulative_rank_column = per_subject_cumulative_ranks.sum(axis=1)
         final_rank_column = cumulative_rank_column.rank(
             method="average", ascending=True
         )
@@ -297,7 +297,7 @@ def rank_methods(self) -> None:
                 self.metrics_per_subject["method"],
                 final_rank_column.rename("final_rank"),
                 cumulative_rank_column.rename("cumulative_rank"),
-                per_subject_avg_ranks,
+                per_subject_cumulative_ranks,
             ],
             axis=1,
         )
@@ -319,7 +319,7 @@ def perform_permutation_test(self) -> None:
 
         # Select only the detailed rank columns for the test
         ranks_per_metric_sanitized = ranks_per_metric_sorted[
-            self.per_subject_avg_ranks.columns
+            self.per_subject_cumulative_ranks.columns
         ]
 
         # Use a process pool to parallelize the p-value calculations