From f4e60ed071b3810a6be2d1a90131ceaa0284a299 Mon Sep 17 00:00:00 2001
From: ArtPoon <artpoon@gmail.com>
Date: Tue, 16 Jul 2024 10:13:46 -0400
Subject: [PATCH 1/3] fixing log power calc does not resolve #93

---
 openrdp/common.py | 4 ++++
 openrdp/rdp.py    | 9 ++++++---
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/openrdp/common.py b/openrdp/common.py
index 12b5d7c..731b420 100644
--- a/openrdp/common.py
+++ b/openrdp/common.py
@@ -54,6 +54,7 @@ def merge_breakpoints(raw_results, max_pvalue=100):
 
     return results
 
+
 def read_fasta(handle):
     """
     Converts a FASTA formatted file to a tuple containing a list of headers and sequences
@@ -241,6 +242,9 @@ def next(self):
 
 class Triplet:
     def __init__(self, seqs, names, idxs=None):
+        """
+        :param seqs:  numpy.array
+        """
         self.sequences = seqs
         self.names = names
         self.idxs = idxs
diff --git a/openrdp/rdp.py b/openrdp/rdp.py
index f65df11..f228287 100644
--- a/openrdp/rdp.py
+++ b/openrdp/rdp.py
@@ -104,9 +104,10 @@ def pairwise_identity(reg_ab, reg_bc, reg_ac):
     def execute(self, triplet):
         """
         Performs RDP detection method for one triplet of sequences
+        :param triplet:  object of class Triplet
         :return: the coordinates of the potential recombinant region and the p_value
         """
-        G = sum(1 for _ in combinations(range(self.align.shape[0]), 3)) # Number of triplets
+        G = sum(1 for _ in combinations(range(self.align.shape[0]), 3))  # Number of triplets
 
         # Get the three pairs of sequences
         ab = np.array([triplet.info_sites_align[0], triplet.info_sites_align[1]])
@@ -164,14 +165,16 @@ def execute(self, triplet):
                             id_in_seq += 1
                     p = id_in_seq / triplet.sequences.shape[1]
 
-                    # Calculate p_value
+                    # Calculate p_value as binomial probability
                     val = 0
                     log_n_fact = np.sum(np.log(np.arange(1, n+1)))  # Convert to log space to prevent integer overflow
                     for i in range(m, n):
                         log_i_fact = np.sum(np.log(np.arange(1, i+1)))
                         log_ni_fact = np.sum(np.log(np.arange(1, n-i+1)))
                         with np.errstate(divide='ignore'):  # Ignore floating point error
-                            val += np.math.exp((log_n_fact - (log_i_fact + log_ni_fact)) + np.log(p**n) + np.log((1-p)**(n-i)))
+                            val += np.math.exp(
+                                (log_n_fact - (log_i_fact + log_ni_fact)) + n * np.log(p) + (n-i) * np.log(1-p)
+                            )
 
                     uncorr_pvalue = (len_trp / n) * val
                     corr_p_value = G * uncorr_pvalue

From fcc54abb269056a508e69b020778d4215b0bfbac Mon Sep 17 00:00:00 2001
From: ArtPoon <artpoon@gmail.com>
Date: Tue, 23 Jul 2024 14:14:01 -0400
Subject: [PATCH 2/3] - starting work on #100 - completing some docstrings -
 identified some unused functions / variables - pairwise_identity now returns
 binary vectors of identity instead of   percentages - this makes later steps
 more efficient

---
 openrdp/common.py | 12 +++++---
 openrdp/rdp.py    | 78 +++++++++++++++++++++++++----------------------
 2 files changed, 48 insertions(+), 42 deletions(-)

diff --git a/openrdp/common.py b/openrdp/common.py
index 731b420..8a33c5c 100644
--- a/openrdp/common.py
+++ b/openrdp/common.py
@@ -1,11 +1,11 @@
 from itertools import combinations, product
-
 import numpy as np
 from scipy.stats import chi2_contingency
 from scipy.stats import pearsonr
 import copy
 import sys
 
+
 def merge_breakpoints(raw_results, max_pvalue=100):
     """
     took from siscan
@@ -267,21 +267,23 @@ def get_trp_names(self):
 
     def remove_uninformative_sites(self):
         """
-        Remove sites that are all the same or all different
+        Remove sites that are all the same or all different, leaving 
+        phylogenetically-informative sites.
+        :return:  numpy.Array, alignment containing only informative sites
+                  list, integer indices of informative sites in original alignment
+                  list, integer indices of non-informative sites in original alignment
         """
         infor_sites = []
         uninfor_sites = []
-        # Find positions of sites that are all the same sites or all sites that are different
         for i in range(self.sequences.shape[1]):
             col = self.sequences[:, i]
             if np.unique(col).shape[0] == 2:
-                infor_sites.append(i)
+                infor_sites.append(i)  # store index
             else:
                 uninfor_sites.append(i)
 
         # Build "new alignment"
         new_aln = self.sequences[:, infor_sites]
-
         return new_aln, infor_sites, uninfor_sites
 
     def remove_monomorphic_sites(self):
diff --git a/openrdp/rdp.py b/openrdp/rdp.py
index f228287..31e2dd4 100644
--- a/openrdp/rdp.py
+++ b/openrdp/rdp.py
@@ -9,11 +9,22 @@ class RdpMethod:
     Executes RDP method
     """
     def __init__(self, align, win_size=30, reference=None, min_id=0, max_id=100,
-                 settings=None, ref_align=None, verbose=False, max_pavlues = 100):
+                 settings=None, ref_align=None, verbose=False, max_pvalues = 100):
+        """
+        @param align:  Numpy array of sequences as lists
+        @param win_size:  window length
+        @param reference:  str, user-specified sequence to fix in triplets (NOT IMPLEMENTED)
+        @param min_id:  int, minimum sequence identity as a percentage (1-100)
+                        FIXME: why not float?
+        @param max_id:  int, maximum sequence identity as a percentage (1-100)
+        @param settings:  dict, optionally initialize member variables
+        @param ref_align:  unused
+        @param verbose:  unused
+        @param max_pvalues:  unused
+        """
         if settings:
             self.set_options_from_config(settings)
             self.validate_options()
-
         else:
             self.win_size = win_size
             self.reference = reference
@@ -21,7 +32,7 @@ def __init__(self, align, win_size=30, reference=None, min_id=0, max_id=100,
             self.max_id = max_id
 
         # TODO check for valid p-val since no innate max_pvalue, output has given values greater than 31 before
-        self.max_pvalues = max_pavlues
+        self.max_pvalues = max_pvalues
         self.align = align
         self.raw_results = []
         self.results = []
@@ -59,9 +70,10 @@ def validate_options(self):
 
     def triplet_identity(self, triplets):
         """
-        Calculate the percent identity of each triplet and
+        Calculate the percent identity of each triplet and validate (UNUSED)
         :param triplets: a list of all triplets
-        :return: triplets whose identity is greater than the minimum identity and less than the maximum identity
+        :return: triplets whose identity is greater than the minimum identity and less than the 
+                 maximum identity
         """
         trps = []
         for trp in triplets:
@@ -80,26 +92,19 @@ def triplet_identity(self, triplets):
     def pairwise_identity(reg_ab, reg_bc, reg_ac):
         """
         Calculate the pairwise identity of each sequence within the triplet
-        :param reg_ab: matrix of size 2 x sequence_length that contains sequences A and B
-        :param reg_bc: matrix of size 2 x sequence_length that contains sequences B and C
-        :param reg_ac: matrix of size 2 x sequence_length that contains sequences A and C
-        :return: the identity of each pair of sequences in the triplet
-        """
-        a_b, b_c, a_c = 0, 0, 0
 
-        for j in range(reg_ab.shape[1]):
-            if reg_ab[0, j] == reg_ab[1, j]:
-                a_b += 1
-            if reg_bc[0, j] == reg_bc[1, j]:
-                b_c += 1
-            if reg_ac[0, j] == reg_ac[1, j]:
-                a_c += 1
-
-        percent_identity_ab = a_b / reg_ab.shape[1] * 100
-        percent_identity_bc = b_c / reg_bc.shape[1] * 100
-        percent_identity_ac = a_c / reg_ac.shape[1] * 100
-
-        return percent_identity_ab, percent_identity_bc, percent_identity_ac
+        :param reg_ab:  matrix of size 2 x sequence_length that contains sequences A and B
+        :param reg_bc:  matrix of size 2 x sequence_length that contains sequences B and C
+        :param reg_ac:  matrix of size 2 x sequence_length that contains sequences A and C
+        :return:  list, binary vector of identity between sequences A and B
+                  list, binary vector of identity between sequences B and C
+                  list, binary vector of identity between sequences A and C
+        """
+        # FIXME: all sequences should probably have the same length...
+        a_b = [int(reg_ab[0, j] == reg_ab[1, j]) for i in range(reg_ab.shape[1])]
+        b_c = [int(reg_bc[0, j] == reg_bc[1, j]) for i in range(reg_bc.shape[1])]
+        a_c = [int(reg_ac[0, j] == reg_ac[1, j]) for i in range(reg_ac.shape[1])]
+        return a_b, b_c, a_c
 
     def execute(self, triplet):
         """
@@ -109,36 +114,35 @@ def execute(self, triplet):
         """
         G = sum(1 for _ in combinations(range(self.align.shape[0]), 3))  # Number of triplets
 
-        # Get the three pairs of sequences
+        # Get the three pairs of sequences from alignment of phylogenetically informative sites
         ab = np.array([triplet.info_sites_align[0], triplet.info_sites_align[1]])
         bc = np.array([triplet.info_sites_align[1], triplet.info_sites_align[2]])
         ac = np.array([triplet.info_sites_align[0], triplet.info_sites_align[2]])
 
         ab_id, bc_id, ac_id = self.pairwise_identity(ab, bc, ac)
+        percent_ident = [sum(pair) / len(pair) * 100 for pair in [ab_id, bc_id, ac_id]]
 
         # Include only triplets whose percent identity is in the valid range
-        if self.min_id < ab_id < self.max_id and self.min_id < bc_id < self.max_id and self.min_id < ac_id < self.max_id:
-            len_trp = triplet.info_sites_align.shape[1]
+        if min(percent_ident) > self.min_id and max(percent_ident) < self.max_id:
+            len_trp = triplet.info_sites_align.shape[1]  # sequence length
 
             # 2. Sliding window over subsequence and calculate average percent identity at each position
             recombinant_regions = ''  # Recombinant regions denoted by ones
             coord = []
             for i in range(len_trp - self.win_size):
-                reg_ab = ab[:, i: self.win_size + i]
-                reg_bc = bc[:, i: self.win_size + i]
-                reg_ac = ac[:, i: self.win_size + i]
-
                 # Calculate percent identity in each window
-                percent_identity_ab, percent_identity_bc, percent_identity_ac = self.pairwise_identity(reg_ab, reg_bc, reg_ac)
+                pid_ab = sum(ab_id[:, i: self.win_size + i]) / self.win_size * 100
+                pid_bc = sum(bc_id[:, i: self.win_size + i]) / self.win_size * 100
+                pid_ac = sum(ac_id[:, i: self.win_size + i]) / self.win_size * 100
 
-                # Identify recombinant regions
-                if percent_identity_ac > percent_identity_ab or percent_identity_bc > percent_identity_ab:
+                # Identify potential recombinant regions
+                if pid_ac > pid_ab or pid_bc > pid_ab:
                     recombinant_regions += "1"
-                    coord.append(i)
+                    coord.append(i)  # FIXME: UNUSED
                 else:
                     recombinant_regions += "0"
 
-            # 3. Record significance of events
+            # 3. locate runs of 
             recomb_idx = [(m.span()) for m in re.finditer('1+', recombinant_regions)]
 
             # Convert coordinates from  window-level to alignment-level and record number of windows
@@ -186,4 +190,4 @@ def execute(self, triplet):
                     rec_name, parents = identify_recombinant(triplet, coord)
                     self.raw_results.append((rec_name, parents, *coord, corr_p_value))
 
-            return
+            return  # FIXME: why is this still here?

From a3c8142998ee25e8e68f18f991156030d558bedb Mon Sep 17 00:00:00 2001
From: ArtPoon <artpoon@gmail.com>
Date: Tue, 23 Jul 2024 16:58:03 -0400
Subject: [PATCH 3/3] putative fix for #93

---
 openrdp/common.py |   5 +-
 openrdp/rdp.py    | 125 +++++++++++++++++++++-------------------------
 2 files changed, 62 insertions(+), 68 deletions(-)

diff --git a/openrdp/common.py b/openrdp/common.py
index 8a33c5c..e062d94 100644
--- a/openrdp/common.py
+++ b/openrdp/common.py
@@ -154,7 +154,10 @@ def identify_recombinant(trp, aln_pos):
     """
     Find the most likely recombinant sequence using the PhPr method described in the RDP5 documentation
     and Weiler GF (1998) Phylogenetic profiles: A graphical method for detecting genetic recombinations
-    in homologous sequences. Mol Biol Evol 15: 326–335
+    in homologous sequences. Mol Biol Evol 15: 326-335
+    
+    :param trp:  object of class Triplet
+    :param aln_pos:  tuple, 
     :return: name of the recombinant sequence and the names of the parental sequences
     """
     upstream_dists = []
diff --git a/openrdp/rdp.py b/openrdp/rdp.py
index 31e2dd4..41b33d8 100644
--- a/openrdp/rdp.py
+++ b/openrdp/rdp.py
@@ -2,6 +2,8 @@
 from .common import identify_recombinant
 import numpy as np
 from itertools import combinations
+from scipy.stats import binom
+import math
 
 
 class RdpMethod:
@@ -52,6 +54,7 @@ def validate_options(self):
         """
         Check if the options from the config file are valid
         If the options are invalid, the default value will be used instead
+        FIXME: I think this should return a True/False value
         """
         if self.reference == 'None':
             self.reference = None
@@ -101,18 +104,21 @@ def pairwise_identity(reg_ab, reg_bc, reg_ac):
                   list, binary vector of identity between sequences A and C
         """
         # FIXME: all sequences should probably have the same length...
-        a_b = [int(reg_ab[0, j] == reg_ab[1, j]) for i in range(reg_ab.shape[1])]
-        b_c = [int(reg_bc[0, j] == reg_bc[1, j]) for i in range(reg_bc.shape[1])]
-        a_c = [int(reg_ac[0, j] == reg_ac[1, j]) for i in range(reg_ac.shape[1])]
+        a_b = [int(reg_ab[0, i] == reg_ab[1, i]) for i in range(reg_ab.shape[1])]
+        b_c = [int(reg_bc[0, i] == reg_bc[1, i]) for i in range(reg_bc.shape[1])]
+        a_c = [int(reg_ac[0, i] == reg_ac[1, i]) for i in range(reg_ac.shape[1])]
         return a_b, b_c, a_c
-
+        
+        
     def execute(self, triplet):
         """
         Performs RDP detection method for one triplet of sequences
         :param triplet:  object of class Triplet
         :return: the coordinates of the potential recombinant region and the p_value
         """
-        G = sum(1 for _ in combinations(range(self.align.shape[0]), 3))  # Number of triplets
+        
+        # total number of triplets - to adjust for multiple comparisons?
+        G = sum(1 for _ in combinations(range(self.align.shape[0]), 3))
 
         # Get the three pairs of sequences from alignment of phylogenetically informative sites
         ab = np.array([triplet.info_sites_align[0], triplet.info_sites_align[1]])
@@ -127,67 +133,52 @@ def execute(self, triplet):
             len_trp = triplet.info_sites_align.shape[1]  # sequence length
 
             # 2. Sliding window over subsequence and calculate average percent identity at each position
-            recombinant_regions = ''  # Recombinant regions denoted by ones
-            coord = []
+            recombinant_regions = ''
             for i in range(len_trp - self.win_size):
                 # Calculate percent identity in each window
-                pid_ab = sum(ab_id[:, i: self.win_size + i]) / self.win_size * 100
-                pid_bc = sum(bc_id[:, i: self.win_size + i]) / self.win_size * 100
-                pid_ac = sum(ac_id[:, i: self.win_size + i]) / self.win_size * 100
-
-                # Identify potential recombinant regions
-                if pid_ac > pid_ab or pid_bc > pid_ab:
-                    recombinant_regions += "1"
-                    coord.append(i)  # FIXME: UNUSED
-                else:
-                    recombinant_regions += "0"
-
-            # 3. locate runs of 
-            recomb_idx = [(m.span()) for m in re.finditer('1+', recombinant_regions)]
-
-            # Convert coordinates from  window-level to alignment-level and record number of windows
-            coords = []
-            for x, y in recomb_idx:
-                coords.append((triplet.info_sites[x], triplet.info_sites[y - 1]))
-
-            for coord in coords:
-                n = coord[1] - coord[0]     # Length of putative recombinant region
-
-                if n > 0:
-                    # m is the proportion of nts in common between either A or B and C in the recombinant region
-                    nts_in_a = triplet.sequences[0][coord[0]: coord[1]]
-                    nts_in_c = triplet.sequences[2][coord[0]: coord[1]]
-                    m = 0
-                    for i in range(n):
-                        if nts_in_a[i] == nts_in_c[i]:
-                            m += 1
-
-                    # p is the proportion of nts in common between either A or B and C in the entire subsequence
-                    id_in_seq = 0
-                    for j in range(triplet.sequences.shape[1]):
-                        if triplet.sequences[0][j] == triplet.sequences[2][j]:
-                            id_in_seq += 1
-                    p = id_in_seq / triplet.sequences.shape[1]
-
-                    # Calculate p_value as binomial probability
-                    val = 0
-                    log_n_fact = np.sum(np.log(np.arange(1, n+1)))  # Convert to log space to prevent integer overflow
-                    for i in range(m, n):
-                        log_i_fact = np.sum(np.log(np.arange(1, i+1)))
-                        log_ni_fact = np.sum(np.log(np.arange(1, n-i+1)))
-                        with np.errstate(divide='ignore'):  # Ignore floating point error
-                            val += np.math.exp(
-                                (log_n_fact - (log_i_fact + log_ni_fact)) + n * np.log(p) + (n-i) * np.log(1-p)
-                            )
-
-                    uncorr_pvalue = (len_trp / n) * val
-                    corr_p_value = G * uncorr_pvalue
-
-                else:
-                    corr_p_value = 'NS'
-
-                if corr_p_value != 'NS' and corr_p_value != 0.0:
-                    rec_name, parents = identify_recombinant(triplet, coord)
-                    self.raw_results.append((rec_name, parents, *coord, corr_p_value))
-
-            return  # FIXME: why is this still here?
+                try:
+                    pid_ab = sum(ab_id[i:(self.win_size+i)]) / self.win_size * 100
+                except TypeError:
+                    print(i)
+                    print(self.win_size)
+                    raise
+                pid_bc = sum(bc_id[i:(self.win_size+i)]) / self.win_size * 100
+                pid_ac = sum(ac_id[i:(self.win_size+i)]) / self.win_size * 100
+
+                # Potential recombinant regions where % ident of A-C or B-C is higher than A-B
+                recombinant_regions += "1" if pid_ac > pid_ab or pid_bc > pid_ab else "0"
+
+            # 3. locate runs of 1's
+            recomb_idx = [m.span() for m in re.finditer('1+', recombinant_regions)]
+
+            # Convert coordinates from window-level to alignment-level and record number of windows
+            coords = [(triplet.info_sites[x], triplet.info_sites[y-1]) for x, y in recomb_idx]
+            
+            for left, right in coords:
+                N = right - left  # length of putative recombinant region
+                if N <= 0:
+                    continue
+                
+                # retrieve full sequences
+                seq_a = triplet.sequences[0]
+                seq_b = triplet.sequences[1]
+                seq_c = triplet.sequences[2]
+                
+                M = 0  # number of nts in common between (A or B) and C in this region
+                p = 0  # proportion of nts in common between (A or B) and C in the entire sequence
+                L = len(seq_a)  # length of the entire sequence
+                for i in range(N):
+                    if seq_a[i]==seq_c[i] or seq_b[i]==seq_c[i]:
+                        p += 1
+                        if i >= left and i < right:
+                            M += 1  # within putative recombinant region
+                p /= L
+                
+                # Calculate p_value as binomial survival function (1-CDF)
+                pvalue = math.exp(math.log(G) + math.log(L) - math.log(N) + 
+                                  binom.logsf(M-1, n=N, p=p))  # RDP sums from M to N inclusive
+
+                if pvalue != 0.0:
+                    # FIXME: what's wrong with P=0?
+                    rec_name, parents = identify_recombinant(triplet, [left, right])
+                    self.raw_results.append((rec_name, parents, left, right, pvalue))