From f4e60ed071b3810a6be2d1a90131ceaa0284a299 Mon Sep 17 00:00:00 2001 From: ArtPoon Date: Tue, 16 Jul 2024 10:13:46 -0400 Subject: [PATCH 1/3] fixing log power calc does not resolve #93 --- openrdp/common.py | 4 ++++ openrdp/rdp.py | 9 ++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/openrdp/common.py b/openrdp/common.py index 12b5d7c..731b420 100644 --- a/openrdp/common.py +++ b/openrdp/common.py @@ -54,6 +54,7 @@ def merge_breakpoints(raw_results, max_pvalue=100): return results + def read_fasta(handle): """ Converts a FASTA formatted file to a tuple containing a list of headers and sequences @@ -241,6 +242,9 @@ def next(self): class Triplet: def __init__(self, seqs, names, idxs=None): + """ + :param seqs: numpy.array + """ self.sequences = seqs self.names = names self.idxs = idxs diff --git a/openrdp/rdp.py b/openrdp/rdp.py index f65df11..f228287 100644 --- a/openrdp/rdp.py +++ b/openrdp/rdp.py @@ -104,9 +104,10 @@ def pairwise_identity(reg_ab, reg_bc, reg_ac): def execute(self, triplet): """ Performs RDP detection method for one triplet of sequences + :param triplet: object of class Triplet :return: the coordinates of the potential recombinant region and the p_value """ - G = sum(1 for _ in combinations(range(self.align.shape[0]), 3)) # Number of triplets + G = sum(1 for _ in combinations(range(self.align.shape[0]), 3)) # Number of triplets # Get the three pairs of sequences ab = np.array([triplet.info_sites_align[0], triplet.info_sites_align[1]]) @@ -164,14 +165,16 @@ def execute(self, triplet): id_in_seq += 1 p = id_in_seq / triplet.sequences.shape[1] - # Calculate p_value + # Calculate p_value as binomial probability val = 0 log_n_fact = np.sum(np.log(np.arange(1, n+1))) # Convert to log space to prevent integer overflow for i in range(m, n): log_i_fact = np.sum(np.log(np.arange(1, i+1))) log_ni_fact = np.sum(np.log(np.arange(1, n-i+1))) with np.errstate(divide='ignore'): # Ignore floating point error - val += np.math.exp((log_n_fact - (log_i_fact + log_ni_fact)) + np.log(p**n) + np.log((1-p)**(n-i))) + val += np.math.exp( + (log_n_fact - (log_i_fact + log_ni_fact)) + n * np.log(p) + (n-i) * np.log(1-p) + ) uncorr_pvalue = (len_trp / n) * val corr_p_value = G * uncorr_pvalue From fcc54abb269056a508e69b020778d4215b0bfbac Mon Sep 17 00:00:00 2001 From: ArtPoon Date: Tue, 23 Jul 2024 14:14:01 -0400 Subject: [PATCH 2/3] - starting work on #100 - completing some docstrings - identified some unused functions / variables - pairwise_identity now returns binary vectors of identity instead of percentages - this makes later steps more efficient --- openrdp/common.py | 12 +++++--- openrdp/rdp.py | 78 +++++++++++++++++++++++++---------------------- 2 files changed, 48 insertions(+), 42 deletions(-) diff --git a/openrdp/common.py b/openrdp/common.py index 731b420..8a33c5c 100644 --- a/openrdp/common.py +++ b/openrdp/common.py @@ -1,11 +1,11 @@ from itertools import combinations, product - import numpy as np from scipy.stats import chi2_contingency from scipy.stats import pearsonr import copy import sys + def merge_breakpoints(raw_results, max_pvalue=100): """ took from siscan @@ -267,21 +267,23 @@ def get_trp_names(self): def remove_uninformative_sites(self): """ - Remove sites that are all the same or all different + Remove sites that are all the same or all different, leaving + phylogenetically-informative sites. + :return: numpy.Array, alignment containing only informative sites + list, integer indices of informative sites in original alignment + list, integer indices of non-informative sites in original alignment """ infor_sites = [] uninfor_sites = [] - # Find positions of sites that are all the same sites or all sites that are different for i in range(self.sequences.shape[1]): col = self.sequences[:, i] if np.unique(col).shape[0] == 2: - infor_sites.append(i) + infor_sites.append(i) # store index else: uninfor_sites.append(i) # Build "new alignment" new_aln = self.sequences[:, infor_sites] - return new_aln, infor_sites, uninfor_sites def remove_monomorphic_sites(self): diff --git a/openrdp/rdp.py b/openrdp/rdp.py index f228287..31e2dd4 100644 --- a/openrdp/rdp.py +++ b/openrdp/rdp.py @@ -9,11 +9,22 @@ class RdpMethod: Executes RDP method """ def __init__(self, align, win_size=30, reference=None, min_id=0, max_id=100, - settings=None, ref_align=None, verbose=False, max_pavlues = 100): + settings=None, ref_align=None, verbose=False, max_pvalues = 100): + """ + @param align: Numpy array of sequences as lists + @param win_size: window length + @param reference: str, user-specified sequence to fix in triplets (NOT IMPLEMENTED) + @param min_id: int, minimum sequence identity as a percentage (1-100) + FIXME: why not float? + @param max_id: int, maximum sequence identity as a percentage (1-100) + @param settings: dict, optionally initialize member variables + @param ref_align: unused + @param verbose: unused + @param max_pvalues: unused + """ if settings: self.set_options_from_config(settings) self.validate_options() - else: self.win_size = win_size self.reference = reference @@ -21,7 +32,7 @@ def __init__(self, align, win_size=30, reference=None, min_id=0, max_id=100, self.max_id = max_id # TODO check for valid p-val since no innate max_pvalue, output has given values greater than 31 before - self.max_pvalues = max_pavlues + self.max_pvalues = max_pvalues self.align = align self.raw_results = [] self.results = [] @@ -59,9 +70,10 @@ def validate_options(self): def triplet_identity(self, triplets): """ - Calculate the percent identity of each triplet and + Calculate the percent identity of each triplet and validate (UNUSED) :param triplets: a list of all triplets - :return: triplets whose identity is greater than the minimum identity and less than the maximum identity + :return: triplets whose identity is greater than the minimum identity and less than the + maximum identity """ trps = [] for trp in triplets: @@ -80,26 +92,19 @@ def triplet_identity(self, triplets): def pairwise_identity(reg_ab, reg_bc, reg_ac): """ Calculate the pairwise identity of each sequence within the triplet - :param reg_ab: matrix of size 2 x sequence_length that contains sequences A and B - :param reg_bc: matrix of size 2 x sequence_length that contains sequences B and C - :param reg_ac: matrix of size 2 x sequence_length that contains sequences A and C - :return: the identity of each pair of sequences in the triplet - """ - a_b, b_c, a_c = 0, 0, 0 - for j in range(reg_ab.shape[1]): - if reg_ab[0, j] == reg_ab[1, j]: - a_b += 1 - if reg_bc[0, j] == reg_bc[1, j]: - b_c += 1 - if reg_ac[0, j] == reg_ac[1, j]: - a_c += 1 - - percent_identity_ab = a_b / reg_ab.shape[1] * 100 - percent_identity_bc = b_c / reg_bc.shape[1] * 100 - percent_identity_ac = a_c / reg_ac.shape[1] * 100 - - return percent_identity_ab, percent_identity_bc, percent_identity_ac + :param reg_ab: matrix of size 2 x sequence_length that contains sequences A and B + :param reg_bc: matrix of size 2 x sequence_length that contains sequences B and C + :param reg_ac: matrix of size 2 x sequence_length that contains sequences A and C + :return: list, binary vector of identity between sequences A and B + list, binary vector of identity between sequences B and C + list, binary vector of identity between sequences A and C + """ + # FIXME: all sequences should probably have the same length... + a_b = [int(reg_ab[0, j] == reg_ab[1, j]) for i in range(reg_ab.shape[1])] + b_c = [int(reg_bc[0, j] == reg_bc[1, j]) for i in range(reg_bc.shape[1])] + a_c = [int(reg_ac[0, j] == reg_ac[1, j]) for i in range(reg_ac.shape[1])] + return a_b, b_c, a_c def execute(self, triplet): """ @@ -109,36 +114,35 @@ def execute(self, triplet): """ G = sum(1 for _ in combinations(range(self.align.shape[0]), 3)) # Number of triplets - # Get the three pairs of sequences + # Get the three pairs of sequences from alignment of phylogenetically informative sites ab = np.array([triplet.info_sites_align[0], triplet.info_sites_align[1]]) bc = np.array([triplet.info_sites_align[1], triplet.info_sites_align[2]]) ac = np.array([triplet.info_sites_align[0], triplet.info_sites_align[2]]) ab_id, bc_id, ac_id = self.pairwise_identity(ab, bc, ac) + percent_ident = [sum(pair) / len(pair) * 100 for pair in [ab_id, bc_id, ac_id]] # Include only triplets whose percent identity is in the valid range - if self.min_id < ab_id < self.max_id and self.min_id < bc_id < self.max_id and self.min_id < ac_id < self.max_id: - len_trp = triplet.info_sites_align.shape[1] + if min(percent_ident) > self.min_id and max(percent_ident) < self.max_id: + len_trp = triplet.info_sites_align.shape[1] # sequence length # 2. Sliding window over subsequence and calculate average percent identity at each position recombinant_regions = '' # Recombinant regions denoted by ones coord = [] for i in range(len_trp - self.win_size): - reg_ab = ab[:, i: self.win_size + i] - reg_bc = bc[:, i: self.win_size + i] - reg_ac = ac[:, i: self.win_size + i] - # Calculate percent identity in each window - percent_identity_ab, percent_identity_bc, percent_identity_ac = self.pairwise_identity(reg_ab, reg_bc, reg_ac) + pid_ab = sum(ab_id[:, i: self.win_size + i]) / self.win_size * 100 + pid_bc = sum(bc_id[:, i: self.win_size + i]) / self.win_size * 100 + pid_ac = sum(ac_id[:, i: self.win_size + i]) / self.win_size * 100 - # Identify recombinant regions - if percent_identity_ac > percent_identity_ab or percent_identity_bc > percent_identity_ab: + # Identify potential recombinant regions + if pid_ac > pid_ab or pid_bc > pid_ab: recombinant_regions += "1" - coord.append(i) + coord.append(i) # FIXME: UNUSED else: recombinant_regions += "0" - # 3. Record significance of events + # 3. locate runs of recomb_idx = [(m.span()) for m in re.finditer('1+', recombinant_regions)] # Convert coordinates from window-level to alignment-level and record number of windows @@ -186,4 +190,4 @@ def execute(self, triplet): rec_name, parents = identify_recombinant(triplet, coord) self.raw_results.append((rec_name, parents, *coord, corr_p_value)) - return + return # FIXME: why is this still here? From a3c8142998ee25e8e68f18f991156030d558bedb Mon Sep 17 00:00:00 2001 From: ArtPoon Date: Tue, 23 Jul 2024 16:58:03 -0400 Subject: [PATCH 3/3] putative fix for #93 --- openrdp/common.py | 5 +- openrdp/rdp.py | 125 +++++++++++++++++++++------------------------- 2 files changed, 62 insertions(+), 68 deletions(-) diff --git a/openrdp/common.py b/openrdp/common.py index 8a33c5c..e062d94 100644 --- a/openrdp/common.py +++ b/openrdp/common.py @@ -154,7 +154,10 @@ def identify_recombinant(trp, aln_pos): """ Find the most likely recombinant sequence using the PhPr method described in the RDP5 documentation and Weiler GF (1998) Phylogenetic profiles: A graphical method for detecting genetic recombinations - in homologous sequences. Mol Biol Evol 15: 326–335 + in homologous sequences. Mol Biol Evol 15: 326-335 + + :param trp: object of class Triplet + :param aln_pos: tuple, :return: name of the recombinant sequence and the names of the parental sequences """ upstream_dists = [] diff --git a/openrdp/rdp.py b/openrdp/rdp.py index 31e2dd4..41b33d8 100644 --- a/openrdp/rdp.py +++ b/openrdp/rdp.py @@ -2,6 +2,8 @@ from .common import identify_recombinant import numpy as np from itertools import combinations +from scipy.stats import binom +import math class RdpMethod: @@ -52,6 +54,7 @@ def validate_options(self): """ Check if the options from the config file are valid If the options are invalid, the default value will be used instead + FIXME: I think this should return a True/False value """ if self.reference == 'None': self.reference = None @@ -101,18 +104,21 @@ def pairwise_identity(reg_ab, reg_bc, reg_ac): list, binary vector of identity between sequences A and C """ # FIXME: all sequences should probably have the same length... - a_b = [int(reg_ab[0, j] == reg_ab[1, j]) for i in range(reg_ab.shape[1])] - b_c = [int(reg_bc[0, j] == reg_bc[1, j]) for i in range(reg_bc.shape[1])] - a_c = [int(reg_ac[0, j] == reg_ac[1, j]) for i in range(reg_ac.shape[1])] + a_b = [int(reg_ab[0, i] == reg_ab[1, i]) for i in range(reg_ab.shape[1])] + b_c = [int(reg_bc[0, i] == reg_bc[1, i]) for i in range(reg_bc.shape[1])] + a_c = [int(reg_ac[0, i] == reg_ac[1, i]) for i in range(reg_ac.shape[1])] return a_b, b_c, a_c - + + def execute(self, triplet): """ Performs RDP detection method for one triplet of sequences :param triplet: object of class Triplet :return: the coordinates of the potential recombinant region and the p_value """ - G = sum(1 for _ in combinations(range(self.align.shape[0]), 3)) # Number of triplets + + # total number of triplets - to adjust for multiple comparisons? + G = sum(1 for _ in combinations(range(self.align.shape[0]), 3)) # Get the three pairs of sequences from alignment of phylogenetically informative sites ab = np.array([triplet.info_sites_align[0], triplet.info_sites_align[1]]) @@ -127,67 +133,52 @@ def execute(self, triplet): len_trp = triplet.info_sites_align.shape[1] # sequence length # 2. Sliding window over subsequence and calculate average percent identity at each position - recombinant_regions = '' # Recombinant regions denoted by ones - coord = [] + recombinant_regions = '' for i in range(len_trp - self.win_size): # Calculate percent identity in each window - pid_ab = sum(ab_id[:, i: self.win_size + i]) / self.win_size * 100 - pid_bc = sum(bc_id[:, i: self.win_size + i]) / self.win_size * 100 - pid_ac = sum(ac_id[:, i: self.win_size + i]) / self.win_size * 100 - - # Identify potential recombinant regions - if pid_ac > pid_ab or pid_bc > pid_ab: - recombinant_regions += "1" - coord.append(i) # FIXME: UNUSED - else: - recombinant_regions += "0" - - # 3. locate runs of - recomb_idx = [(m.span()) for m in re.finditer('1+', recombinant_regions)] - - # Convert coordinates from window-level to alignment-level and record number of windows - coords = [] - for x, y in recomb_idx: - coords.append((triplet.info_sites[x], triplet.info_sites[y - 1])) - - for coord in coords: - n = coord[1] - coord[0] # Length of putative recombinant region - - if n > 0: - # m is the proportion of nts in common between either A or B and C in the recombinant region - nts_in_a = triplet.sequences[0][coord[0]: coord[1]] - nts_in_c = triplet.sequences[2][coord[0]: coord[1]] - m = 0 - for i in range(n): - if nts_in_a[i] == nts_in_c[i]: - m += 1 - - # p is the proportion of nts in common between either A or B and C in the entire subsequence - id_in_seq = 0 - for j in range(triplet.sequences.shape[1]): - if triplet.sequences[0][j] == triplet.sequences[2][j]: - id_in_seq += 1 - p = id_in_seq / triplet.sequences.shape[1] - - # Calculate p_value as binomial probability - val = 0 - log_n_fact = np.sum(np.log(np.arange(1, n+1))) # Convert to log space to prevent integer overflow - for i in range(m, n): - log_i_fact = np.sum(np.log(np.arange(1, i+1))) - log_ni_fact = np.sum(np.log(np.arange(1, n-i+1))) - with np.errstate(divide='ignore'): # Ignore floating point error - val += np.math.exp( - (log_n_fact - (log_i_fact + log_ni_fact)) + n * np.log(p) + (n-i) * np.log(1-p) - ) - - uncorr_pvalue = (len_trp / n) * val - corr_p_value = G * uncorr_pvalue - - else: - corr_p_value = 'NS' - - if corr_p_value != 'NS' and corr_p_value != 0.0: - rec_name, parents = identify_recombinant(triplet, coord) - self.raw_results.append((rec_name, parents, *coord, corr_p_value)) - - return # FIXME: why is this still here? + try: + pid_ab = sum(ab_id[i:(self.win_size+i)]) / self.win_size * 100 + except TypeError: + print(i) + print(self.win_size) + raise + pid_bc = sum(bc_id[i:(self.win_size+i)]) / self.win_size * 100 + pid_ac = sum(ac_id[i:(self.win_size+i)]) / self.win_size * 100 + + # Potential recombinant regions where % ident of A-C or B-C is higher than A-B + recombinant_regions += "1" if pid_ac > pid_ab or pid_bc > pid_ab else "0" + + # 3. locate runs of 1's + recomb_idx = [m.span() for m in re.finditer('1+', recombinant_regions)] + + # Convert coordinates from window-level to alignment-level and record number of windows + coords = [(triplet.info_sites[x], triplet.info_sites[y-1]) for x, y in recomb_idx] + + for left, right in coords: + N = right - left # length of putative recombinant region + if N <= 0: + continue + + # retrieve full sequences + seq_a = triplet.sequences[0] + seq_b = triplet.sequences[1] + seq_c = triplet.sequences[2] + + M = 0 # number of nts in common between (A or B) and C in this region + p = 0 # proportion of nts in common between (A or B) and C in the entire sequence + L = len(seq_a) # length of the entire sequence + for i in range(N): + if seq_a[i]==seq_c[i] or seq_b[i]==seq_c[i]: + p += 1 + if i >= left and i < right: + M += 1 # within putative recombinant region + p /= L + + # Calculate p_value as binomial survival function (1-CDF) + pvalue = math.exp(math.log(G) + math.log(L) - math.log(N) + + binom.logsf(M-1, n=N, p=p)) # RDP sums from M to N inclusive + + if pvalue != 0.0: + # FIXME: what's wrong with P=0? + rec_name, parents = identify_recombinant(triplet, [left, right]) + self.raw_results.append((rec_name, parents, left, right, pvalue))