diff --git a/openrdp/common.py b/openrdp/common.py index 12b5d7c..e062d94 100644 --- a/openrdp/common.py +++ b/openrdp/common.py @@ -1,11 +1,11 @@ from itertools import combinations, product - import numpy as np from scipy.stats import chi2_contingency from scipy.stats import pearsonr import copy import sys + def merge_breakpoints(raw_results, max_pvalue=100): """ took from siscan @@ -54,6 +54,7 @@ def merge_breakpoints(raw_results, max_pvalue=100): return results + def read_fasta(handle): """ Converts a FASTA formatted file to a tuple containing a list of headers and sequences @@ -153,7 +154,10 @@ def identify_recombinant(trp, aln_pos): """ Find the most likely recombinant sequence using the PhPr method described in the RDP5 documentation and Weiler GF (1998) Phylogenetic profiles: A graphical method for detecting genetic recombinations - in homologous sequences. Mol Biol Evol 15: 326–335 + in homologous sequences. Mol Biol Evol 15: 326-335 + + :param trp: object of class Triplet + :param aln_pos: tuple, :return: name of the recombinant sequence and the names of the parental sequences """ upstream_dists = [] @@ -241,6 +245,9 @@ def next(self): class Triplet: def __init__(self, seqs, names, idxs=None): + """ + :param seqs: numpy.array + """ self.sequences = seqs self.names = names self.idxs = idxs @@ -263,21 +270,23 @@ def get_trp_names(self): def remove_uninformative_sites(self): """ - Remove sites that are all the same or all different + Remove sites that are all the same or all different, leaving + phylogenetically-informative sites. + :return: numpy.Array, alignment containing only informative sites + list, integer indices of informative sites in original alignment + list, integer indices of non-informative sites in original alignment """ infor_sites = [] uninfor_sites = [] - # Find positions of sites that are all the same sites or all sites that are different for i in range(self.sequences.shape[1]): col = self.sequences[:, i] if np.unique(col).shape[0] == 2: - infor_sites.append(i) + infor_sites.append(i) # store index else: uninfor_sites.append(i) # Build "new alignment" new_aln = self.sequences[:, infor_sites] - return new_aln, infor_sites, uninfor_sites def remove_monomorphic_sites(self): diff --git a/openrdp/rdp.py b/openrdp/rdp.py index f65df11..41b33d8 100644 --- a/openrdp/rdp.py +++ b/openrdp/rdp.py @@ -2,6 +2,8 @@ from .common import identify_recombinant import numpy as np from itertools import combinations +from scipy.stats import binom +import math class RdpMethod: @@ -9,11 +11,22 @@ class RdpMethod: Executes RDP method """ def __init__(self, align, win_size=30, reference=None, min_id=0, max_id=100, - settings=None, ref_align=None, verbose=False, max_pavlues = 100): + settings=None, ref_align=None, verbose=False, max_pvalues = 100): + """ + @param align: Numpy array of sequences as lists + @param win_size: window length + @param reference: str, user-specified sequence to fix in triplets (NOT IMPLEMENTED) + @param min_id: int, minimum sequence identity as a percentage (1-100) + FIXME: why not float? + @param max_id: int, maximum sequence identity as a percentage (1-100) + @param settings: dict, optionally initialize member variables + @param ref_align: unused + @param verbose: unused + @param max_pvalues: unused + """ if settings: self.set_options_from_config(settings) self.validate_options() - else: self.win_size = win_size self.reference = reference @@ -21,7 +34,7 @@ def __init__(self, align, win_size=30, reference=None, min_id=0, max_id=100, self.max_id = max_id # TODO check for valid p-val since no innate max_pvalue, output has given values greater than 31 before - self.max_pvalues = max_pavlues + self.max_pvalues = max_pvalues self.align = align self.raw_results = [] self.results = [] @@ -41,6 +54,7 @@ def validate_options(self): """ Check if the options from the config file are valid If the options are invalid, the default value will be used instead + FIXME: I think this should return a True/False value """ if self.reference == 'None': self.reference = None @@ -59,9 +73,10 @@ def validate_options(self): def triplet_identity(self, triplets): """ - Calculate the percent identity of each triplet and + Calculate the percent identity of each triplet and validate (UNUSED) :param triplets: a list of all triplets - :return: triplets whose identity is greater than the minimum identity and less than the maximum identity + :return: triplets whose identity is greater than the minimum identity and less than the + maximum identity """ trps = [] for trp in triplets: @@ -80,107 +95,90 @@ def triplet_identity(self, triplets): def pairwise_identity(reg_ab, reg_bc, reg_ac): """ Calculate the pairwise identity of each sequence within the triplet - :param reg_ab: matrix of size 2 x sequence_length that contains sequences A and B - :param reg_bc: matrix of size 2 x sequence_length that contains sequences B and C - :param reg_ac: matrix of size 2 x sequence_length that contains sequences A and C - :return: the identity of each pair of sequences in the triplet - """ - a_b, b_c, a_c = 0, 0, 0 - - for j in range(reg_ab.shape[1]): - if reg_ab[0, j] == reg_ab[1, j]: - a_b += 1 - if reg_bc[0, j] == reg_bc[1, j]: - b_c += 1 - if reg_ac[0, j] == reg_ac[1, j]: - a_c += 1 - - percent_identity_ab = a_b / reg_ab.shape[1] * 100 - percent_identity_bc = b_c / reg_bc.shape[1] * 100 - percent_identity_ac = a_c / reg_ac.shape[1] * 100 - - return percent_identity_ab, percent_identity_bc, percent_identity_ac + :param reg_ab: matrix of size 2 x sequence_length that contains sequences A and B + :param reg_bc: matrix of size 2 x sequence_length that contains sequences B and C + :param reg_ac: matrix of size 2 x sequence_length that contains sequences A and C + :return: list, binary vector of identity between sequences A and B + list, binary vector of identity between sequences B and C + list, binary vector of identity between sequences A and C + """ + # FIXME: all sequences should probably have the same length... + a_b = [int(reg_ab[0, i] == reg_ab[1, i]) for i in range(reg_ab.shape[1])] + b_c = [int(reg_bc[0, i] == reg_bc[1, i]) for i in range(reg_bc.shape[1])] + a_c = [int(reg_ac[0, i] == reg_ac[1, i]) for i in range(reg_ac.shape[1])] + return a_b, b_c, a_c + + def execute(self, triplet): """ Performs RDP detection method for one triplet of sequences + :param triplet: object of class Triplet :return: the coordinates of the potential recombinant region and the p_value """ - G = sum(1 for _ in combinations(range(self.align.shape[0]), 3)) # Number of triplets + + # total number of triplets - to adjust for multiple comparisons? + G = sum(1 for _ in combinations(range(self.align.shape[0]), 3)) - # Get the three pairs of sequences + # Get the three pairs of sequences from alignment of phylogenetically informative sites ab = np.array([triplet.info_sites_align[0], triplet.info_sites_align[1]]) bc = np.array([triplet.info_sites_align[1], triplet.info_sites_align[2]]) ac = np.array([triplet.info_sites_align[0], triplet.info_sites_align[2]]) ab_id, bc_id, ac_id = self.pairwise_identity(ab, bc, ac) + percent_ident = [sum(pair) / len(pair) * 100 for pair in [ab_id, bc_id, ac_id]] # Include only triplets whose percent identity is in the valid range - if self.min_id < ab_id < self.max_id and self.min_id < bc_id < self.max_id and self.min_id < ac_id < self.max_id: - len_trp = triplet.info_sites_align.shape[1] + if min(percent_ident) > self.min_id and max(percent_ident) < self.max_id: + len_trp = triplet.info_sites_align.shape[1] # sequence length # 2. Sliding window over subsequence and calculate average percent identity at each position - recombinant_regions = '' # Recombinant regions denoted by ones - coord = [] + recombinant_regions = '' for i in range(len_trp - self.win_size): - reg_ab = ab[:, i: self.win_size + i] - reg_bc = bc[:, i: self.win_size + i] - reg_ac = ac[:, i: self.win_size + i] - # Calculate percent identity in each window - percent_identity_ab, percent_identity_bc, percent_identity_ac = self.pairwise_identity(reg_ab, reg_bc, reg_ac) - - # Identify recombinant regions - if percent_identity_ac > percent_identity_ab or percent_identity_bc > percent_identity_ab: - recombinant_regions += "1" - coord.append(i) - else: - recombinant_regions += "0" - - # 3. Record significance of events - recomb_idx = [(m.span()) for m in re.finditer('1+', recombinant_regions)] - - # Convert coordinates from window-level to alignment-level and record number of windows - coords = [] - for x, y in recomb_idx: - coords.append((triplet.info_sites[x], triplet.info_sites[y - 1])) - - for coord in coords: - n = coord[1] - coord[0] # Length of putative recombinant region - - if n > 0: - # m is the proportion of nts in common between either A or B and C in the recombinant region - nts_in_a = triplet.sequences[0][coord[0]: coord[1]] - nts_in_c = triplet.sequences[2][coord[0]: coord[1]] - m = 0 - for i in range(n): - if nts_in_a[i] == nts_in_c[i]: - m += 1 - - # p is the proportion of nts in common between either A or B and C in the entire subsequence - id_in_seq = 0 - for j in range(triplet.sequences.shape[1]): - if triplet.sequences[0][j] == triplet.sequences[2][j]: - id_in_seq += 1 - p = id_in_seq / triplet.sequences.shape[1] - - # Calculate p_value - val = 0 - log_n_fact = np.sum(np.log(np.arange(1, n+1))) # Convert to log space to prevent integer overflow - for i in range(m, n): - log_i_fact = np.sum(np.log(np.arange(1, i+1))) - log_ni_fact = np.sum(np.log(np.arange(1, n-i+1))) - with np.errstate(divide='ignore'): # Ignore floating point error - val += np.math.exp((log_n_fact - (log_i_fact + log_ni_fact)) + np.log(p**n) + np.log((1-p)**(n-i))) - - uncorr_pvalue = (len_trp / n) * val - corr_p_value = G * uncorr_pvalue - - else: - corr_p_value = 'NS' - - if corr_p_value != 'NS' and corr_p_value != 0.0: - rec_name, parents = identify_recombinant(triplet, coord) - self.raw_results.append((rec_name, parents, *coord, corr_p_value)) - - return + try: + pid_ab = sum(ab_id[i:(self.win_size+i)]) / self.win_size * 100 + except TypeError: + print(i) + print(self.win_size) + raise + pid_bc = sum(bc_id[i:(self.win_size+i)]) / self.win_size * 100 + pid_ac = sum(ac_id[i:(self.win_size+i)]) / self.win_size * 100 + + # Potential recombinant regions where % ident of A-C or B-C is higher than A-B + recombinant_regions += "1" if pid_ac > pid_ab or pid_bc > pid_ab else "0" + + # 3. locate runs of 1's + recomb_idx = [m.span() for m in re.finditer('1+', recombinant_regions)] + + # Convert coordinates from window-level to alignment-level and record number of windows + coords = [(triplet.info_sites[x], triplet.info_sites[y-1]) for x, y in recomb_idx] + + for left, right in coords: + N = right - left # length of putative recombinant region + if N <= 0: + continue + + # retrieve full sequences + seq_a = triplet.sequences[0] + seq_b = triplet.sequences[1] + seq_c = triplet.sequences[2] + + M = 0 # number of nts in common between (A or B) and C in this region + p = 0 # proportion of nts in common between (A or B) and C in the entire sequence + L = len(seq_a) # length of the entire sequence + for i in range(N): + if seq_a[i]==seq_c[i] or seq_b[i]==seq_c[i]: + p += 1 + if i >= left and i < right: + M += 1 # within putative recombinant region + p /= L + + # Calculate p_value as binomial survival function (1-CDF) + pvalue = math.exp(math.log(G) + math.log(L) - math.log(N) + + binom.logsf(M-1, n=N, p=p)) # RDP sums from M to N inclusive + + if pvalue != 0.0: + # FIXME: what's wrong with P=0? + rec_name, parents = identify_recombinant(triplet, [left, right]) + self.raw_results.append((rec_name, parents, left, right, pvalue))