Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 15 additions & 6 deletions openrdp/common.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
from itertools import combinations, product

import numpy as np
from scipy.stats import chi2_contingency
from scipy.stats import pearsonr
import copy
import sys


def merge_breakpoints(raw_results, max_pvalue=100):
"""
took from siscan
Expand Down Expand Up @@ -54,6 +54,7 @@ def merge_breakpoints(raw_results, max_pvalue=100):

return results


def read_fasta(handle):
"""
Converts a FASTA formatted file to a tuple containing a list of headers and sequences
Expand Down Expand Up @@ -153,7 +154,10 @@ def identify_recombinant(trp, aln_pos):
"""
Find the most likely recombinant sequence using the PhPr method described in the RDP5 documentation
and Weiler GF (1998) Phylogenetic profiles: A graphical method for detecting genetic recombinations
in homologous sequences. Mol Biol Evol 15: 326–335
in homologous sequences. Mol Biol Evol 15: 326-335

:param trp: object of class Triplet
:param aln_pos: tuple,
:return: name of the recombinant sequence and the names of the parental sequences
"""
upstream_dists = []
Expand Down Expand Up @@ -241,6 +245,9 @@ def next(self):

class Triplet:
def __init__(self, seqs, names, idxs=None):
"""
:param seqs: numpy.array
"""
self.sequences = seqs
self.names = names
self.idxs = idxs
Expand All @@ -263,21 +270,23 @@ def get_trp_names(self):

def remove_uninformative_sites(self):
"""
Remove sites that are all the same or all different
Remove sites that are all the same or all different, leaving
phylogenetically-informative sites.
:return: numpy.Array, alignment containing only informative sites
list, integer indices of informative sites in original alignment
list, integer indices of non-informative sites in original alignment
"""
infor_sites = []
uninfor_sites = []
# Find positions of sites that are all the same sites or all sites that are different
for i in range(self.sequences.shape[1]):
col = self.sequences[:, i]
if np.unique(col).shape[0] == 2:
infor_sites.append(i)
infor_sites.append(i) # store index
else:
uninfor_sites.append(i)

# Build "new alignment"
new_aln = self.sequences[:, infor_sites]

return new_aln, infor_sites, uninfor_sites

def remove_monomorphic_sites(self):
Expand Down
180 changes: 89 additions & 91 deletions openrdp/rdp.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,26 +2,39 @@
from .common import identify_recombinant
import numpy as np
from itertools import combinations
from scipy.stats import binom
import math


class RdpMethod:
"""
Executes RDP method
"""
def __init__(self, align, win_size=30, reference=None, min_id=0, max_id=100,
settings=None, ref_align=None, verbose=False, max_pavlues = 100):
settings=None, ref_align=None, verbose=False, max_pvalues = 100):
"""
@param align: Numpy array of sequences as lists
@param win_size: window length
@param reference: str, user-specified sequence to fix in triplets (NOT IMPLEMENTED)
@param min_id: int, minimum sequence identity as a percentage (1-100)
FIXME: why not float?
@param max_id: int, maximum sequence identity as a percentage (1-100)
@param settings: dict, optionally initialize member variables
@param ref_align: unused
@param verbose: unused
@param max_pvalues: unused
"""
if settings:
self.set_options_from_config(settings)
self.validate_options()

else:
self.win_size = win_size
self.reference = reference
self.min_id = min_id
self.max_id = max_id

# TODO check for valid p-val since no innate max_pvalue, output has given values greater than 31 before
self.max_pvalues = max_pavlues
self.max_pvalues = max_pvalues
self.align = align
self.raw_results = []
self.results = []
Expand All @@ -41,6 +54,7 @@ def validate_options(self):
"""
Check if the options from the config file are valid
If the options are invalid, the default value will be used instead
FIXME: I think this should return a True/False value
"""
if self.reference == 'None':
self.reference = None
Expand All @@ -59,9 +73,10 @@ def validate_options(self):

def triplet_identity(self, triplets):
"""
Calculate the percent identity of each triplet and
Calculate the percent identity of each triplet and validate (UNUSED)
:param triplets: a list of all triplets
:return: triplets whose identity is greater than the minimum identity and less than the maximum identity
:return: triplets whose identity is greater than the minimum identity and less than the
maximum identity
"""
trps = []
for trp in triplets:
Expand All @@ -80,107 +95,90 @@ def triplet_identity(self, triplets):
def pairwise_identity(reg_ab, reg_bc, reg_ac):
"""
Calculate the pairwise identity of each sequence within the triplet
:param reg_ab: matrix of size 2 x sequence_length that contains sequences A and B
:param reg_bc: matrix of size 2 x sequence_length that contains sequences B and C
:param reg_ac: matrix of size 2 x sequence_length that contains sequences A and C
:return: the identity of each pair of sequences in the triplet
"""
a_b, b_c, a_c = 0, 0, 0

for j in range(reg_ab.shape[1]):
if reg_ab[0, j] == reg_ab[1, j]:
a_b += 1
if reg_bc[0, j] == reg_bc[1, j]:
b_c += 1
if reg_ac[0, j] == reg_ac[1, j]:
a_c += 1

percent_identity_ab = a_b / reg_ab.shape[1] * 100
percent_identity_bc = b_c / reg_bc.shape[1] * 100
percent_identity_ac = a_c / reg_ac.shape[1] * 100

return percent_identity_ab, percent_identity_bc, percent_identity_ac

:param reg_ab: matrix of size 2 x sequence_length that contains sequences A and B
:param reg_bc: matrix of size 2 x sequence_length that contains sequences B and C
:param reg_ac: matrix of size 2 x sequence_length that contains sequences A and C
:return: list, binary vector of identity between sequences A and B
list, binary vector of identity between sequences B and C
list, binary vector of identity between sequences A and C
"""
# FIXME: all sequences should probably have the same length...
a_b = [int(reg_ab[0, i] == reg_ab[1, i]) for i in range(reg_ab.shape[1])]
b_c = [int(reg_bc[0, i] == reg_bc[1, i]) for i in range(reg_bc.shape[1])]
a_c = [int(reg_ac[0, i] == reg_ac[1, i]) for i in range(reg_ac.shape[1])]
return a_b, b_c, a_c


def execute(self, triplet):
"""
Performs RDP detection method for one triplet of sequences
:param triplet: object of class Triplet
:return: the coordinates of the potential recombinant region and the p_value
"""
G = sum(1 for _ in combinations(range(self.align.shape[0]), 3)) # Number of triplets

# total number of triplets - to adjust for multiple comparisons?
G = sum(1 for _ in combinations(range(self.align.shape[0]), 3))

# Get the three pairs of sequences
# Get the three pairs of sequences from alignment of phylogenetically informative sites
ab = np.array([triplet.info_sites_align[0], triplet.info_sites_align[1]])
bc = np.array([triplet.info_sites_align[1], triplet.info_sites_align[2]])
ac = np.array([triplet.info_sites_align[0], triplet.info_sites_align[2]])

ab_id, bc_id, ac_id = self.pairwise_identity(ab, bc, ac)
percent_ident = [sum(pair) / len(pair) * 100 for pair in [ab_id, bc_id, ac_id]]

# Include only triplets whose percent identity is in the valid range
if self.min_id < ab_id < self.max_id and self.min_id < bc_id < self.max_id and self.min_id < ac_id < self.max_id:
len_trp = triplet.info_sites_align.shape[1]
if min(percent_ident) > self.min_id and max(percent_ident) < self.max_id:
len_trp = triplet.info_sites_align.shape[1] # sequence length

# 2. Sliding window over subsequence and calculate average percent identity at each position
recombinant_regions = '' # Recombinant regions denoted by ones
coord = []
recombinant_regions = ''
for i in range(len_trp - self.win_size):
reg_ab = ab[:, i: self.win_size + i]
reg_bc = bc[:, i: self.win_size + i]
reg_ac = ac[:, i: self.win_size + i]

# Calculate percent identity in each window
percent_identity_ab, percent_identity_bc, percent_identity_ac = self.pairwise_identity(reg_ab, reg_bc, reg_ac)

# Identify recombinant regions
if percent_identity_ac > percent_identity_ab or percent_identity_bc > percent_identity_ab:
recombinant_regions += "1"
coord.append(i)
else:
recombinant_regions += "0"

# 3. Record significance of events
recomb_idx = [(m.span()) for m in re.finditer('1+', recombinant_regions)]

# Convert coordinates from window-level to alignment-level and record number of windows
coords = []
for x, y in recomb_idx:
coords.append((triplet.info_sites[x], triplet.info_sites[y - 1]))

for coord in coords:
n = coord[1] - coord[0] # Length of putative recombinant region

if n > 0:
# m is the proportion of nts in common between either A or B and C in the recombinant region
nts_in_a = triplet.sequences[0][coord[0]: coord[1]]
nts_in_c = triplet.sequences[2][coord[0]: coord[1]]
m = 0
for i in range(n):
if nts_in_a[i] == nts_in_c[i]:
m += 1

# p is the proportion of nts in common between either A or B and C in the entire subsequence
id_in_seq = 0
for j in range(triplet.sequences.shape[1]):
if triplet.sequences[0][j] == triplet.sequences[2][j]:
id_in_seq += 1
p = id_in_seq / triplet.sequences.shape[1]

# Calculate p_value
val = 0
log_n_fact = np.sum(np.log(np.arange(1, n+1))) # Convert to log space to prevent integer overflow
for i in range(m, n):
log_i_fact = np.sum(np.log(np.arange(1, i+1)))
log_ni_fact = np.sum(np.log(np.arange(1, n-i+1)))
with np.errstate(divide='ignore'): # Ignore floating point error
val += np.math.exp((log_n_fact - (log_i_fact + log_ni_fact)) + np.log(p**n) + np.log((1-p)**(n-i)))

uncorr_pvalue = (len_trp / n) * val
corr_p_value = G * uncorr_pvalue

else:
corr_p_value = 'NS'

if corr_p_value != 'NS' and corr_p_value != 0.0:
rec_name, parents = identify_recombinant(triplet, coord)
self.raw_results.append((rec_name, parents, *coord, corr_p_value))

return
try:
pid_ab = sum(ab_id[i:(self.win_size+i)]) / self.win_size * 100
except TypeError:
print(i)
print(self.win_size)
raise
pid_bc = sum(bc_id[i:(self.win_size+i)]) / self.win_size * 100
pid_ac = sum(ac_id[i:(self.win_size+i)]) / self.win_size * 100

# Potential recombinant regions where % ident of A-C or B-C is higher than A-B
recombinant_regions += "1" if pid_ac > pid_ab or pid_bc > pid_ab else "0"

# 3. locate runs of 1's
recomb_idx = [m.span() for m in re.finditer('1+', recombinant_regions)]

# Convert coordinates from window-level to alignment-level and record number of windows
coords = [(triplet.info_sites[x], triplet.info_sites[y-1]) for x, y in recomb_idx]

for left, right in coords:
N = right - left # length of putative recombinant region
if N <= 0:
continue

# retrieve full sequences
seq_a = triplet.sequences[0]
seq_b = triplet.sequences[1]
seq_c = triplet.sequences[2]

M = 0 # number of nts in common between (A or B) and C in this region
p = 0 # proportion of nts in common between (A or B) and C in the entire sequence
L = len(seq_a) # length of the entire sequence
for i in range(N):
if seq_a[i]==seq_c[i] or seq_b[i]==seq_c[i]:
p += 1
if i >= left and i < right:
M += 1 # within putative recombinant region
p /= L

# Calculate p_value as binomial survival function (1-CDF)
pvalue = math.exp(math.log(G) + math.log(L) - math.log(N) +
binom.logsf(M-1, n=N, p=p)) # RDP sums from M to N inclusive

if pvalue != 0.0:
# FIXME: what's wrong with P=0?
rec_name, parents = identify_recombinant(triplet, [left, right])
self.raw_results.append((rec_name, parents, left, right, pvalue))