From 0a8601a7d8ceaa98e2801040158b2913e269a688 Mon Sep 17 00:00:00 2001 From: Yair Raz Date: Thu, 28 May 2020 15:46:46 +0300 Subject: [PATCH 1/8] changing requierment modules for python2.7 support. --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 569e62e..9b4d6d2 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ def readme(): return f.read() setup(name='RILseq', - version='0.74', + version='0.75', description='Processing RILSeq experiments results', long_description=readme(), classifiers=[ @@ -31,6 +31,6 @@ def readme(): license='MIT', packages=['RILseq'], install_requires=[ - 'scipy', 'numpy', 'pysam>=0.14.1', 'biopython'], + 'scipy', 'numpy', 'pysam>=0.14.1', 'biopython==1.76'], include_package_data=True, zip_safe=False) From 879ada99a986be86132c80d26a1afd23b5af3c36 Mon Sep 17 00:00:00 2001 From: Yair Raz Date: Tue, 28 Jul 2020 08:13:05 +0300 Subject: [PATCH 2/8] Resolving some off by 1 issues when writing the table and when generating the gff file. --- RILseq/__init__.py | 4 ++-- RILseq/ecocyc_parser.py | 4 ++-- setup.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/RILseq/__init__.py b/RILseq/__init__.py index 213177a..a80aeb8 100644 --- a/RILseq/__init__.py +++ b/RILseq/__init__.py @@ -719,13 +719,13 @@ def write_reads_table( read1_chrn = chrnames_bam[read1_reads[rname].reference_id] read2_chrn = chrnames_bam[read2_reads[rname].reference_id] if not read2_reads[rname].is_reverse: - end2_pos = read2_reads[rname].reference_start+read2_reads[rname].query_alignment_length-1 + end2_pos = read2_reads[rname].reference_start+read2_reads[rname].reference_length-1 end2_str = '+' else: end2_pos = read2_reads[rname].reference_start end2_str = '-' if read1_reads[rname].is_reverse: - end1_pos = read1_reads[rname].reference_start+read1_reads[rname].query_alignment_length-1 + end1_pos = read1_reads[rname].reference_start+read1_reads[rname].reference_length-1 end1_str = '-' else: end1_pos = read1_reads[rname].reference_start diff --git a/RILseq/ecocyc_parser.py b/RILseq/ecocyc_parser.py index 6da9d45..b5b2012 100644 --- a/RILseq/ecocyc_parser.py +++ b/RILseq/ecocyc_parser.py @@ -134,7 +134,7 @@ def generate_transcripts_file( if tu_str == '+': tu_boundaries[tu][0] = tu_promoters[tu] else: - tu_boundaries[tu][1] = tu_promoters[tu] + tu_boundaries[tu][1] = tu_promoters[tu] + 1 else: # Get first gene if tu_str == '+': @@ -145,7 +145,7 @@ def generate_transcripts_file( tu_boundaries[tu][1] = min(last_pos+utr_len, fsa_lens[tu_chrn]) if tu in tu_terminators: if tu_str == '+': - tu_boundaries[tu][1] = max(tu_terminators[tu])+1 + tu_boundaries[tu][1] = max(tu_terminators[tu]) + 1 else: tu_boundaries[tu][0] = min(tu_terminators[tu]) else: diff --git a/setup.py b/setup.py index 9b4d6d2..1447a1f 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ def readme(): return f.read() setup(name='RILseq', - version='0.75', + version='0.76', description='Processing RILSeq experiments results', long_description=readme(), classifiers=[ From c9a8e5313243e7a162eb64ddc15a53ad94fdfaea Mon Sep 17 00:00:00 2001 From: Yair Raz Date: Tue, 28 Jul 2020 08:42:48 +0300 Subject: [PATCH 3/8] Update version number --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 999c744..cf33a08 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ def readme(): return f.read() setup(name='RILseq', - version='0.77', + version='0.78', description='Processing RILSeq experiments results', long_description=readme(), classifiers=[ From c7cd143db3506578c8e4f17a8f5b6920dce6f0c3 Mon Sep 17 00:00:00 2001 From: Yair Raz Date: Wed, 30 Sep 2020 10:08:07 +0300 Subject: [PATCH 4/8] 1. modifying RNAup_cmd defaults to be without additional parameters (for compatability with newer version of RNAup) 2. checking if RNAup is exists 3. expanding the pading of p3_seq from both sides. --- RILseq/__init__.py | 9 ++------- bin/RILseq_significant_regions.py | 22 ++++++++++++++++++---- setup.py | 4 ++-- 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/RILseq/__init__.py b/RILseq/__init__.py index a80aeb8..0084963 100644 --- a/RILseq/__init__.py +++ b/RILseq/__init__.py @@ -1269,7 +1269,6 @@ def report_interactions( targets = read_targets(targets_file) singles = read_singles(single_counts) desc = read_annotations(refseq_dir) -# genes_dict = get_genes_dict(ec_dir, est_utr_lens) try: pos_maps, _ = ecocyc_parser.get_mapping(ec_dir, est_utr_lens, linear_chromosome_list=linear_chromosome_list) except IOError: @@ -1394,12 +1393,8 @@ def report_interactions( p5_seqs = get_seqs( r1_chrn, min1_pos-pad_seqs, max1_pos+pad_seqs, r1_str, fsa_seqs, shuffles=shuffles) - if r2_str == '+': - p3_seq = get_seqs( - r2_chrn, min2_pos-pad_seqs, max2_pos, r2_str, fsa_seqs) - else: - p3_seq = get_seqs( - r2_chrn, min2_pos, max2_pos+pad_seqs, r2_str, fsa_seqs) + p3_seq = get_seqs( + r2_chrn, min2_pos-pad_seqs, max2_pos+pad_seqs, r2_str, fsa_seqs) rnrgs = rnup.scoreall(p3_seq, p5_seqs) if shuffles>0: pv = len([r for r in rnrgs.values() if r>=rnrgs['real']])/float( diff --git a/bin/RILseq_significant_regions.py b/bin/RILseq_significant_regions.py index a7b4083..600cc95 100755 --- a/bin/RILseq_significant_regions.py +++ b/bin/RILseq_significant_regions.py @@ -17,6 +17,7 @@ import sys import argparse from collections import defaultdict +import os import RILseq @@ -114,7 +115,7 @@ def process_command_line(argv): '--run_RNAup', default=False, action='store_true', help='Run RNAup and compute the interactions predicted strength') parser.add_argument( - '--RNAup_cmd', default='RNAup -Xp -w 25 -b -o', + '--RNAup_cmd', default='RNAup', help='Executable of RNAup with its parameters') parser.add_argument( '--pad_seqs', type=int, default=50, @@ -270,12 +271,25 @@ def main(argv=None): max_IP_div_total = mm_sorted[ int(len(mm_sorted)*settings.norm_percentile)] sys.stderr.write("%f\n"%(max_IP_div_total)) - + else: totRNA_counts = defaultdict(int) max_IP_div_total = 0 - if (settings.shuffles ==0 and settings.run_RNAup): - settings.shuffles=-1 + if settings.shuffles == 0 and settings.run_RNAup: + settings.shuffles = -1 + # check if RNAup is exists + if settings.shuffles != 0: + RNAup_path = settings.RNAup_cmd.split(" ")[0] + if not os.path.exists(RNAup_path): + in_path = False + for p in os.environ["PATH"].split(os.pathsep): + if os.path.exists(os.path.join(p, RNAup_path)): + in_path = True + real_path = os.path.join(p, RNAup_path) + settings.RNAup_cmd = " ".join([real_path] + settings.RNAup_cmd.split(" ")[1:]) + if not in_path: + sys.exit("RNAup is not found. Please specify its location") + # Read the additional data to decorate the results with RILseq.report_interactions( region_interactions, sys.stdout, interacting_regions, settings.seglen, diff --git a/setup.py b/setup.py index cf33a08..80b0b65 100644 --- a/setup.py +++ b/setup.py @@ -3,9 +3,9 @@ def readme(): with open('README.rst') as f: return f.read() - + setup(name='RILseq', - version='0.78', + version='0.79', description='Processing RILSeq experiments results', long_description=readme(), classifiers=[ From 04bb8874c5a39ec7707b3695e8f581d0707e2857 Mon Sep 17 00:00:00 2001 From: Yair Raz Date: Wed, 30 Sep 2020 10:34:50 +0300 Subject: [PATCH 5/8] small change in syntax (os.path.isfile instead of os.path.exists to avoid dirs) --- bin/RILseq_significant_regions.py | 4 ++-- setup.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/RILseq_significant_regions.py b/bin/RILseq_significant_regions.py index 600cc95..1b32427 100755 --- a/bin/RILseq_significant_regions.py +++ b/bin/RILseq_significant_regions.py @@ -280,10 +280,10 @@ def main(argv=None): # check if RNAup is exists if settings.shuffles != 0: RNAup_path = settings.RNAup_cmd.split(" ")[0] - if not os.path.exists(RNAup_path): + if not os.path.isfile(RNAup_path): in_path = False for p in os.environ["PATH"].split(os.pathsep): - if os.path.exists(os.path.join(p, RNAup_path)): + if os.path.isfile(os.path.join(p, RNAup_path)): in_path = True real_path = os.path.join(p, RNAup_path) settings.RNAup_cmd = " ".join([real_path] + settings.RNAup_cmd.split(" ")[1:]) diff --git a/setup.py b/setup.py index 80b0b65..86796f6 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ def readme(): return f.read() setup(name='RILseq', - version='0.79', + version='0.80', description='Processing RILSeq experiments results', long_description=readme(), classifiers=[ From 6117f3572043583730a48c311edbff09c36f6152 Mon Sep 17 00:00:00 2001 From: Amir Bar Date: Wed, 30 Dec 2020 17:56:42 +0200 Subject: [PATCH 6/8] Fixed the concordance function to compute cyclic distance instead of only linear. To do so, the chromosome lengths were passed to write_read_tables and additional functions. --- RILseq/__init__.py | 180 +++++++++++++++++++++++++++------- bin/map_chimeric_fragments.py | 93 ++++++++++++++---- 2 files changed, 217 insertions(+), 56 deletions(-) diff --git a/RILseq/__init__.py b/RILseq/__init__.py index 0084963..59196e1 100644 --- a/RILseq/__init__.py +++ b/RILseq/__init__.py @@ -100,7 +100,6 @@ def run_bwa(bwa_cmd, fname1, fname2, output_dir, output_prefix, mismatches, return bamname - def read_gtf(gtf_file, feature, identifier): """ Read a GTF file and return a list in the length of the genome in which each @@ -151,7 +150,6 @@ def read_gtf(gtf_file, feature, identifier): return pos_feat_list, all_features - def get_paired_pos(read, rev=False): """ Given a read which is the positive of the pairs return a strand and @@ -168,6 +166,7 @@ def get_paired_pos(read, rev=False): tpos = read.template_length + fpos # previously termed tlen return strand, fpos, tpos + def get_single_pos(read, rev=False): """ Given a read which is the positive of the pairs return a strand and @@ -193,7 +192,7 @@ def count_features( Go over the samfile and for each pair of reads find the features that overlap the fragment with at least 'overlap' nucleotides. Add 1 to the count of these features - + Arguments: - `features_lists`: The list of features returned from the read_gtf function - `samfile`: A pysam object @@ -202,7 +201,7 @@ def count_features( - `checkpoint`: Report every 100000 reads processed, set to None or False for silencing - `get_sum`: Return the number of reads as well - + Return: - `fcounts`: A dictionary from gene name to number of reads mapped to the gene. ~~antisense and ~~intergenic count the number of reads @@ -456,6 +455,7 @@ def get_unmapped_reads( ouf.write("@%s\n%s\n+\n%s\n"%(read.qname, outseq, outqual)) return single_mapped + def pass_dust_filter(seq, thr): """ Run dust filter and return True of False. The filter is applied as @@ -477,7 +477,7 @@ def pass_dust_filter(seq, thr): # sys.stderr.write("%g\n"%(cscore/(len(seq)-3)*10)) return cscore/(len(seq)-3) * 10 <= thr - + def run_dust_filter(fname, fout, prinseq_cmd, threshold): """ Run the dust filter implemented in prinseq. Read a fastq file and write to @@ -558,7 +558,14 @@ def replace_with_XA(read, al, chrnames_bam): def test_concordance( - read1, read2, maxdist, chrnames_bam, trans_gff=None, remove_self=False): + read1, + read2, + maxdist, + chrnames_bam, + chrlens_bam, + trans_gff=None, + remove_self=False +): """ Test if the two reads can be concordant and not ligated. If it finds a combination that is concordant, replace the concordant @@ -571,71 +578,145 @@ def test_concordance( - `read2`: read 2 object - `maxdist`: Maximal distance to consider concordance if not on the same transcript and have same orientation (possibly self ligation) - - `chrnames_bam`: A list of chr names as they appear in the bam file. + - `chrnames_bam: A list of chr names as they appear in the bam file. can be generated from Samfile.getrname() function + - `chrlens_bam: A list of chr lengths as they appear in the bam file. + can be generated using Samfile.get_reference_length() + function. - `trans_gff`: A dict IGT->(from, to, strand) - `remove_self`: Remove pairs that have the same orientation but different order i.e. r1--->r2---> instead of r2--->r1---> """ - def is_conc(str1, str2, pos1, pos2, chr1, chr2): + def is_conc(str1, str2, pos1, pos2, chr1, chr2, chr_len): """ """ - if str1 != str2: + # Same strand and chromosome + if chr1 != chr2 or str1 != str2: return False - if chr1 != chr2: - return False - if abs(pos2-pos1)pos2)==str1 or remove_self): - return True + + # If orientation is |--r1-----r2--| + if pos1 < pos2: + regular_dist = pos2 - pos1 + cyclic_dist = (chr_len - pos2) + pos1 + 1 + + # If ignore orientation look only on the distances + if remove_self and min(regular_dist, cyclic_dist) < maxdist: + return True + + # Forward strand + elif regular_dist < maxdist and not str1: + return True + + # Reverse strand + elif cyclic_dist < maxdist and str1: + return True + + # If orientation is |--r2-----r1--| + else: + regular_dist = pos1 - pos2 + cyclic_dist = (chr_len - pos1) + pos2 + 1 + + # If ignore orientation look only on the distances + if remove_self and min(regular_dist, cyclic_dist) < maxdist: + return True + + # Reverse strand + elif regular_dist < maxdist and str1: + return True + + # Forward strand + elif cyclic_dist < maxdist and not str1: + return True + if trans_gff: in_trans = False + for tu_pos in trans_gff.values(): - if (tu_pos[2]=='-')==str1: - if tu_pos[0]<=pos1<=tu_pos[1] and tu_pos[0]<=pos2<=tu_pos[1]: + if (tu_pos[2] == '-') == str1: + if tu_pos[0] <= pos1 <= tu_pos[1] and tu_pos[0] <= pos2 <= tu_pos[1]: in_trans = True break - if in_trans and ((pos1>pos2)==str1 or remove_self): + + if in_trans and ((pos1 > pos2) == str1 or remove_self): return True + return False - + # read1 is the reverse of the real read. If both directions are the same # and distance is short they can be concordant - - if read1.is_reverse == read2.is_reverse and read1.reference_id==read2.reference_id: + # TODO: Why do we test it here? this is exactly why is conc is doing inside + if read1.is_reverse == read2.is_reverse and read1.reference_id == read2.reference_id: if is_conc( - read1.is_reverse, read2.is_reverse, read1.reference_start, read2.reference_start, - read1.reference_id, read2.reference_id): + read1.is_reverse, + read2.is_reverse, + read1.reference_start, + read2.reference_start, + read1.reference_id, + read2.reference_id, + chrlens_bam[read1.reference_id] + ): return True + r1_XA = get_XA_mapping(read1.get_tags()) r2_XA = get_XA_mapping(read2.get_tags()) + if r1_XA: for altp in r1_XA: is_rev1 = (altp[1][0] == '-') pos1 = abs(int(altp[1])) + if is_conc( - is_rev1, read2.is_reverse, pos1, read2.reference_start, altp[0], - chrnames_bam[read2.reference_id]): + is_rev1, + read2.is_reverse, + pos1, + read2.reference_start, + altp[0], + chrnames_bam[read2.reference_id], + chrlens_bam[read2.reference_id] + ): # Replace with alternative replace_with_XA(read1, altp, chrnames_bam) return True + for altp2 in r2_XA: is_rev2 = (altp2[1][0] == '-') pos2 = abs(int(altp2[1])) - if is_conc(is_rev1, is_rev2, pos1, pos2, altp[0], altp2[0]): + + if is_conc( + is_rev1, + is_rev2, + pos1, + pos2, + altp[0], + altp2[0], + chrlens_bam[chrnames_bam.index(altp[0])] + ): # Replace both with alternatives. replace_with_XA(read1, altp, chrnames_bam) replace_with_XA(read2, altp2, chrnames_bam) return True + if r2_XA: for altp2 in r2_XA: is_rev2 = (altp2[1][0] == '-') pos2 = abs(int(altp2[1])) - if is_conc(read1.is_reverse, is_rev2, read1.reference_start, pos2, altp2[0], chrnames_bam[read1.reference_id]): + + if is_conc( + read1.is_reverse, + is_rev2, + read1.reference_start, + pos2, + altp2[0], + chrnames_bam[read1.reference_id], + chrlens_bam[read1.reference_id] + ): # Replace with alternative replace_with_XA(read2, altp2, chrnames_bam) return True - + return False + def read_bam_file(bamfile, chrnames_bam, max_NM=0): """ Given a Samfile object of a single mapped file, return the reads in the file @@ -672,9 +753,18 @@ def read_bam_file(bamfile, chrnames_bam, max_NM=0): def write_reads_table( - outfile, read1_reads, read2_reads, chrnames_bam, maxdist, - remove_self, trans_gff=None, write_single=None, single_mapped=None, - max_NM=1): + outfile, + read1_reads, + read2_reads, + chrnames_bam, + chrlens_bam, + maxdist, + remove_self, + trans_gff=None, + write_single=None, + single_mapped=None, + max_NM=1 +): """ Read the lists of reads and print a list of chimeric fragments after removing concordant reads @@ -684,6 +774,7 @@ def write_reads_table( - `read2_reads`: A dictionary of reads from side 2, keys of 1 and 2 should match - `chrnames_bam`: A list of chromosome names in the bam file + - `chrlens_bam`: A list of chromosome lengths in the bam file - `maxdist`: Maximal distance between concordant reads - `remove_self`: Remove circular RNAs - `trans_gff`: A dictionary with transcripts positions, optional @@ -692,47 +783,64 @@ def write_reads_table( mapped as single - `max_NM`: Maximla number of mismatches. Used to screen printing of singles """ - for rname in read1_reads: if rname not in read2_reads: continue + # If the two reads share at least one gene they are excluded write_to = outfile read_type = "chimera" + if test_concordance( - read1_reads[rname], read2_reads[rname], maxdist, - chrnames_bam, trans_gff=trans_gff, remove_self=remove_self): + read1_reads[rname], + read2_reads[rname], + maxdist, + chrnames_bam, + chrlens_bam, + trans_gff=trans_gff, + remove_self=remove_self + ): + if write_single: if get_NM_number(read1_reads[rname].get_tags()) > max_NM or\ get_NM_number(read2_reads[rname].get_tags()) > max_NM: continue + write_to = write_single read_type = "single" + else: continue + else: # If it was originally mapped as single and now as chimeric # ignore this read if single_mapped and rname in single_mapped: continue + read1_chrn = chrnames_bam[read1_reads[rname].reference_id] read2_chrn = chrnames_bam[read2_reads[rname].reference_id] + if not read2_reads[rname].is_reverse: - end2_pos = read2_reads[rname].reference_start+read2_reads[rname].reference_length-1 + end2_pos = read2_reads[rname].reference_start + read2_reads[rname].reference_length - 1 end2_str = '+' + else: end2_pos = read2_reads[rname].reference_start end2_str = '-' + if read1_reads[rname].is_reverse: - end1_pos = read1_reads[rname].reference_start+read1_reads[rname].reference_length-1 + end1_pos = read1_reads[rname].reference_start + read1_reads[rname].reference_length - 1 end1_str = '-' + else: end1_pos = read1_reads[rname].reference_start end1_str = '+' + write_to.write( - "%s\t%d\t%s\t%s\t%d\t%s\t%s\t%s\n"%( - read1_chrn, end1_pos+1, end1_str, read2_chrn, end2_pos+1, + "%s\t%d\t%s\t%s\t%d\t%s\t%s\t%s\n" % ( + read1_chrn, end1_pos + 1, end1_str, read2_chrn, end2_pos + 1, end2_str, rname, read_type)) diff --git a/bin/map_chimeric_fragments.py b/bin/map_chimeric_fragments.py index 3b44c0e..a449d37 100644 --- a/bin/map_chimeric_fragments.py +++ b/bin/map_chimeric_fragments.py @@ -23,6 +23,7 @@ import RILseq + def process_command_line(argv): """ Return a 2-tuple: (settings object, args list). @@ -130,66 +131,118 @@ def process_command_line(argv): def main(argv=None): sys.stderr.write("RILseq version: {}\n".format(pkg_resources.get_distribution("RILseq").version)) settings = process_command_line(argv) + # Read the transcripts if given try: os.makedirs(settings.dirout) + except OSError as e: if e.errno != errno.EEXIST: raise + if settings.transcripts: trans_dict = RILseq.read_transcripts(settings.transcripts, settings.feature, settings.identifier) + else: trans_dict = None + # Get the ends of the reads from the bam files # sys.stderr.write('%s\n'%str(settings.bamfiles)) if settings.all_reads: try: outall = open(settings.all_reads, 'w') + except IOError: outall = None + elif settings.add_all_reads: outall = sys.stdout + else: outall = None + for bf in RILseq.flat_list(settings.bamfiles): - bfin = pysam.AlignmentFile(bf,'rb') + bfin = pysam.AlignmentFile(bf, 'rb') outhead = bf.rsplit('.', 1)[0] - libname = outhead.rsplit('/',1)[-1] - fsq1name = "%s/%s_ends_1.fastq"%(settings.dirout, libname) - fsq2name = "%s/%s_ends_2.fastq"%(settings.dirout, libname) + libname = outhead.rsplit('/', 1)[-1] + fsq1name = "%s/%s_ends_1.fastq" % (settings.dirout, libname) + fsq2name = "%s/%s_ends_2.fastq" % (settings.dirout, libname) + if settings.skip_mapping: fsq1 = open(os.devnull, 'w') fsq2 = fsq1 + else: fsq1 = open(fsq1name, 'w') fsq2 = open(fsq2name, 'w') + single_mapped = RILseq.get_unmapped_reads( - bfin, fsq1, fsq2, settings.length, settings.maxG, - rev=settings.reverse_complement, all_reads=True, - dust_thr=settings.dust_thr) + bfin, + fsq1, + fsq2, + settings.length, + settings.maxG, + rev=settings.reverse_complement, + all_reads=True, + dust_thr=settings.dust_thr + ) + reads_in = [] + # Map the fastq files to the genome for fqname in (fsq1name, fsq2name): - bamheadname = fqname.rsplit('.',1)[0].rsplit('/',1)[-1] + bamheadname = fqname.rsplit('.', 1)[0].rsplit('/', 1)[-1] + if settings.skip_mapping: - bamname = "%s/%s.bam"%(settings.dirout, bamheadname) + bamname = "%s/%s.bam" % (settings.dirout, bamheadname) + else: bamname = RILseq.run_bwa( - settings.bwa_exec, fqname, None, - os.path.abspath(settings.dirout), bamheadname, settings.max_mismatches, - os.path.abspath(settings.genome_fasta), settings.params_aln, - '', settings.samse_params, - settings.samtools_cmd) - bamin = pysam.AlignmentFile(bamname,'rb') + settings.bwa_exec, + fqname, + None, + os.path.abspath(settings.dirout), + bamheadname, + settings.max_mismatches, + os.path.abspath(settings.genome_fasta), + settings.params_aln, + '', + settings.samse_params, + settings.samtools_cmd + ) + + bamin = pysam.AlignmentFile(bamname, 'rb') reads_in.append(RILseq.read_bam_file( - bamin, bamin.references, settings.allowed_mismatches)) + bamin, + bamin.references, + settings.allowed_mismatches + )) + + # TODO: Preferably these should be a dictionary + # the chrnames_bam is used to get the name of a chromosome from a + # read in the bamfile (read.reference_id). However, it should be + # accessible using read.reference_name. Maybe there is a reason + # I'm not aware of it is written this way + chrnames_bam = bfin.references + chrlens_bam = [bfin.get_reference_length(ref) for ref in chrnames_bam] + RILseq.write_reads_table( - sys.stdout, reads_in[0], reads_in[1], bfin.references, - settings.distance, not settings.keep_circular, - trans_dict, write_single=outall, single_mapped=single_mapped, - max_NM=settings.allowed_mismatches) + sys.stdout, + reads_in[0], + reads_in[1], + chrnames_bam, + chrlens_bam, + settings.distance, + not settings.keep_circular, + trans_dict, + write_single=outall, + single_mapped=single_mapped, + max_NM=settings.allowed_mismatches + ) + return 0 # success + if __name__ == '__main__': status = main() sys.exit(status) From e3d2dfb4131303ea310a9e5d89be7397f2834060 Mon Sep 17 00:00:00 2001 From: Yair Raz Date: Wed, 30 Dec 2020 20:45:44 +0200 Subject: [PATCH 7/8] Updating the RILseq version to 0.81 after Amir's fix. --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 86796f6..4e8b549 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ def readme(): return f.read() setup(name='RILseq', - version='0.80', + version='0.81', description='Processing RILSeq experiments results', long_description=readme(), classifiers=[ From 73ae0ff3d3229331a1b1ff38ea31e5870902f79d Mon Sep 17 00:00:00 2001 From: yairra Date: Sun, 3 Apr 2022 22:52:37 +0300 Subject: [PATCH 8/8] 1. change header of files to python3 2. post_processing.py - update to current file naming 3. plot_regions_interactions.py - fix division issue caused by migration to python 3 4. change version to 0.82 --- bin/RILseq_significant_regions.py | 2 +- bin/count_chimeric_reads_per_gene.py | 2 +- bin/generate_BED_file_of_endpoints.py | 2 +- bin/generate_genes_gff.py | 2 +- bin/generate_transcripts_gff.py | 4 ++-- bin/get_sequences_for_meme.py | 2 +- bin/map_chimeric_fragments.py | 4 +--- bin/map_single_fragments.py | 2 +- bin/plot_circos_plot.py | 2 +- bin/plot_regions_interactions.py | 14 +++++++------- bin/post_processing.py | 25 +++++++++++++------------ setup.py | 2 +- 12 files changed, 31 insertions(+), 32 deletions(-) diff --git a/bin/RILseq_significant_regions.py b/bin/RILseq_significant_regions.py index 1b32427..5ea78e0 100755 --- a/bin/RILseq_significant_regions.py +++ b/bin/RILseq_significant_regions.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """ Read the table of chimeric fragments and find regions that are over-represented diff --git a/bin/count_chimeric_reads_per_gene.py b/bin/count_chimeric_reads_per_gene.py index 462d698..77c86ad 100644 --- a/bin/count_chimeric_reads_per_gene.py +++ b/bin/count_chimeric_reads_per_gene.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """ Count how many reads overlap each gene in a gff file diff --git a/bin/generate_BED_file_of_endpoints.py b/bin/generate_BED_file_of_endpoints.py index 2b07453..671270d 100755 --- a/bin/generate_BED_file_of_endpoints.py +++ b/bin/generate_BED_file_of_endpoints.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """ Given already mapped fusions using the reads file (format: diff --git a/bin/generate_genes_gff.py b/bin/generate_genes_gff.py index 496cbf4..7049063 100644 --- a/bin/generate_genes_gff.py +++ b/bin/generate_genes_gff.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """ Generate a genes gff file using BioCyc data given as flat-files diff --git a/bin/generate_transcripts_gff.py b/bin/generate_transcripts_gff.py index ee7fdd0..9fb6c5a 100644 --- a/bin/generate_transcripts_gff.py +++ b/bin/generate_transcripts_gff.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """ Generate a gff file of transcripts using an BioCyc data flat-files @@ -48,7 +48,7 @@ def main(argv=None): sys.stdout, utr_len=settings.est_utr_lens, ec_dir=settings.bc_dir, chr_dict=chr_dict) except ValueError as e: - print e.msg + print(e.msg) return 1 # application code here, like: # run(settings, args) diff --git a/bin/get_sequences_for_meme.py b/bin/get_sequences_for_meme.py index 6295b5f..ffa1fc9 100644 --- a/bin/get_sequences_for_meme.py +++ b/bin/get_sequences_for_meme.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """ Using the summary file (generated by RILseq_significant_regions.py) to diff --git a/bin/map_chimeric_fragments.py b/bin/map_chimeric_fragments.py index a449d37..15a6dcf 100644 --- a/bin/map_chimeric_fragments.py +++ b/bin/map_chimeric_fragments.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """ After a library is mapped to the genome (using map_single_fragments.py or any @@ -210,7 +210,6 @@ def main(argv=None): settings.samse_params, settings.samtools_cmd ) - bamin = pysam.AlignmentFile(bamname, 'rb') reads_in.append(RILseq.read_bam_file( bamin, @@ -225,7 +224,6 @@ def main(argv=None): # I'm not aware of it is written this way chrnames_bam = bfin.references chrlens_bam = [bfin.get_reference_length(ref) for ref in chrnames_bam] - RILseq.write_reads_table( sys.stdout, reads_in[0], diff --git a/bin/map_single_fragments.py b/bin/map_single_fragments.py index 8a0bd2c..6345b9e 100644 --- a/bin/map_single_fragments.py +++ b/bin/map_single_fragments.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """ This script is used to map fastq files to the genome. The input is a comma diff --git a/bin/plot_circos_plot.py b/bin/plot_circos_plot.py index 90ec70d..40b42d7 100644 --- a/bin/plot_circos_plot.py +++ b/bin/plot_circos_plot.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """ Read the list of chimeric interactions and generate a file that can be read diff --git a/bin/plot_regions_interactions.py b/bin/plot_regions_interactions.py index 4ee160f..9eaab8a 100755 --- a/bin/plot_regions_interactions.py +++ b/bin/plot_regions_interactions.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python3 """ Read files with interactions summary (with or without p-values) and plot @@ -71,10 +71,10 @@ def get_singles_counts(sname, seglen, mincounts): if ints < mincounts: continue r1_reg = ( - line['RNA1 chromosome'],int(line['Start of RNA1 first read'])/seglen*seglen, + line['RNA1 chromosome'],int(int(line['Start of RNA1 first read'])/seglen)*seglen, line['RNA1 strand']) r2_reg = ( - line['RNA2 chromosome'],int(line['Start of RNA2 last read'])/seglen*seglen, + line['RNA2 chromosome'],int(int(line['Start of RNA2 last read'])/seglen)*seglen, line['RNA2 strand']) counts[r1_reg] += ints counts[r2_reg] += ints @@ -96,9 +96,9 @@ def get_regions_counts(fname, seglen, mincounts): if int(line['interactions']) < mincounts: continue t_reg = ( - line['RNA1 chromosome'],int(line['Start of RNA1 first read'])/seglen*seglen, + line['RNA1 chromosome'],int(int(line['Start of RNA1 first read'])/seglen)*seglen, line['RNA1 strand'], - line['RNA2 chromosome'],int(line['Start of RNA2 last read'])/seglen*seglen, + line['RNA2 chromosome'],int(int(line['Start of RNA2 last read'])/seglen)*seglen, line['RNA2 strand']) counts[t_reg] = int(line['interactions']) @@ -137,8 +137,8 @@ def scatit(dname, dname2, grd, i, j, l1, l2, lln, cur_name): # grd.cbar_axes[i*lln+j].colorbar(im) grd[i*lln + j].text(10, 10e4, "r=%.2f"%(spr[0]), size=6, color='m') - grd[i*lln + j].set_xlim([-10, 10e5]) - grd[i*lln + j].set_ylim([-10, 10e5]) + grd[i*lln + j].set_xlim([10e0, 10e5]) + grd[i*lln + j].set_ylim([10e0, 10e5]) grd[i*lln + j].set_yscale('log') grd[i*lln + j].set_xscale('log') grd[i*lln + j].set_xticks([10e0, 10e2, 10e4]) diff --git a/bin/post_processing.py b/bin/post_processing.py index 32675fb..72aa449 100644 --- a/bin/post_processing.py +++ b/bin/post_processing.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 """ This script is used to run post processing analysis on the RILseq results. This will generate table S1 in the RILseq essay. @@ -107,7 +108,7 @@ def generate_table_s1(settings, outfile): total_reads = 0 procfile = libname + '_cutadapt_1.fastq.gz' - processed_reads = sum(1 for line in gzip.open(procfile) if line.startswith('@')) + processed_reads = sum(1 for line in gzip.open(procfile) if line.startswith(b'@')) if os.path.exists(procfile.replace('_1.fastq.gz', '_2.fastq.gz')): processed_reads *= 2 if total_reads > 0: @@ -125,8 +126,8 @@ def generate_table_s1(settings, outfile): count_cmd_strict = count_cmd + ['-t'] print(count_cmd) - mapped_reads = Popen(' '.join(count_cmd), shell=True, stdout=PIPE).stdout.read().strip() - mapped_reads_strict, multiple, antisense, igr = Popen(' '.join(count_cmd_strict), shell=True, stdout=PIPE)\ + mapped_reads = Popen(' '.join(count_cmd), shell=True, stdout=PIPE, encoding='utf8').stdout.read().strip() + mapped_reads_strict, multiple, antisense, igr = Popen(' '.join(count_cmd_strict), shell=True, stdout=PIPE, encoding='utf8')\ .stdout.read().strip().split(',') mapped_fraction = "{:1.2f}".format(int(mapped_reads)/float(processed_reads)) @@ -138,26 +139,26 @@ def generate_table_s1(settings, outfile): if settings.RILseq_work_dir: short_name = libname.split('/')[-1] - fastq1 = settings.RILseq_work_dir+short_name+"_cutadapt_bwa_ends_1.fastq" - fastq2 = settings.RILseq_work_dir+short_name+"_cutadapt_bwa_ends_2.fastq" + fastq1 = settings.RILseq_work_dir+"/remapped-data/"+short_name+"_cutadapt_bwa_ends_1.fastq" + fastq2 = settings.RILseq_work_dir+"/remapped-data/"+short_name+"_cutadapt_bwa_ends_2.fastq" if os.path.exists(fastq1): numfrag = len(list(set([line.strip() for line in open(fastq1) if line.startswith('@')]).intersection( [line.strip() for line in open(fastq2) if line.startswith('@')]))) - frags_file = settings.RILseq_work_dir+short_name+'_cutadapt_bwa.bam_all_fragments_l25.txt' + frags_file = settings.RILseq_work_dir+short_name+'_cutadapt_bwa.bam_mapping_all_fragments_l25.txt' singles = sum(1 for line in open(frags_file) if "single" in line) chimeras = sum(1 for line in open(frags_file) if "chimera" in line) - singles_file = settings.RILseq_work_dir+short_name+'_cutadapt_bwa.bam_all_fragments_l25.txt_only_single.txt' - all_file = settings.RILseq_work_dir+short_name+'_cutadapt_bwa.bam_all_fragments_l25.txt_all_interactions.txt' - sig_file = settings.RILseq_work_dir+short_name+'_cutadapt_bwa.bam_all_fragments_l25.txt_sig_interactions_with-total.txt' + singles_file = settings.RILseq_work_dir+short_name+'_cutadapt_bwa.bam_mapping_all_fragments_l25.txt_singles.txt' + all_file = settings.RILseq_work_dir+short_name+'_cutadapt_bwa.bam_mapping_all_fragments_l25.txt_all_interactions.txt' + sig_file = settings.RILseq_work_dir+short_name+'_cutadapt_bwa.bam_mapping_all_fragments_l25.txt_significant_interactions.txt' temp = open(singles_file) - temp.next() + next(temp) map_si_exc_rrna = sum(int(line.split('\t')[14]) for line in temp) temp = open(all_file) - temp.next() + next(temp) map_ch_exc_rrna = sum(int(line.split('\t')[14]) for line in temp) per_mapped = "{:1.1f}%".format(((singles+chimeras)/float(numfrag))*100) @@ -165,7 +166,7 @@ def generate_table_s1(settings, outfile): stat_interactions = sum(1 for line in open(sig_file)) - 1 temp = open(sig_file) - temp.next() + next(temp) stat_frags = sum(int(line.split('\t')[14]) for line in temp) per_stat = "{:1.1f}%".format((stat_frags/float(chimeras))*100) diff --git a/setup.py b/setup.py index 4e8b549..1ffc81b 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ def readme(): return f.read() setup(name='RILseq', - version='0.81', + version='0.82', description='Processing RILSeq experiments results', long_description=readme(), classifiers=[