From 0a8601a7d8ceaa98e2801040158b2913e269a688 Mon Sep 17 00:00:00 2001
From: Yair Raz <yair.raz@mail.huji.ac.il>
Date: Thu, 28 May 2020 15:46:46 +0300
Subject: [PATCH 1/8] changing requierment modules for python2.7 support.

---
 setup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 569e62e..9b4d6d2 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@ def readme():
         return f.read()
     
 setup(name='RILseq',
-      version='0.74',
+      version='0.75',
       description='Processing RILSeq experiments results',
       long_description=readme(),
       classifiers=[
@@ -31,6 +31,6 @@ def readme():
       license='MIT',
       packages=['RILseq'],
       install_requires=[
-        'scipy', 'numpy', 'pysam>=0.14.1', 'biopython'],
+        'scipy', 'numpy', 'pysam>=0.14.1', 'biopython==1.76'],
       include_package_data=True,
       zip_safe=False)

From 879ada99a986be86132c80d26a1afd23b5af3c36 Mon Sep 17 00:00:00 2001
From: Yair Raz <yair.raz@mail.huji.ac.il>
Date: Tue, 28 Jul 2020 08:13:05 +0300
Subject: [PATCH 2/8] Resolving some off by 1 issues when writing the table and
 when generating the gff file.

---
 RILseq/__init__.py      | 4 ++--
 RILseq/ecocyc_parser.py | 4 ++--
 setup.py                | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/RILseq/__init__.py b/RILseq/__init__.py
index 213177a..a80aeb8 100644
--- a/RILseq/__init__.py
+++ b/RILseq/__init__.py
@@ -719,13 +719,13 @@ def write_reads_table(
         read1_chrn = chrnames_bam[read1_reads[rname].reference_id]
         read2_chrn = chrnames_bam[read2_reads[rname].reference_id]
         if not read2_reads[rname].is_reverse:
-            end2_pos = read2_reads[rname].reference_start+read2_reads[rname].query_alignment_length-1
+            end2_pos = read2_reads[rname].reference_start+read2_reads[rname].reference_length-1
             end2_str = '+'
         else:
             end2_pos = read2_reads[rname].reference_start
             end2_str = '-'
         if read1_reads[rname].is_reverse:
-            end1_pos = read1_reads[rname].reference_start+read1_reads[rname].query_alignment_length-1
+            end1_pos = read1_reads[rname].reference_start+read1_reads[rname].reference_length-1
             end1_str = '-'
         else:
             end1_pos = read1_reads[rname].reference_start
diff --git a/RILseq/ecocyc_parser.py b/RILseq/ecocyc_parser.py
index 6da9d45..b5b2012 100644
--- a/RILseq/ecocyc_parser.py
+++ b/RILseq/ecocyc_parser.py
@@ -134,7 +134,7 @@ def generate_transcripts_file(
             if tu_str == '+':
                 tu_boundaries[tu][0] = tu_promoters[tu]
             else:
-                tu_boundaries[tu][1] = tu_promoters[tu]
+                tu_boundaries[tu][1] = tu_promoters[tu] + 1
         else:
             # Get first gene
             if tu_str == '+':
@@ -145,7 +145,7 @@ def generate_transcripts_file(
                 tu_boundaries[tu][1] = min(last_pos+utr_len, fsa_lens[tu_chrn])
         if tu in tu_terminators:
             if tu_str == '+':
-                tu_boundaries[tu][1] = max(tu_terminators[tu])+1
+                tu_boundaries[tu][1] = max(tu_terminators[tu]) + 1
             else:
                 tu_boundaries[tu][0] = min(tu_terminators[tu])
         else:
diff --git a/setup.py b/setup.py
index 9b4d6d2..1447a1f 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@ def readme():
         return f.read()
     
 setup(name='RILseq',
-      version='0.75',
+      version='0.76',
       description='Processing RILSeq experiments results',
       long_description=readme(),
       classifiers=[

From c9a8e5313243e7a162eb64ddc15a53ad94fdfaea Mon Sep 17 00:00:00 2001
From: Yair Raz <yair.raz@mail.huji.ac.il>
Date: Tue, 28 Jul 2020 08:42:48 +0300
Subject: [PATCH 3/8] Update version number

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 999c744..cf33a08 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@ def readme():
         return f.read()
     
 setup(name='RILseq',
-      version='0.77',
+      version='0.78',
       description='Processing RILSeq experiments results',
       long_description=readme(),
       classifiers=[

From c7cd143db3506578c8e4f17a8f5b6920dce6f0c3 Mon Sep 17 00:00:00 2001
From: Yair Raz <yair.raz@mail.huji.ac.il>
Date: Wed, 30 Sep 2020 10:08:07 +0300
Subject: [PATCH 4/8] 1. modifying RNAup_cmd defaults to be without additional
 parameters (for compatability with newer version of RNAup) 2. checking if
 RNAup is exists 3. expanding the pading of p3_seq from both sides.

---
 RILseq/__init__.py                |  9 ++-------
 bin/RILseq_significant_regions.py | 22 ++++++++++++++++++----
 setup.py                          |  4 ++--
 3 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/RILseq/__init__.py b/RILseq/__init__.py
index a80aeb8..0084963 100644
--- a/RILseq/__init__.py
+++ b/RILseq/__init__.py
@@ -1269,7 +1269,6 @@ def report_interactions(
     targets = read_targets(targets_file)
     singles = read_singles(single_counts)
     desc = read_annotations(refseq_dir)
-#    genes_dict = get_genes_dict(ec_dir, est_utr_lens)
     try:
         pos_maps, _ = ecocyc_parser.get_mapping(ec_dir, est_utr_lens, linear_chromosome_list=linear_chromosome_list)
     except IOError:
@@ -1394,12 +1393,8 @@ def report_interactions(
             p5_seqs = get_seqs(
                 r1_chrn, min1_pos-pad_seqs, max1_pos+pad_seqs, r1_str, fsa_seqs,
                 shuffles=shuffles)
-            if r2_str == '+':
-                p3_seq = get_seqs(
-                    r2_chrn, min2_pos-pad_seqs, max2_pos, r2_str, fsa_seqs)
-            else:
-                p3_seq = get_seqs(
-                    r2_chrn, min2_pos, max2_pos+pad_seqs, r2_str, fsa_seqs)
+            p3_seq = get_seqs(
+                    r2_chrn, min2_pos-pad_seqs, max2_pos+pad_seqs, r2_str, fsa_seqs)
             rnrgs = rnup.scoreall(p3_seq, p5_seqs)
             if shuffles>0:
                 pv = len([r for r in rnrgs.values() if r>=rnrgs['real']])/float(
diff --git a/bin/RILseq_significant_regions.py b/bin/RILseq_significant_regions.py
index a7b4083..600cc95 100755
--- a/bin/RILseq_significant_regions.py
+++ b/bin/RILseq_significant_regions.py
@@ -17,6 +17,7 @@
 import sys
 import argparse
 from collections import defaultdict
+import os
 
 import RILseq
 
@@ -114,7 +115,7 @@ def process_command_line(argv):
         '--run_RNAup', default=False, action='store_true',
         help='Run RNAup and compute the interactions predicted strength')
     parser.add_argument(
-        '--RNAup_cmd', default='RNAup -Xp -w 25 -b -o',
+        '--RNAup_cmd', default='RNAup',
         help='Executable of RNAup with its parameters')
     parser.add_argument(
         '--pad_seqs', type=int, default=50,
@@ -270,12 +271,25 @@ def main(argv=None):
         max_IP_div_total = mm_sorted[
             int(len(mm_sorted)*settings.norm_percentile)]
         sys.stderr.write("%f\n"%(max_IP_div_total))
-            
+
     else:
         totRNA_counts = defaultdict(int)
         max_IP_div_total = 0
-    if (settings.shuffles ==0 and settings.run_RNAup):
-        settings.shuffles=-1
+    if settings.shuffles == 0 and settings.run_RNAup:
+        settings.shuffles = -1
+    # check if RNAup is exists
+    if settings.shuffles != 0:
+        RNAup_path = settings.RNAup_cmd.split(" ")[0]
+        if not os.path.exists(RNAup_path):
+            in_path = False
+            for p in os.environ["PATH"].split(os.pathsep):
+                if os.path.exists(os.path.join(p, RNAup_path)):
+                    in_path = True
+                    real_path = os.path.join(p, RNAup_path)
+                    settings.RNAup_cmd = " ".join([real_path] + settings.RNAup_cmd.split(" ")[1:])
+            if not in_path:
+                sys.exit("RNAup is not found. Please specify its location")
+
     # Read the additional data to decorate the results with
     RILseq.report_interactions(
         region_interactions, sys.stdout, interacting_regions, settings.seglen,
diff --git a/setup.py b/setup.py
index cf33a08..80b0b65 100644
--- a/setup.py
+++ b/setup.py
@@ -3,9 +3,9 @@
 def readme():
     with open('README.rst') as f:
         return f.read()
-    
+
 setup(name='RILseq',
-      version='0.78',
+      version='0.79',
       description='Processing RILSeq experiments results',
       long_description=readme(),
       classifiers=[

From 04bb8874c5a39ec7707b3695e8f581d0707e2857 Mon Sep 17 00:00:00 2001
From: Yair Raz <yair.raz@mail.huji.ac.il>
Date: Wed, 30 Sep 2020 10:34:50 +0300
Subject: [PATCH 5/8] small change in syntax (os.path.isfile instead of
 os.path.exists to avoid dirs)

---
 bin/RILseq_significant_regions.py | 4 ++--
 setup.py                          | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/bin/RILseq_significant_regions.py b/bin/RILseq_significant_regions.py
index 600cc95..1b32427 100755
--- a/bin/RILseq_significant_regions.py
+++ b/bin/RILseq_significant_regions.py
@@ -280,10 +280,10 @@ def main(argv=None):
     # check if RNAup is exists
     if settings.shuffles != 0:
         RNAup_path = settings.RNAup_cmd.split(" ")[0]
-        if not os.path.exists(RNAup_path):
+        if not os.path.isfile(RNAup_path):
             in_path = False
             for p in os.environ["PATH"].split(os.pathsep):
-                if os.path.exists(os.path.join(p, RNAup_path)):
+                if os.path.isfile(os.path.join(p, RNAup_path)):
                     in_path = True
                     real_path = os.path.join(p, RNAup_path)
                     settings.RNAup_cmd = " ".join([real_path] + settings.RNAup_cmd.split(" ")[1:])
diff --git a/setup.py b/setup.py
index 80b0b65..86796f6 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@ def readme():
         return f.read()
 
 setup(name='RILseq',
-      version='0.79',
+      version='0.80',
       description='Processing RILSeq experiments results',
       long_description=readme(),
       classifiers=[

From 6117f3572043583730a48c311edbff09c36f6152 Mon Sep 17 00:00:00 2001
From: Amir Bar <amier.cz@gmail.com>
Date: Wed, 30 Dec 2020 17:56:42 +0200
Subject: [PATCH 6/8] Fixed the concordance function to compute cyclic distance
 instead of only linear. To do so, the chromosome lengths were passed to
 write_read_tables and additional functions.

---
 RILseq/__init__.py            | 180 +++++++++++++++++++++++++++-------
 bin/map_chimeric_fragments.py |  93 ++++++++++++++----
 2 files changed, 217 insertions(+), 56 deletions(-)

diff --git a/RILseq/__init__.py b/RILseq/__init__.py
index 0084963..59196e1 100644
--- a/RILseq/__init__.py
+++ b/RILseq/__init__.py
@@ -100,7 +100,6 @@ def run_bwa(bwa_cmd, fname1, fname2, output_dir, output_prefix, mismatches,
     return bamname
 
 
-
 def read_gtf(gtf_file, feature, identifier):
     """
     Read a GTF file and return a list in the length of the genome in which each
@@ -151,7 +150,6 @@ def read_gtf(gtf_file, feature, identifier):
     return pos_feat_list, all_features
 
 
-
 def get_paired_pos(read, rev=False):
     """
     Given a read which is the positive of the pairs return a strand and
@@ -168,6 +166,7 @@ def get_paired_pos(read, rev=False):
     tpos = read.template_length + fpos  # previously termed tlen
     return strand, fpos, tpos
 
+
 def get_single_pos(read, rev=False):
     """
     Given a read which is the positive of the pairs return a strand and
@@ -193,7 +192,7 @@ def count_features(
     Go over the samfile and for each pair of reads find the features that
     overlap the fragment with at least 'overlap' nucleotides. Add 1 to the count
     of these features
-    
+
     Arguments:
     - `features_lists`: The list of features returned from the read_gtf function
     - `samfile`: A pysam object
@@ -202,7 +201,7 @@ def count_features(
     - `checkpoint`: Report every 100000 reads processed, set to None or False
                     for silencing
     - `get_sum`: Return the number of reads as well
-                    
+
     Return:
     - `fcounts`: A dictionary from gene name to number of reads mapped to the
                  gene. ~~antisense and ~~intergenic count the number of reads
@@ -456,6 +455,7 @@ def get_unmapped_reads(
                 ouf.write("@%s\n%s\n+\n%s\n"%(read.qname, outseq, outqual))
     return single_mapped
 
+
 def pass_dust_filter(seq, thr):
     """
     Run dust filter and return True of False. The filter is applied as
@@ -477,7 +477,7 @@ def pass_dust_filter(seq, thr):
 #    sys.stderr.write("%g\n"%(cscore/(len(seq)-3)*10))
     return cscore/(len(seq)-3) * 10 <= thr
 
-            
+
 def run_dust_filter(fname, fout, prinseq_cmd, threshold):
     """
     Run the dust filter implemented in prinseq. Read a fastq file and write to
@@ -558,7 +558,14 @@ def replace_with_XA(read, al, chrnames_bam):
 
 
 def test_concordance(
-    read1, read2, maxdist, chrnames_bam, trans_gff=None, remove_self=False):
+    read1,
+    read2,
+    maxdist,
+    chrnames_bam,
+    chrlens_bam,
+    trans_gff=None,
+    remove_self=False
+):
     """
     Test if the two reads can be concordant and not ligated.
     If it finds a combination that is concordant, replace the concordant
@@ -571,71 +578,145 @@ def test_concordance(
     - `read2`: read 2 object
     - `maxdist`: Maximal distance to consider concordance if not on the same
                  transcript and have same orientation (possibly self ligation)
-    - `chrnames_bam`: A list of chr names as they appear in the bam file.
+    - `chrnames_bam: A list of chr names as they appear in the bam file.
                       can be generated from Samfile.getrname() function
+    - `chrlens_bam: A list of chr lengths as they appear in the bam file.
+                     can be generated using Samfile.get_reference_length()
+                     function.
     - `trans_gff`: A dict IGT->(from, to, strand)
     - `remove_self`: Remove pairs that have the same orientation but different
                      order i.e. r1--->r2---> instead of r2--->r1--->
     """
-    def is_conc(str1, str2, pos1, pos2, chr1, chr2):
+    def is_conc(str1, str2, pos1, pos2, chr1, chr2, chr_len):
         """
         """
-        if str1 != str2:
+        # Same strand and chromosome
+        if chr1 != chr2 or str1 != str2:
             return False
-        if chr1 != chr2:
-            return False
-        if abs(pos2-pos1)<maxdist and ((pos1>pos2)==str1 or remove_self):
-            return True
+
+        # If orientation is |--r1-----r2--|
+        if pos1 < pos2:
+            regular_dist = pos2 - pos1
+            cyclic_dist = (chr_len - pos2) + pos1 + 1
+
+            # If ignore orientation look only on the distances
+            if remove_self and min(regular_dist, cyclic_dist) < maxdist:
+                return True
+
+            # Forward strand
+            elif regular_dist < maxdist and not str1:
+                return True
+
+            # Reverse strand
+            elif cyclic_dist < maxdist and str1:
+                return True
+
+        # If orientation is |--r2-----r1--|
+        else:
+            regular_dist = pos1 - pos2
+            cyclic_dist = (chr_len - pos1) + pos2 + 1
+
+            # If ignore orientation look only on the distances
+            if remove_self and min(regular_dist, cyclic_dist) < maxdist:
+                return True
+
+            # Reverse strand
+            elif regular_dist < maxdist and str1:
+                return True
+
+            # Forward strand
+            elif cyclic_dist < maxdist and not str1:
+                return True
+
         if trans_gff:
             in_trans = False
+
             for tu_pos in trans_gff.values():
-                if (tu_pos[2]=='-')==str1:
-                    if tu_pos[0]<=pos1<=tu_pos[1] and tu_pos[0]<=pos2<=tu_pos[1]:
+                if (tu_pos[2] == '-') == str1:
+                    if tu_pos[0] <= pos1 <= tu_pos[1] and tu_pos[0] <= pos2 <= tu_pos[1]:
                         in_trans = True
                         break
-            if in_trans and ((pos1>pos2)==str1 or remove_self):
+
+            if in_trans and ((pos1 > pos2) == str1 or remove_self):
                 return True
+
         return False
-                        
+
     # read1 is the reverse of the real read. If both directions are the same
     # and distance is short they can be concordant
-    
-    if read1.is_reverse == read2.is_reverse and read1.reference_id==read2.reference_id:
+    # TODO: Why do we test it here? this is exactly why is conc is doing inside
+    if read1.is_reverse == read2.is_reverse and read1.reference_id == read2.reference_id:
         if is_conc(
-            read1.is_reverse, read2.is_reverse, read1.reference_start, read2.reference_start,
-            read1.reference_id, read2.reference_id):
+            read1.is_reverse,
+            read2.is_reverse,
+            read1.reference_start,
+            read2.reference_start,
+            read1.reference_id,
+            read2.reference_id,
+            chrlens_bam[read1.reference_id]
+        ):
             return True
+
     r1_XA = get_XA_mapping(read1.get_tags())
     r2_XA = get_XA_mapping(read2.get_tags())
+
     if r1_XA:
         for altp in r1_XA:
             is_rev1 = (altp[1][0] == '-')
             pos1 = abs(int(altp[1]))
+
             if is_conc(
-                is_rev1, read2.is_reverse, pos1, read2.reference_start, altp[0],
-                chrnames_bam[read2.reference_id]):
+                is_rev1,
+                read2.is_reverse,
+                pos1,
+                read2.reference_start,
+                altp[0],
+                chrnames_bam[read2.reference_id],
+                chrlens_bam[read2.reference_id]
+            ):
                 # Replace with alternative
                 replace_with_XA(read1, altp, chrnames_bam)
                 return True
+
             for altp2 in r2_XA:
                 is_rev2 = (altp2[1][0] == '-')
                 pos2 = abs(int(altp2[1]))
-                if is_conc(is_rev1, is_rev2, pos1, pos2, altp[0], altp2[0]):
+
+                if is_conc(
+                    is_rev1,
+                    is_rev2,
+                    pos1,
+                    pos2,
+                    altp[0],
+                    altp2[0],
+                    chrlens_bam[chrnames_bam.index(altp[0])]
+                ):
                     # Replace both with alternatives.
                     replace_with_XA(read1, altp, chrnames_bam)
                     replace_with_XA(read2, altp2, chrnames_bam)
                     return True
+
     if r2_XA:
         for altp2 in r2_XA:
             is_rev2 = (altp2[1][0] == '-')
             pos2 = abs(int(altp2[1]))
-            if is_conc(read1.is_reverse, is_rev2, read1.reference_start, pos2, altp2[0], chrnames_bam[read1.reference_id]):
+
+            if is_conc(
+                read1.is_reverse,
+                is_rev2,
+                read1.reference_start,
+                pos2,
+                altp2[0],
+                chrnames_bam[read1.reference_id],
+                chrlens_bam[read1.reference_id]
+            ):
                 # Replace with alternative
                 replace_with_XA(read2, altp2, chrnames_bam)
                 return True
-        
+
     return False
 
+
 def read_bam_file(bamfile, chrnames_bam, max_NM=0):
     """
     Given a Samfile object of a single mapped file, return the reads in the file
@@ -672,9 +753,18 @@ def read_bam_file(bamfile, chrnames_bam, max_NM=0):
 
 
 def write_reads_table(
-    outfile, read1_reads, read2_reads, chrnames_bam, maxdist,
-    remove_self, trans_gff=None, write_single=None, single_mapped=None,
-    max_NM=1):
+    outfile,
+    read1_reads,
+    read2_reads,
+    chrnames_bam,
+    chrlens_bam,
+    maxdist,
+    remove_self,
+    trans_gff=None,
+    write_single=None,
+    single_mapped=None,
+    max_NM=1
+):
     """
     Read the lists of reads and print a list of chimeric fragments after
     removing concordant reads
@@ -684,6 +774,7 @@ def write_reads_table(
     - `read2_reads`: A dictionary of reads from side 2, keys of 1 and 2 should
                      match
     - `chrnames_bam`: A list of chromosome names in the bam file
+    - `chrlens_bam`: A list of chromosome lengths in the bam file
     - `maxdist`: Maximal distance between concordant reads
     - `remove_self`: Remove circular RNAs
     - `trans_gff`: A dictionary with transcripts positions, optional
@@ -692,47 +783,64 @@ def write_reads_table(
                        mapped as single
     - `max_NM`: Maximla number of mismatches. Used to screen printing of singles
     """
-    
 
     for rname in read1_reads:
         if rname not in read2_reads:
             continue
+
         # If the two reads share at least one gene they are excluded
         write_to = outfile
         read_type = "chimera"
+
         if test_concordance(
-            read1_reads[rname], read2_reads[rname], maxdist,
-            chrnames_bam, trans_gff=trans_gff, remove_self=remove_self):
+            read1_reads[rname],
+            read2_reads[rname],
+            maxdist,
+            chrnames_bam,
+            chrlens_bam,
+            trans_gff=trans_gff,
+            remove_self=remove_self
+        ):
+
             if write_single:
                 if get_NM_number(read1_reads[rname].get_tags()) > max_NM or\
                         get_NM_number(read2_reads[rname].get_tags()) > max_NM:
                     continue
+
                 write_to = write_single
                 read_type = "single"
+
             else:
                 continue
+
         else:
             # If it was originally mapped as single and now as chimeric
             # ignore this read
             if single_mapped and rname in single_mapped:
                 continue
+
         read1_chrn = chrnames_bam[read1_reads[rname].reference_id]
         read2_chrn = chrnames_bam[read2_reads[rname].reference_id]
+
         if not read2_reads[rname].is_reverse:
-            end2_pos = read2_reads[rname].reference_start+read2_reads[rname].reference_length-1
+            end2_pos = read2_reads[rname].reference_start + read2_reads[rname].reference_length - 1
             end2_str = '+'
+
         else:
             end2_pos = read2_reads[rname].reference_start
             end2_str = '-'
+
         if read1_reads[rname].is_reverse:
-            end1_pos = read1_reads[rname].reference_start+read1_reads[rname].reference_length-1
+            end1_pos = read1_reads[rname].reference_start + read1_reads[rname].reference_length - 1
             end1_str = '-'
+
         else:
             end1_pos = read1_reads[rname].reference_start
             end1_str = '+'
+
         write_to.write(
-            "%s\t%d\t%s\t%s\t%d\t%s\t%s\t%s\n"%(
-                read1_chrn, end1_pos+1, end1_str, read2_chrn, end2_pos+1,
+            "%s\t%d\t%s\t%s\t%d\t%s\t%s\t%s\n" % (
+                read1_chrn, end1_pos + 1, end1_str, read2_chrn, end2_pos + 1,
                 end2_str, rname, read_type))
 
 
diff --git a/bin/map_chimeric_fragments.py b/bin/map_chimeric_fragments.py
index 3b44c0e..a449d37 100644
--- a/bin/map_chimeric_fragments.py
+++ b/bin/map_chimeric_fragments.py
@@ -23,6 +23,7 @@
 
 import RILseq
 
+
 def process_command_line(argv):
     """
     Return a 2-tuple: (settings object, args list).
@@ -130,66 +131,118 @@ def process_command_line(argv):
 def main(argv=None):
     sys.stderr.write("RILseq version: {}\n".format(pkg_resources.get_distribution("RILseq").version))
     settings = process_command_line(argv)
+
     # Read the transcripts if given
     try:
         os.makedirs(settings.dirout)
+
     except OSError as e:
         if e.errno != errno.EEXIST:
             raise
+
     if settings.transcripts:
         trans_dict = RILseq.read_transcripts(settings.transcripts, settings.feature, settings.identifier)
+
     else:
         trans_dict = None
+
     # Get the ends of the reads from the bam files
 #    sys.stderr.write('%s\n'%str(settings.bamfiles))
     if settings.all_reads:
         try:
             outall = open(settings.all_reads, 'w')
+
         except IOError:
             outall = None
+
     elif settings.add_all_reads:
         outall = sys.stdout
+
     else:
         outall = None
+
     for bf in RILseq.flat_list(settings.bamfiles):
-        bfin = pysam.AlignmentFile(bf,'rb')
+        bfin = pysam.AlignmentFile(bf, 'rb')
         outhead = bf.rsplit('.', 1)[0]
-        libname = outhead.rsplit('/',1)[-1]
-        fsq1name = "%s/%s_ends_1.fastq"%(settings.dirout, libname)
-        fsq2name = "%s/%s_ends_2.fastq"%(settings.dirout, libname)
+        libname = outhead.rsplit('/', 1)[-1]
+        fsq1name = "%s/%s_ends_1.fastq" % (settings.dirout, libname)
+        fsq2name = "%s/%s_ends_2.fastq" % (settings.dirout, libname)
+
         if settings.skip_mapping:
             fsq1 = open(os.devnull, 'w')
             fsq2 = fsq1
+
         else:
             fsq1 = open(fsq1name, 'w')
             fsq2 = open(fsq2name, 'w')
+
         single_mapped = RILseq.get_unmapped_reads(
-            bfin, fsq1, fsq2, settings.length, settings.maxG,
-            rev=settings.reverse_complement, all_reads=True,
-            dust_thr=settings.dust_thr)
+            bfin,
+            fsq1,
+            fsq2,
+            settings.length,
+            settings.maxG,
+            rev=settings.reverse_complement,
+            all_reads=True,
+            dust_thr=settings.dust_thr
+        )
+
         reads_in = []
+
         # Map the fastq files to the genome
         for fqname in (fsq1name, fsq2name):
-            bamheadname = fqname.rsplit('.',1)[0].rsplit('/',1)[-1]
+            bamheadname = fqname.rsplit('.', 1)[0].rsplit('/', 1)[-1]
+
             if settings.skip_mapping:
-                bamname = "%s/%s.bam"%(settings.dirout, bamheadname)
+                bamname = "%s/%s.bam" % (settings.dirout, bamheadname)
+
             else:
                 bamname = RILseq.run_bwa(
-                    settings.bwa_exec, fqname, None,
-                    os.path.abspath(settings.dirout), bamheadname, settings.max_mismatches,
-                    os.path.abspath(settings.genome_fasta), settings.params_aln,
-                    '', settings.samse_params,
-                    settings.samtools_cmd)
-            bamin = pysam.AlignmentFile(bamname,'rb')
+                    settings.bwa_exec,
+                    fqname,
+                    None,
+                    os.path.abspath(settings.dirout),
+                    bamheadname,
+                    settings.max_mismatches,
+                    os.path.abspath(settings.genome_fasta),
+                    settings.params_aln,
+                    '',
+                    settings.samse_params,
+                    settings.samtools_cmd
+                )
+
+            bamin = pysam.AlignmentFile(bamname, 'rb')
             reads_in.append(RILseq.read_bam_file(
-                    bamin, bamin.references, settings.allowed_mismatches))
+                bamin,
+                bamin.references,
+                settings.allowed_mismatches
+            ))
+
+        # TODO: Preferably these should be a dictionary
+        # the chrnames_bam is used to get the name of a chromosome from a
+        # read in the bamfile (read.reference_id). However, it should be
+        # accessible using read.reference_name. Maybe there is a reason
+        # I'm not aware of it is written this way
+        chrnames_bam = bfin.references
+        chrlens_bam = [bfin.get_reference_length(ref) for ref in chrnames_bam]
+
         RILseq.write_reads_table(
-            sys.stdout, reads_in[0], reads_in[1], bfin.references,
-            settings.distance, not settings.keep_circular,
-            trans_dict, write_single=outall, single_mapped=single_mapped,
-            max_NM=settings.allowed_mismatches)
+            sys.stdout,
+            reads_in[0],
+            reads_in[1],
+            chrnames_bam,
+            chrlens_bam,
+            settings.distance,
+            not settings.keep_circular,
+            trans_dict,
+            write_single=outall,
+            single_mapped=single_mapped,
+            max_NM=settings.allowed_mismatches
+        )
+
     return 0        # success
 
+
 if __name__ == '__main__':
     status = main()
     sys.exit(status)

From e3d2dfb4131303ea310a9e5d89be7397f2834060 Mon Sep 17 00:00:00 2001
From: Yair Raz <yair.raz@mail.huji.ac.il>
Date: Wed, 30 Dec 2020 20:45:44 +0200
Subject: [PATCH 7/8] Updating the RILseq version to 0.81 after Amir's fix.

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 86796f6..4e8b549 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@ def readme():
         return f.read()
 
 setup(name='RILseq',
-      version='0.80',
+      version='0.81',
       description='Processing RILSeq experiments results',
       long_description=readme(),
       classifiers=[

From 73ae0ff3d3229331a1b1ff38ea31e5870902f79d Mon Sep 17 00:00:00 2001
From: yairra <yair.raz@mail.huji.ac.il>
Date: Sun, 3 Apr 2022 22:52:37 +0300
Subject: [PATCH 8/8] 1. change header of files to python3 2.
 post_processing.py - update to current file naming 3.
 plot_regions_interactions.py - fix division issue caused by migration to
 python 3 4. change version to 0.82

---
 bin/RILseq_significant_regions.py     |  2 +-
 bin/count_chimeric_reads_per_gene.py  |  2 +-
 bin/generate_BED_file_of_endpoints.py |  2 +-
 bin/generate_genes_gff.py             |  2 +-
 bin/generate_transcripts_gff.py       |  4 ++--
 bin/get_sequences_for_meme.py         |  2 +-
 bin/map_chimeric_fragments.py         |  4 +---
 bin/map_single_fragments.py           |  2 +-
 bin/plot_circos_plot.py               |  2 +-
 bin/plot_regions_interactions.py      | 14 +++++++-------
 bin/post_processing.py                | 25 +++++++++++++------------
 setup.py                              |  2 +-
 12 files changed, 31 insertions(+), 32 deletions(-)

diff --git a/bin/RILseq_significant_regions.py b/bin/RILseq_significant_regions.py
index 1b32427..5ea78e0 100755
--- a/bin/RILseq_significant_regions.py
+++ b/bin/RILseq_significant_regions.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 """
 Read the table of chimeric fragments and find regions that are over-represented
diff --git a/bin/count_chimeric_reads_per_gene.py b/bin/count_chimeric_reads_per_gene.py
index 462d698..77c86ad 100644
--- a/bin/count_chimeric_reads_per_gene.py
+++ b/bin/count_chimeric_reads_per_gene.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 """
 Count how many reads overlap each gene in a gff file
diff --git a/bin/generate_BED_file_of_endpoints.py b/bin/generate_BED_file_of_endpoints.py
index 2b07453..671270d 100755
--- a/bin/generate_BED_file_of_endpoints.py
+++ b/bin/generate_BED_file_of_endpoints.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 """
 Given already mapped fusions using the reads file (format:
diff --git a/bin/generate_genes_gff.py b/bin/generate_genes_gff.py
index 496cbf4..7049063 100644
--- a/bin/generate_genes_gff.py
+++ b/bin/generate_genes_gff.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 """
 Generate a genes gff file using BioCyc data given as flat-files
diff --git a/bin/generate_transcripts_gff.py b/bin/generate_transcripts_gff.py
index ee7fdd0..9fb6c5a 100644
--- a/bin/generate_transcripts_gff.py
+++ b/bin/generate_transcripts_gff.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 """
 Generate a gff file of transcripts using an BioCyc data flat-files
@@ -48,7 +48,7 @@ def main(argv=None):
             sys.stdout, utr_len=settings.est_utr_lens,
             ec_dir=settings.bc_dir, chr_dict=chr_dict)
     except ValueError as e:
-        print e.msg
+        print(e.msg)
         return 1
     # application code here, like:
     # run(settings, args)
diff --git a/bin/get_sequences_for_meme.py b/bin/get_sequences_for_meme.py
index 6295b5f..ffa1fc9 100644
--- a/bin/get_sequences_for_meme.py
+++ b/bin/get_sequences_for_meme.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 """
 Using the summary file (generated by RILseq_significant_regions.py) to
diff --git a/bin/map_chimeric_fragments.py b/bin/map_chimeric_fragments.py
index a449d37..15a6dcf 100644
--- a/bin/map_chimeric_fragments.py
+++ b/bin/map_chimeric_fragments.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 """
 After a library is mapped to the genome (using map_single_fragments.py or any
@@ -210,7 +210,6 @@ def main(argv=None):
                     settings.samse_params,
                     settings.samtools_cmd
                 )
-
             bamin = pysam.AlignmentFile(bamname, 'rb')
             reads_in.append(RILseq.read_bam_file(
                 bamin,
@@ -225,7 +224,6 @@ def main(argv=None):
         # I'm not aware of it is written this way
         chrnames_bam = bfin.references
         chrlens_bam = [bfin.get_reference_length(ref) for ref in chrnames_bam]
-
         RILseq.write_reads_table(
             sys.stdout,
             reads_in[0],
diff --git a/bin/map_single_fragments.py b/bin/map_single_fragments.py
index 8a0bd2c..6345b9e 100644
--- a/bin/map_single_fragments.py
+++ b/bin/map_single_fragments.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 """
 This script is used to map fastq files to the genome. The input is a comma
diff --git a/bin/plot_circos_plot.py b/bin/plot_circos_plot.py
index 90ec70d..40b42d7 100644
--- a/bin/plot_circos_plot.py
+++ b/bin/plot_circos_plot.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 """
 Read the list of chimeric interactions and generate a file that can be read
diff --git a/bin/plot_regions_interactions.py b/bin/plot_regions_interactions.py
index 4ee160f..9eaab8a 100755
--- a/bin/plot_regions_interactions.py
+++ b/bin/plot_regions_interactions.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python3
 
 """
 Read files with interactions summary (with or without p-values) and plot
@@ -71,10 +71,10 @@ def get_singles_counts(sname, seglen, mincounts):
             if ints < mincounts:
                 continue
             r1_reg = (
-                line['RNA1 chromosome'],int(line['Start of RNA1 first read'])/seglen*seglen,
+                line['RNA1 chromosome'],int(int(line['Start of RNA1 first read'])/seglen)*seglen,
                 line['RNA1 strand'])
             r2_reg = (
-                line['RNA2 chromosome'],int(line['Start of RNA2 last read'])/seglen*seglen,
+                line['RNA2 chromosome'],int(int(line['Start of RNA2 last read'])/seglen)*seglen,
                 line['RNA2 strand'])
             counts[r1_reg] += ints
             counts[r2_reg] += ints
@@ -96,9 +96,9 @@ def get_regions_counts(fname, seglen, mincounts):
             if int(line['interactions']) < mincounts:
                 continue
             t_reg = (
-                line['RNA1 chromosome'],int(line['Start of RNA1 first read'])/seglen*seglen,
+                line['RNA1 chromosome'],int(int(line['Start of RNA1 first read'])/seglen)*seglen,
                 line['RNA1 strand'], 
-                line['RNA2 chromosome'],int(line['Start of RNA2 last read'])/seglen*seglen,
+                line['RNA2 chromosome'],int(int(line['Start of RNA2 last read'])/seglen)*seglen,
                 line['RNA2 strand'])
 
             counts[t_reg] = int(line['interactions'])
@@ -137,8 +137,8 @@ def scatit(dname, dname2, grd, i, j, l1, l2, lln, cur_name):
 #        grd.cbar_axes[i*lln+j].colorbar(im)
 
         grd[i*lln + j].text(10, 10e4, "r=%.2f"%(spr[0]), size=6, color='m')
-        grd[i*lln + j].set_xlim([-10, 10e5])
-        grd[i*lln + j].set_ylim([-10, 10e5])
+        grd[i*lln + j].set_xlim([10e0, 10e5])
+        grd[i*lln + j].set_ylim([10e0, 10e5])
         grd[i*lln + j].set_yscale('log')
         grd[i*lln + j].set_xscale('log')
         grd[i*lln + j].set_xticks([10e0, 10e2, 10e4])
diff --git a/bin/post_processing.py b/bin/post_processing.py
index 32675fb..72aa449 100644
--- a/bin/post_processing.py
+++ b/bin/post_processing.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 """
 This script is used to run post processing analysis on the RILseq results.
 This will generate table S1 in the RILseq essay.
@@ -107,7 +108,7 @@ def generate_table_s1(settings, outfile):
             total_reads = 0
 
         procfile = libname + '_cutadapt_1.fastq.gz'
-        processed_reads = sum(1 for line in gzip.open(procfile) if line.startswith('@'))
+        processed_reads = sum(1 for line in gzip.open(procfile) if line.startswith(b'@'))
         if os.path.exists(procfile.replace('_1.fastq.gz', '_2.fastq.gz')):
             processed_reads *= 2
         if total_reads > 0:
@@ -125,8 +126,8 @@ def generate_table_s1(settings, outfile):
         count_cmd_strict = count_cmd + ['-t']
 
         print(count_cmd)
-        mapped_reads = Popen(' '.join(count_cmd), shell=True, stdout=PIPE).stdout.read().strip()
-        mapped_reads_strict, multiple, antisense, igr = Popen(' '.join(count_cmd_strict), shell=True, stdout=PIPE)\
+        mapped_reads = Popen(' '.join(count_cmd), shell=True, stdout=PIPE, encoding='utf8').stdout.read().strip()
+        mapped_reads_strict, multiple, antisense, igr = Popen(' '.join(count_cmd_strict), shell=True, stdout=PIPE, encoding='utf8')\
             .stdout.read().strip().split(',')
 
         mapped_fraction = "{:1.2f}".format(int(mapped_reads)/float(processed_reads))
@@ -138,26 +139,26 @@ def generate_table_s1(settings, outfile):
 
         if settings.RILseq_work_dir:
             short_name = libname.split('/')[-1]
-            fastq1 = settings.RILseq_work_dir+short_name+"_cutadapt_bwa_ends_1.fastq"
-            fastq2 = settings.RILseq_work_dir+short_name+"_cutadapt_bwa_ends_2.fastq"
+            fastq1 = settings.RILseq_work_dir+"/remapped-data/"+short_name+"_cutadapt_bwa_ends_1.fastq"
+            fastq2 = settings.RILseq_work_dir+"/remapped-data/"+short_name+"_cutadapt_bwa_ends_2.fastq"
             if os.path.exists(fastq1):
                 numfrag = len(list(set([line.strip() for line in open(fastq1) if line.startswith('@')]).intersection(
                         [line.strip() for line in open(fastq2) if line.startswith('@')])))
 
-                frags_file = settings.RILseq_work_dir+short_name+'_cutadapt_bwa.bam_all_fragments_l25.txt'
+                frags_file = settings.RILseq_work_dir+short_name+'_cutadapt_bwa.bam_mapping_all_fragments_l25.txt'
                 singles = sum(1 for line in open(frags_file) if "single" in line)
                 chimeras = sum(1 for line in open(frags_file) if "chimera" in line)
 
-                singles_file = settings.RILseq_work_dir+short_name+'_cutadapt_bwa.bam_all_fragments_l25.txt_only_single.txt'
-                all_file = settings.RILseq_work_dir+short_name+'_cutadapt_bwa.bam_all_fragments_l25.txt_all_interactions.txt'
-                sig_file = settings.RILseq_work_dir+short_name+'_cutadapt_bwa.bam_all_fragments_l25.txt_sig_interactions_with-total.txt'
+                singles_file = settings.RILseq_work_dir+short_name+'_cutadapt_bwa.bam_mapping_all_fragments_l25.txt_singles.txt'
+                all_file = settings.RILseq_work_dir+short_name+'_cutadapt_bwa.bam_mapping_all_fragments_l25.txt_all_interactions.txt'
+                sig_file = settings.RILseq_work_dir+short_name+'_cutadapt_bwa.bam_mapping_all_fragments_l25.txt_significant_interactions.txt'
 
                 temp = open(singles_file)
-                temp.next()
+                next(temp)
                 map_si_exc_rrna = sum(int(line.split('\t')[14]) for line in temp)
 
                 temp = open(all_file)
-                temp.next()
+                next(temp)
                 map_ch_exc_rrna = sum(int(line.split('\t')[14]) for line in temp)
 
                 per_mapped = "{:1.1f}%".format(((singles+chimeras)/float(numfrag))*100)
@@ -165,7 +166,7 @@ def generate_table_s1(settings, outfile):
 
                 stat_interactions = sum(1 for line in open(sig_file)) - 1
                 temp = open(sig_file)
-                temp.next()
+                next(temp)
                 stat_frags = sum(int(line.split('\t')[14]) for line in temp)
                 per_stat = "{:1.1f}%".format((stat_frags/float(chimeras))*100)
 
diff --git a/setup.py b/setup.py
index 4e8b549..1ffc81b 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@ def readme():
         return f.read()
 
 setup(name='RILseq',
-      version='0.81',
+      version='0.82',
       description='Processing RILSeq experiments results',
       long_description=readme(),
       classifiers=[