Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -468,4 +468,7 @@ Version 1.7.2

Version 1.7.3
* Fixed a small bug when sorting the output of taggd
* Improved the st_qa.py script
* Improved the st_qa.py script

Version 1.7.5
* Ported to Python 3
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@ The ST pipeline will also output a log file with useful information.
**Installation**

We recommend you install a virtual environment like Pyenv or Anaconda before you install the pipeline.
The ST Pipeline works with python 2.7.

The ST Pipeline works with Python 3.6 or higher.

You can install the ST Pipeline from PyPI:

Expand Down Expand Up @@ -84,7 +85,7 @@ To see the different options type

An example run would be

st_pipeline_run.py --ids ids_file.txt --ref-map path_to_index --log-file log_file.txt --output-folder /home/me/results --ref-annotation annotation_file.gtf file1.fastq file2.fastq
st_pipeline_run.py --expName test --ids ids_file.txt --ref-map path_to_index --log-file log_file.txt --output-folder /home/me/results --ref-annotation annotation_file.gtf file1.fastq file2.fastq

**Ensembl ids**

Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ scipy
scikit-learn
sqlitedict
regex
taggd>=0.3.3
taggd>=0.3.6
HTSeq>=0.7.1
pysam>=0.7.4
setuptools
Expand Down
2 changes: 1 addition & 1 deletion scripts/filter_gene_type_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def main(counts_matrix, gene_types_keep, outfile, annotation, ensembl_ids):
if len(genes_drop) > 0:
counts_table.drop(genes_drop, axis=1, inplace=True)
else:
print "Not a single gene could be discarded..."
print("Not a single gene could be discarded...")
# Write filtered table
counts_table.to_csv(outfile, sep='\t')

Expand Down
17 changes: 9 additions & 8 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import sys
from setuptools import setup, find_packages
from stpipeline.version import version_number
from distutils.extension import Extension
from distutils.core import setup, Extension

here = os.path.abspath(os.path.dirname(__file__))

Expand All @@ -33,14 +33,14 @@
raise SystemExit("Could not find requirements.txt file")

major, minor1, minor2, s, tmp = sys.version_info
if major != 2 or minor1 < 7:
raise SystemExit("ST Pipeline requires Python 2.7.x")
if major != 3 or minor1 < 6:
raise SystemExit("ST Pipeline requires Python 3.6 or bigger")

# setuptools DWIM monkey-patch madness
# http://mail.python.org/pipermail/distutils-sig/2007-September/thread.html#8204
if 'setuptools.extension' in sys.modules:
m = sys.modules['setuptools.extension']
m.Extension.__dict__ = m._Extension.__dict__
#if 'setuptools.extension' in sys.modules:
# m = sys.modules['setuptools.extension']
# m.Extension.__dict__ = m._Extension.__dict__

setup(
name = 'stpipeline',
Expand Down Expand Up @@ -68,9 +68,10 @@
'Development Status :: 5 - Production/Stable',
'Intended Audience :: Science/Research',
'Topic :: Software Development',
'Topic :: Scientific/Engineering :: Bio-Informatics',
'Topic :: Scientific/Engineering :: Bionformatics :: Spatial Transcriptomics',
'License :: OSI Approved :: MIT License',
'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Operating System :: Unix',
'Operating System :: MacOS',
'Environment :: Console',
Expand Down
19 changes: 12 additions & 7 deletions stpipeline/common/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ def countUMIHierarchical(molecular_barcodes,
# Distance computation function
def d(coord):
i,j = coord
return hamming_distance(molecular_barcodes[i], molecular_barcodes[j])
return hamming_distance(molecular_barcodes[i].encode("UTF-8"),
molecular_barcodes[j].encode("UTF-8"))
# Create hierarchical clustering and obtain flat clusters at the distance given
indices = np.triu_indices(len(molecular_barcodes), 1)
distance_matrix = np.apply_along_axis(d, 0, indices)
Expand All @@ -45,7 +46,7 @@ def d(coord):
items = defaultdict(list)
for i, item in enumerate(flat_clusters):
items[item].append(i)
return [molecular_barcodes[random.choice(members)] for members in items.itervalues()]
return [molecular_barcodes[random.choice(members)] for members in list(items.values())]

def countUMINaive(molecular_barcodes, allowed_mismatches):
"""
Expand Down Expand Up @@ -73,13 +74,14 @@ def countUMINaive(molecular_barcodes, allowed_mismatches):
# compare distant of previous molecular barcodes and new one
# if distance is between threshold we add it to the cluster
# otherwise we create a new cluster
if hamming_distance(clusters_dict[nclusters][-1], molecular_barcode) <= allowed_mismatches:
if hamming_distance(clusters_dict[nclusters][-1].encode("UTF-8"),
molecular_barcode.encode("UTF-8")) <= allowed_mismatches:
clusters_dict[nclusters].append(molecular_barcode)
else:
nclusters += 1
clusters_dict[nclusters] = [molecular_barcode]
# Return the non clustered UMIs
return [random.choice(members) for members in clusters_dict.itervalues()]
return [random.choice(members) for members in list(clusters_dict.values())]

def breadth_first_search(node, adj_list):
""" This function has been obtained from
Expand Down Expand Up @@ -118,7 +120,8 @@ def dedup_adj(molecular_barcodes, allowed_mismatches):
c = Counter(molecular_barcodes)

def get_adj_list_adjacency(umis):
return {umi: [umi2 for umi2 in umis if hamming_distance(umi, umi2) \
return {umi: [umi2 for umi2 in umis if hamming_distance(umi.encode("UTF-8"),
umi2.encode("UTF-8")) \
<= allowed_mismatches] for umi in umis}

def get_connected_components_adjacency(graph, Counter):
Expand Down Expand Up @@ -163,7 +166,8 @@ def dedup_dir_adj(molecular_barcodes, allowed_mismatches):
c = Counter(molecular_barcodes)

def get_adj_list_directional_adjacency(umis, counts):
return {umi: [umi2 for umi2 in umis if hamming_distance(umi, umi2) <= allowed_mismatches and
return {umi: [umi2 for umi2 in umis if hamming_distance(umi.encode("UTF-8"),
umi2.encode("UTF-8")) <= allowed_mismatches and
counts[umi] >= (counts[umi2]*2)-1] for umi in umis}

def get_connected_components_adjacency(graph, Counter):
Expand Down Expand Up @@ -196,7 +200,8 @@ def affinity_umi_removal(molecular_barcodes, _):
if len(molecular_barcodes) <= 2:
return countUMINaive(molecular_barcodes, allowed_mismatches)
words = np.asarray(molecular_barcodes)
lev_similarity = -1 * np.array([[hamming_distance(w1,w2) for w1 in words] for w2 in words])
lev_similarity = -1 * np.array([[hamming_distance(w1.encode("UTF-8"),
w2.encode("UTF-8")) for w1 in words] for w2 in words])
affprop = AffinityPropagation(affinity="precomputed", damping=0.5)
affprop.fit(lev_similarity)
unique_clusters = list()
Expand Down
13 changes: 7 additions & 6 deletions stpipeline/common/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,22 +29,22 @@ def computeUniqueUMIs(transcripts, umi_counting_offset, umi_allowed_mismatches,
# size variability and then group the rest of transcripts normally by (strand, start, position).
unique_transcripts = list()
num_transcripts = len(transcripts)
for i in xrange(num_transcripts - 1):
for i in range(num_transcripts - 1):
current = sorted_transcripts[i]
nextone = sorted_transcripts[i + 1]
grouped_transcripts[current[6]].append(current)
if abs(current[1] - nextone[1]) > umi_counting_offset or current[5] != nextone[5]:
# A new group has been reached (strand, start-pos, offset)
# Compute unique UMIs by hamming distance
unique_umis = group_umi_func(grouped_transcripts.keys(), umi_allowed_mismatches)
unique_umis = group_umi_func(list(grouped_transcripts.keys()), umi_allowed_mismatches)
# Choose 1 random transcript for the clustered transcripts (by UMI)
unique_transcripts += [random.choice(grouped_transcripts[u_umi]) for u_umi in unique_umis]
# Reset the container
grouped_transcripts = defaultdict(list)
# We process the last one and more transcripts if they were not processed
lastone = sorted_transcripts[num_transcripts - 1]
grouped_transcripts[lastone[6]].append(lastone)
unique_umis = group_umi_func(grouped_transcripts.keys(), umi_allowed_mismatches)
unique_umis = group_umi_func(list(grouped_transcripts.keys()), umi_allowed_mismatches)
unique_transcripts += [random.choice(grouped_transcripts[u_umi]) for u_umi in unique_umis]
return unique_transcripts

Expand Down Expand Up @@ -124,18 +124,19 @@ def createDataset(input_file,
list_row_values = list()
list_indexes = list()

# Parse unique events to generate the unique counts and the BED file
# Parse unique events to generate the unique counts and the BED file
unique_events = parse_unique_events(input_file, gff_filename)
with open(os.path.join(output_folder, filenameReadsBED), "w") as reads_handler:
# this is the generator returning a dictionary with spots for each gene
for gene, spots in unique_events:
for gene, spots in unique_events:
transcript_counts_by_spot = {}
for spot_coordinates, reads in spots.iteritems():
for spot_coordinates, reads in list(spots.items()):
(x,y) = spot_coordinates
# Re-compute the read count accounting for duplicates using the UMIs
# Transcripts is the list of transcripts (chrom, start, end, clear_name, mapping_quality, strand, UMI)
# First:
# Get the original number of transcripts (reads)
reads = list(reads)
read_count = len(reads)
if not diable_umi:
# Compute unique transcripts (based on UMI, strand and start position +- threshold)
Expand Down
5 changes: 2 additions & 3 deletions stpipeline/common/fastq_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from stpipeline.common.sam_utils import convert_to_AlignedSegment
from stpipeline.common.stats import qa_stats
import logging
from itertools import izip
from sqlitedict import SqliteDict
import os
import re
Expand All @@ -20,7 +19,7 @@ def coroutine(func):
"""
def start(*args, **kwargs):
cr = func(*args, **kwargs)
cr.next()
cr.__next__()
return cr
return start

Expand Down Expand Up @@ -125,7 +124,7 @@ def quality_trim_index(bases, qualities, cutoff, base=33):
s = 0
max_qual = 0
max_i = len(qualities)
for i in reversed(xrange(max_i)):
for i in reversed(range(max_i)):
q = ord(qualities[i]) - base
if bases[i] == 'G':
q = cutoff - 1
Expand Down
13 changes: 6 additions & 7 deletions stpipeline/common/filterInputReads.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ from stpipeline.common.fastq_utils import *
from stpipeline.common.sam_utils import convert_to_AlignedSegment
from stpipeline.common.adaptors import removeAdaptor
from stpipeline.common.stats import qa_stats
from itertools import izip

bam_header = {
'HD': {'VN': '1.5', 'SO':'unsorted'},
Expand Down Expand Up @@ -86,11 +85,11 @@ def InputReadsFilter(fw,
cdef bint keep_discarded_files = out_rv_discarded is not None

# Build fake sequence adaptors with the parameters given
cdef str adaptorA = "".join("A" for k in xrange(polyA_min_distance))
cdef str adaptorT = "".join("T" for k in xrange(polyT_min_distance))
cdef str adaptorG = "".join("G" for k in xrange(polyG_min_distance))
cdef str adaptorC = "".join("C" for k in xrange(polyC_min_distance))
cdef str adaptorN = "".join("N" for k in xrange(polyN_min_distance))
cdef str adaptorA = "".join("A" for k in range(polyA_min_distance))
cdef str adaptorT = "".join("T" for k in range(polyT_min_distance))
cdef str adaptorG = "".join("G" for k in range(polyG_min_distance))
cdef str adaptorC = "".join("C" for k in range(polyC_min_distance))
cdef str adaptorN = "".join("N" for k in range(polyN_min_distance))

# Not recommended to do adaptor trimming for adaptors smaller than 5
cdef bint do_adaptorA = polyA_min_distance >= 5
Expand Down Expand Up @@ -133,7 +132,7 @@ def InputReadsFilter(fw,
out_rv_writer_discarded = writefq(out_rv_handle_discarded)

for (header_fw, sequence_fw, quality_fw), \
(header_rv, sequence_rv, quality_rv) in izip(readfq(fw_file), readfq(rv_file)):
(header_rv, sequence_rv, quality_rv) in zip(readfq(fw_file), readfq(rv_file)):

discard_read = False
orig_sequence_rv, orig_quality_rv = sequence_rv, quality_rv
Expand Down
8 changes: 4 additions & 4 deletions stpipeline/common/saturation.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def computeSaturation(nreads,
else:
# Create a list of 15 saturation points (different number of reads)
saturation_points = list()
for x in xrange(0,15):
for x in range(0,15):
spoint = int(math.floor(1e5 + (math.exp(x) * 1e5)))
if spoint >= int(nreads):
break
Expand All @@ -95,7 +95,7 @@ def computeSaturation(nreads,
file_names[spoint] = file_name
files[spoint] = output_sam
# Generate a list of indexes in the sam file to extract sub samples
indices = list(xrange(int(nreads)))
indices = list(range(int(nreads)))
random.shuffle(indices)
subbed = indices[0:spoint]
subbed.sort()
Expand All @@ -115,7 +115,7 @@ def computeSaturation(nreads,

# Close the files
annotated_sam.close()
for file_sam in files.itervalues():
for file_sam in list(files.values()):
file_sam.close()

# Compute saturation points by calling createDataset on each file
Expand Down Expand Up @@ -151,7 +151,7 @@ def computeSaturation(nreads,
saturation_points_average_reads.append(stats.average_reads_feature)

# Remove the files
for file_sam in file_names.itervalues():
for file_sam in list(file_names.values()):
safeRemove(file_sam)

# Update the log with the computed saturation points
Expand Down
2 changes: 1 addition & 1 deletion stpipeline/common/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
the pipeline run.
"""
import json

class Stats():
"""
Stats is meant to be used
Expand Down
6 changes: 3 additions & 3 deletions stpipeline/common/unique_events_parser.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ class geneBuffer():
If so the gene will be returned in a list and deleted from the buffer
:param empty: when True if forces to empty the buffer
"""
cdef list _tmp = self.buffer.keys()
cdef list _tmp = list(self.buffer.keys())
cdef gene_transcripts = list()
cdef str chrom
cdef int end_position
Expand Down Expand Up @@ -219,7 +219,7 @@ def parse_unique_events(input_file, gff_filename=None):
transcript = (chrom, start, end, clear_name, mapping_quality, strand, umi)
if gff_filename is not None:
genes_buffer.add_transcript(gene, (x,y), transcript, rec.reference_start)
for g, t in genes_buffer.check_and_clear_buffer():
for g, t in list(genes_buffer.check_and_clear_buffer()):
yield (g, t)
else:
try:
Expand All @@ -236,6 +236,6 @@ def parse_unique_events(input_file, gff_filename=None):
for g, t in genes_buffer.check_and_clear_buffer(True):
yield (g, t)
else:
for (g,t) in genes_dict.iteritems():
for (g,t) in list(genes_dict.items()):
yield (g,t)

15 changes: 7 additions & 8 deletions stpipeline/core/annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
import operator
import math
import time
from itertools import izip

class UnknownChrom( Exception ):
pass
Expand All @@ -27,7 +26,7 @@ def invert_strand( iv ):
elif iv2.strand == "-":
iv2.strand = "+"
else:
raise ValueError, "Illegal strand"
raise ValueError("Illegal strand")
return iv2

def count_reads_in_features(sam_filename,
Expand Down Expand Up @@ -112,10 +111,10 @@ def write_to_samout(read, assignment):
try:
feature_id = f.attr[id_attribute]
except KeyError:
raise ValueError, ("Feature %s does not contain a '%s' attribute" \
raise ValueError("Feature %s does not contain a '%s' attribute" \
% (f.name, id_attribute))
if stranded != "no" and f.iv.strand == ".":
raise ValueError, ("Feature %s at %s does not have strand information but you are " \
raise ValueError("Feature %s at %s does not have strand information but you are " \
"running htseq-count in stranded mode. Use '--stranded=no'." %
(f.name, f.iv))
features[f.iv] += feature_id
Expand All @@ -124,19 +123,19 @@ def write_to_samout(read, assignment):
raise

if len(counts) == 0:
raise RuntimeError, "No features of type '%s' found.\n" % feature_type
raise RuntimeError("No features of type '%s' found.\n" % feature_type)

if samtype == "sam":
SAM_or_BAM_Reader = HTSeq.SAM_Reader
elif samtype == "bam":
SAM_or_BAM_Reader = HTSeq.BAM_Reader
else:
raise ValueError, "Unknown input format %s specified." % samtype
raise ValueError("Unknown input format %s specified." % samtype)

try:
read_seq = SAM_or_BAM_Reader(sam_filename)
except:
raise RuntimeError, "Error occurred when reading beginning of SAM/BAM file."
raise RuntimeError("Error occurred when reading beginning of SAM/BAM file.")

try:

Expand Down Expand Up @@ -173,7 +172,7 @@ def write_to_samout(read, assignment):
else:
fs = fs.intersection(fs2)
else:
raise RuntimeError, "Illegal overlap mode."
raise RuntimeError("Illegal overlap mode.")

if fs is None or len(fs) == 0:
write_to_samout(r, "__no_feature")
Expand Down
Loading