Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -468,4 +468,7 @@ Version 1.7.2

Version 1.7.3
* Fixed a small bug when sorting the output of taggd
* Improved the st_qa.py script
* Improved the st_qa.py script

Version 1.7.5
* Ported to Python 3
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@ The ST pipeline will also output a log file with useful information.
**Installation**

We recommend you install a virtual environment like Pyenv or Anaconda before you install the pipeline.
The ST Pipeline works with python 2.7.

The ST Pipeline works with Python 3.6 or higher.

You can install the ST Pipeline from PyPI:

Expand Down Expand Up @@ -84,7 +85,7 @@ To see the different options type

An example run would be

st_pipeline_run.py --ids ids_file.txt --ref-map path_to_index --log-file log_file.txt --output-folder /home/me/results --ref-annotation annotation_file.gtf file1.fastq file2.fastq
st_pipeline_run.py --expName test --ids ids_file.txt --ref-map path_to_index --log-file log_file.txt --output-folder /home/me/results --ref-annotation annotation_file.gtf file1.fastq file2.fastq

**Ensembl ids**

Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ scipy
scikit-learn
sqlitedict
regex
taggd>=0.3.3
taggd>=0.3.6
HTSeq>=0.7.1
pysam>=0.7.4
setuptools
Expand Down
2 changes: 1 addition & 1 deletion scripts/filter_gene_type_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def main(counts_matrix, gene_types_keep, outfile, annotation, ensembl_ids):
if len(genes_drop) > 0:
counts_table.drop(genes_drop, axis=1, inplace=True)
else:
print "Not a single gene could be discarded..."
print("Not a single gene could be discarded...")
# Write filtered table
counts_table.to_csv(outfile, sep='\t')

Expand Down
17 changes: 9 additions & 8 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import sys
from setuptools import setup, find_packages
from stpipeline.version import version_number
from distutils.extension import Extension
from distutils.core import setup, Extension

here = os.path.abspath(os.path.dirname(__file__))

Expand All @@ -33,14 +33,14 @@
raise SystemExit("Could not find requirements.txt file")

major, minor1, minor2, s, tmp = sys.version_info
if major != 2 or minor1 < 7:
raise SystemExit("ST Pipeline requires Python 2.7.x")
if major != 3 or minor1 < 6:
raise SystemExit("ST Pipeline requires Python 3.6 or bigger")

# setuptools DWIM monkey-patch madness
# http://mail.python.org/pipermail/distutils-sig/2007-September/thread.html#8204
if 'setuptools.extension' in sys.modules:
m = sys.modules['setuptools.extension']
m.Extension.__dict__ = m._Extension.__dict__
#if 'setuptools.extension' in sys.modules:
# m = sys.modules['setuptools.extension']
# m.Extension.__dict__ = m._Extension.__dict__

setup(
name = 'stpipeline',
Expand Down Expand Up @@ -68,9 +68,10 @@
'Development Status :: 5 - Production/Stable',
'Intended Audience :: Science/Research',
'Topic :: Software Development',
'Topic :: Scientific/Engineering :: Bio-Informatics',
'Topic :: Scientific/Engineering :: Bionformatics :: Spatial Transcriptomics',
'License :: OSI Approved :: MIT License',
'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Operating System :: Unix',
'Operating System :: MacOS',
'Environment :: Console',
Expand Down
19 changes: 12 additions & 7 deletions stpipeline/common/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ def countUMIHierarchical(molecular_barcodes,
# Distance computation function
def d(coord):
i,j = coord
return hamming_distance(molecular_barcodes[i], molecular_barcodes[j])
return hamming_distance(molecular_barcodes[i].encode("UTF-8"),
molecular_barcodes[j].encode("UTF-8"))
# Create hierarchical clustering and obtain flat clusters at the distance given
indices = np.triu_indices(len(molecular_barcodes), 1)
distance_matrix = np.apply_along_axis(d, 0, indices)
Expand All @@ -45,7 +46,7 @@ def d(coord):
items = defaultdict(list)
for i, item in enumerate(flat_clusters):
items[item].append(i)
return [molecular_barcodes[random.choice(members)] for members in items.itervalues()]
return [molecular_barcodes[random.choice(members)] for members in list(items.values())]

def countUMINaive(molecular_barcodes, allowed_mismatches):
"""
Expand Down Expand Up @@ -73,13 +74,14 @@ def countUMINaive(molecular_barcodes, allowed_mismatches):
# compare distant of previous molecular barcodes and new one
# if distance is between threshold we add it to the cluster
# otherwise we create a new cluster
if hamming_distance(clusters_dict[nclusters][-1], molecular_barcode) <= allowed_mismatches:
if hamming_distance(clusters_dict[nclusters][-1].encode("UTF-8"),
molecular_barcode.encode("UTF-8")) <= allowed_mismatches:
clusters_dict[nclusters].append(molecular_barcode)
else:
nclusters += 1
clusters_dict[nclusters] = [molecular_barcode]
# Return the non clustered UMIs
return [random.choice(members) for members in clusters_dict.itervalues()]
return [random.choice(members) for members in list(clusters_dict.values())]

def breadth_first_search(node, adj_list):
""" This function has been obtained from
Expand Down Expand Up @@ -118,7 +120,8 @@ def dedup_adj(molecular_barcodes, allowed_mismatches):
c = Counter(molecular_barcodes)

def get_adj_list_adjacency(umis):
return {umi: [umi2 for umi2 in umis if hamming_distance(umi, umi2) \
return {umi: [umi2 for umi2 in umis if hamming_distance(umi.encode("UTF-8"),
umi2.encode("UTF-8")) \
<= allowed_mismatches] for umi in umis}

def get_connected_components_adjacency(graph, Counter):
Expand Down Expand Up @@ -163,7 +166,8 @@ def dedup_dir_adj(molecular_barcodes, allowed_mismatches):
c = Counter(molecular_barcodes)

def get_adj_list_directional_adjacency(umis, counts):
return {umi: [umi2 for umi2 in umis if hamming_distance(umi, umi2) <= allowed_mismatches and
return {umi: [umi2 for umi2 in umis if hamming_distance(umi.encode("UTF-8"),
umi2.encode("UTF-8")) <= allowed_mismatches and
counts[umi] >= (counts[umi2]*2)-1] for umi in umis}

def get_connected_components_adjacency(graph, Counter):
Expand Down Expand Up @@ -196,7 +200,8 @@ def affinity_umi_removal(molecular_barcodes, _):
if len(molecular_barcodes) <= 2:
return countUMINaive(molecular_barcodes, allowed_mismatches)
words = np.asarray(molecular_barcodes)
lev_similarity = -1 * np.array([[hamming_distance(w1,w2) for w1 in words] for w2 in words])
lev_similarity = -1 * np.array([[hamming_distance(w1.encode("UTF-8"),
w2.encode("UTF-8")) for w1 in words] for w2 in words])
affprop = AffinityPropagation(affinity="precomputed", damping=0.5)
affprop.fit(lev_similarity)
unique_clusters = list()
Expand Down
13 changes: 7 additions & 6 deletions stpipeline/common/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,22 +29,22 @@ def computeUniqueUMIs(transcripts, umi_counting_offset, umi_allowed_mismatches,
# size variability and then group the rest of transcripts normally by (strand, start, position).
unique_transcripts = list()
num_transcripts = len(transcripts)
for i in xrange(num_transcripts - 1):
for i in range(num_transcripts - 1):
current = sorted_transcripts[i]
nextone = sorted_transcripts[i + 1]
grouped_transcripts[current[6]].append(current)
if abs(current[1] - nextone[1]) > umi_counting_offset or current[5] != nextone[5]:
# A new group has been reached (strand, start-pos, offset)
# Compute unique UMIs by hamming distance
unique_umis = group_umi_func(grouped_transcripts.keys(), umi_allowed_mismatches)
unique_umis = group_umi_func(list(grouped_transcripts.keys()), umi_allowed_mismatches)
# Choose 1 random transcript for the clustered transcripts (by UMI)
unique_transcripts += [random.choice(grouped_transcripts[u_umi]) for u_umi in unique_umis]
# Reset the container
grouped_transcripts = defaultdict(list)
# We process the last one and more transcripts if they were not processed
lastone = sorted_transcripts[num_transcripts - 1]
grouped_transcripts[lastone[6]].append(lastone)
unique_umis = group_umi_func(grouped_transcripts.keys(), umi_allowed_mismatches)
unique_umis = group_umi_func(list(grouped_transcripts.keys()), umi_allowed_mismatches)
unique_transcripts += [random.choice(grouped_transcripts[u_umi]) for u_umi in unique_umis]
return unique_transcripts

Expand Down Expand Up @@ -124,18 +124,19 @@ def createDataset(input_file,
list_row_values = list()
list_indexes = list()

# Parse unique events to generate the unique counts and the BED file
# Parse unique events to generate the unique counts and the BED file
unique_events = parse_unique_events(input_file, gff_filename)
with open(os.path.join(output_folder, filenameReadsBED), "w") as reads_handler:
# this is the generator returning a dictionary with spots for each gene
for gene, spots in unique_events:
for gene, spots in unique_events:
transcript_counts_by_spot = {}
for spot_coordinates, reads in spots.iteritems():
for spot_coordinates, reads in list(spots.items()):
(x,y) = spot_coordinates
# Re-compute the read count accounting for duplicates using the UMIs
# Transcripts is the list of transcripts (chrom, start, end, clear_name, mapping_quality, strand, UMI)
# First:
# Get the original number of transcripts (reads)
reads = list(reads)
read_count = len(reads)
if not diable_umi:
# Compute unique transcripts (based on UMI, strand and start position +- threshold)
Expand Down
5 changes: 2 additions & 3 deletions stpipeline/common/fastq_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from stpipeline.common.sam_utils import convert_to_AlignedSegment
from stpipeline.common.stats import qa_stats
import logging
from itertools import izip
from sqlitedict import SqliteDict
import os
import re
Expand All @@ -20,7 +19,7 @@ def coroutine(func):
"""
def start(*args, **kwargs):
cr = func(*args, **kwargs)
cr.next()
cr.__next__()
return cr
return start

Expand Down Expand Up @@ -125,7 +124,7 @@ def quality_trim_index(bases, qualities, cutoff, base=33):
s = 0
max_qual = 0
max_i = len(qualities)
for i in reversed(xrange(max_i)):
for i in reversed(range(max_i)):
q = ord(qualities[i]) - base
if bases[i] == 'G':
q = cutoff - 1
Expand Down
13 changes: 6 additions & 7 deletions stpipeline/common/filterInputReads.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ from stpipeline.common.fastq_utils import *
from stpipeline.common.sam_utils import convert_to_AlignedSegment
from stpipeline.common.adaptors import removeAdaptor
from stpipeline.common.stats import qa_stats
from itertools import izip

bam_header = {
'HD': {'VN': '1.5', 'SO':'unsorted'},
Expand Down Expand Up @@ -86,11 +85,11 @@ def InputReadsFilter(fw,
cdef bint keep_discarded_files = out_rv_discarded is not None

# Build fake sequence adaptors with the parameters given
cdef str adaptorA = "".join("A" for k in xrange(polyA_min_distance))
cdef str adaptorT = "".join("T" for k in xrange(polyT_min_distance))
cdef str adaptorG = "".join("G" for k in xrange(polyG_min_distance))
cdef str adaptorC = "".join("C" for k in xrange(polyC_min_distance))
cdef str adaptorN = "".join("N" for k in xrange(polyN_min_distance))
cdef str adaptorA = "".join("A" for k in range(polyA_min_distance))
cdef str adaptorT = "".join("T" for k in range(polyT_min_distance))
cdef str adaptorG = "".join("G" for k in range(polyG_min_distance))
cdef str adaptorC = "".join("C" for k in range(polyC_min_distance))
cdef str adaptorN = "".join("N" for k in range(polyN_min_distance))

# Not recommended to do adaptor trimming for adaptors smaller than 5
cdef bint do_adaptorA = polyA_min_distance >= 5
Expand Down Expand Up @@ -133,7 +132,7 @@ def InputReadsFilter(fw,
out_rv_writer_discarded = writefq(out_rv_handle_discarded)

for (header_fw, sequence_fw, quality_fw), \
(header_rv, sequence_rv, quality_rv) in izip(readfq(fw_file), readfq(rv_file)):
(header_rv, sequence_rv, quality_rv) in zip(readfq(fw_file), readfq(rv_file)):

discard_read = False
orig_sequence_rv, orig_quality_rv = sequence_rv, quality_rv
Expand Down
8 changes: 4 additions & 4 deletions stpipeline/common/saturation.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def computeSaturation(nreads,
else:
# Create a list of 15 saturation points (different number of reads)
saturation_points = list()
for x in xrange(0,15):
for x in range(0,15):
spoint = int(math.floor(1e5 + (math.exp(x) * 1e5)))
if spoint >= int(nreads):
break
Expand All @@ -95,7 +95,7 @@ def computeSaturation(nreads,
file_names[spoint] = file_name
files[spoint] = output_sam
# Generate a list of indexes in the sam file to extract sub samples
indices = list(xrange(int(nreads)))
indices = list(range(int(nreads)))
random.shuffle(indices)
subbed = indices[0:spoint]
subbed.sort()
Expand All @@ -115,7 +115,7 @@ def computeSaturation(nreads,

# Close the files
annotated_sam.close()
for file_sam in files.itervalues():
for file_sam in list(files.values()):
file_sam.close()

# Compute saturation points by calling createDataset on each file
Expand Down Expand Up @@ -151,7 +151,7 @@ def computeSaturation(nreads,
saturation_points_average_reads.append(stats.average_reads_feature)

# Remove the files
for file_sam in file_names.itervalues():
for file_sam in list(file_names.values()):
safeRemove(file_sam)

# Update the log with the computed saturation points
Expand Down
2 changes: 1 addition & 1 deletion stpipeline/common/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
the pipeline run.
"""
import json

class Stats():
"""
Stats is meant to be used
Expand Down
6 changes: 3 additions & 3 deletions stpipeline/common/unique_events_parser.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ class geneBuffer():
If so the gene will be returned in a list and deleted from the buffer
:param empty: when True if forces to empty the buffer
"""
cdef list _tmp = self.buffer.keys()
cdef list _tmp = list(self.buffer.keys())
cdef gene_transcripts = list()
cdef str chrom
cdef int end_position
Expand Down Expand Up @@ -219,7 +219,7 @@ def parse_unique_events(input_file, gff_filename=None):
transcript = (chrom, start, end, clear_name, mapping_quality, strand, umi)
if gff_filename is not None:
genes_buffer.add_transcript(gene, (x,y), transcript, rec.reference_start)
for g, t in genes_buffer.check_and_clear_buffer():
for g, t in list(genes_buffer.check_and_clear_buffer()):
yield (g, t)
else:
try:
Expand All @@ -236,6 +236,6 @@ def parse_unique_events(input_file, gff_filename=None):
for g, t in genes_buffer.check_and_clear_buffer(True):
yield (g, t)
else:
for (g,t) in genes_dict.iteritems():
for (g,t) in list(genes_dict.items()):
yield (g,t)

15 changes: 7 additions & 8 deletions stpipeline/core/annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
import operator
import math
import time
from itertools import izip

class UnknownChrom( Exception ):
pass
Expand All @@ -27,7 +26,7 @@ def invert_strand( iv ):
elif iv2.strand == "-":
iv2.strand = "+"
else:
raise ValueError, "Illegal strand"
raise ValueError("Illegal strand")
return iv2

def count_reads_in_features(sam_filename,
Expand Down Expand Up @@ -112,10 +111,10 @@ def write_to_samout(read, assignment):
try:
feature_id = f.attr[id_attribute]
except KeyError:
raise ValueError, ("Feature %s does not contain a '%s' attribute" \
raise ValueError("Feature %s does not contain a '%s' attribute" \
% (f.name, id_attribute))
if stranded != "no" and f.iv.strand == ".":
raise ValueError, ("Feature %s at %s does not have strand information but you are " \
raise ValueError("Feature %s at %s does not have strand information but you are " \
"running htseq-count in stranded mode. Use '--stranded=no'." %
(f.name, f.iv))
features[f.iv] += feature_id
Expand All @@ -124,19 +123,19 @@ def write_to_samout(read, assignment):
raise

if len(counts) == 0:
raise RuntimeError, "No features of type '%s' found.\n" % feature_type
raise RuntimeError("No features of type '%s' found.\n" % feature_type)

if samtype == "sam":
SAM_or_BAM_Reader = HTSeq.SAM_Reader
elif samtype == "bam":
SAM_or_BAM_Reader = HTSeq.BAM_Reader
else:
raise ValueError, "Unknown input format %s specified." % samtype
raise ValueError("Unknown input format %s specified." % samtype)

try:
read_seq = SAM_or_BAM_Reader(sam_filename)
except:
raise RuntimeError, "Error occurred when reading beginning of SAM/BAM file."
raise RuntimeError("Error occurred when reading beginning of SAM/BAM file.")

try:

Expand Down Expand Up @@ -173,7 +172,7 @@ def write_to_samout(read, assignment):
else:
fs = fs.intersection(fs2)
else:
raise RuntimeError, "Illegal overlap mode."
raise RuntimeError("Illegal overlap mode.")

if fs is None or len(fs) == 0:
write_to_samout(r, "__no_feature")
Expand Down
Loading