diff --git a/.travis.yml b/.travis.yml
index 8ceb0e2..8aeafc8 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -20,8 +20,12 @@ before_install:
   - export TMPDIR=$HOME/tmp
 
 install:
-  - conda create -q -n testenv --yes python=2.7 numpy scipy pandas
+  - conda create -q -n testenv --yes python=2.7 numpy scipy pandas cython
   - source activate testenv
+  - conda config --add channels r
+  - conda config --add channels bioconda
+  - conda install pysam
+  - python setup.py build
   - python setup.py install
 
 script:
diff --git a/setup.py b/setup.py
index 8242c23..e0e2f13 100755
--- a/setup.py
+++ b/setup.py
@@ -14,6 +14,7 @@ import sys
 
 from setuptools import setup, find_packages
 from stpipeline.version import version_number
+from Cython.Build import cythonize
 
 # Get the long description from the relevant file
 here = os.path.abspath(os.path.dirname(__file__))
@@ -41,7 +42,8 @@
   author_email = 'jose.fernandez.navarro@scilifelab.se',
   license = 'MIT',
   url = 'https://github.com/SpatialTranscriptomicsResearch/st_pipeline',
-  packages = find_packages(exclude=('tests*', 'utils')),
+  packages = find_packages(exclude=('tests*', 'utils', "*.pyx")),
+  ext_modules = cythonize("stpipeline/common/*.pyx"),
   include_package_data = False,
   package_data = {'': ['RELEASE-VERSION']},
   zip_safe = False,
diff --git a/stpipeline/common/cdistance.pyx b/stpipeline/common/cdistance.pyx
new file mode 100644
index 0000000..18d8b8f
--- /dev/null
+++ b/stpipeline/common/cdistance.pyx
@@ -0,0 +1,10 @@
+cpdef int hamming_distance(a, b):
+    cdef char * aa = a
+    cdef char * bb = b
+    cdef int k, l, c
+    c = 0
+    l = len(a)
+    for k from 0 <= k < l:
+        if aa[k] != bb[k]:
+            c += 1
+    return c
\ No newline at end of file
diff --git a/stpipeline/common/clustering.py b/stpipeline/common/clustering.py
index 7a8c683..b8de564 100755
--- a/stpipeline/common/clustering.py
+++ b/stpipeline/common/clustering.py
@@ -6,7 +6,6 @@
 import numpy as np
 from scipy.cluster.hierarchy import linkage,fcluster
 from collections import defaultdict
-import pyximport; pyximport.install()
 from stpipeline.common.cdistance import hamming_distance
 import random
 from collections import Counter
diff --git a/stpipeline/common/distance.py b/stpipeline/common/distance.py
deleted file mode 100755
index d5e835a..0000000
--- a/stpipeline/common/distance.py
+++ /dev/null
@@ -1,19 +0,0 @@
-"""
-This module contains some functions for computing distance between
-sequences
-"""
-
-def hamming_distance(s1, s2):
-    """
-    Returns the Hamming distance between equal-length sequences.
-    :param s1: the first string/sequence
-    :param s1: the second string/sequence
-    :type s1: str
-    :type s2: str
-    :return: the number of number of different elements in both sequences
-    :rtype: int
-    :raises: ValueError
-    """
-    if len(s1) != len(s2):
-        raise ValueError("Undefined for sequences of unequal length")
-    return sum(ch1 != ch2 for ch1, ch2 in zip(s1, s2))
diff --git a/stpipeline/core/pipeline.py b/stpipeline/core/pipeline.py
index 9901077..407fcaf 100755
--- a/stpipeline/core/pipeline.py
+++ b/stpipeline/core/pipeline.py
@@ -23,6 +23,7 @@
 import bz2
 import tempfile
 import shutil
+import gc
 
 FILENAMES = {"mapped" : "mapped.bam",
              "annotated" : "annotated.bam",
@@ -687,8 +688,8 @@ def run(self):
             raise
 
         #=================================================================
-        # STEP: OBTAIN HASH OF DEMULTIPLEXED READS
-        # Hash demultiplexed reads to obtain a hash of read_name => (barcode,x,y,umi)
+        # STEP: OBTAIN DICT OF DEMULTIPLEXED READS
+        # Iterate demultiplexed FASTQ reads to obtain a dict of read_name => (x,y,umi)
         #=================================================================
         self.logger.info("Parsing demultiplexed reads {}".format(globaltime.getTimestamp()))
         hash_reads = hashDemultiplexedReads(FILENAMES["demultiplexed_matched"],
@@ -697,7 +698,7 @@ def run(self):
                                            self.low_memory)
 
         #================================================================
-        # STEP: filters mapped reads and add the (Barcode,x,y,umi) as SAM tags
+        # STEP: filter mapped reads and add the (x,y,umi) as extra SAM tags
         #================================================================
         self.logger.info("Starting processing aligned reads {}".format(globaltime.getTimestamp()))
         try:
@@ -709,8 +710,13 @@
         except Exception:
             raise
         finally:
-            if self.low_memory: hash_reads.close()
-
+            if self.low_memory: hash_reads.close()
+            else:
+                # Explicitly free the memory used by hash_reads
+                hash_reads.clear()
+                del hash_reads
+                gc.collect()
+
         #=================================================================
         # STEP: annotate using htseq-count
         #=================================================================
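
Note on the new compiled hamming_distance: with the on-the-fly pyximport
compilation removed from clustering.py, the .pyx module is now compiled ahead
of time by the cythonize() call in setup.py (hence the added cython build
dependency in .travis.yml). Below is a minimal sanity-check sketch, assuming
the extension has been built first (e.g. with `python setup.py build_ext
--inplace`). The reference function mirrors the deleted
stpipeline/common/distance.py; note that, unlike it, the Cython version
performs no length check, so callers must pass equal-length byte strings
(plain str under the project's Python 2.7 target):

    from stpipeline.common.cdistance import hamming_distance

    def py_hamming_distance(s1, s2):
        # Reference copy of the removed pure-Python implementation
        if len(s1) != len(s2):
            raise ValueError("Undefined for sequences of unequal length")
        return sum(ch1 != ch2 for ch1, ch2 in zip(s1, s2))

    # Both implementations must agree on equal-length sequences
    assert hamming_distance("ACGT", "ACGA") == py_hamming_distance("ACGT", "ACGA") == 1
    assert hamming_distance("AAAA", "AAAA") == 0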
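
Note on the explicit cleanup in pipeline.py: when low_memory is off,
hash_reads is a plain in-memory dict with one entry per demultiplexed read,
so it can grow very large. clear() plus del drop the references as soon as
the aligned reads have been processed, and gc.collect() additionally sweeps
anything kept alive by reference cycles before the annotation step starts.
A minimal sketch of the same pattern (the dict contents here are
hypothetical, not the pipeline's real payload):

    import gc

    hash_reads = {"read_%d" % i: (0, 0, "UMI") for i in xrange(10 ** 6)}
    # ... consume hash_reads ...
    hash_reads.clear()  # drop all entries
    del hash_reads      # drop the last reference to the dict itself
    gc.collect()        # sweep anything kept alive by reference cycles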