-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathref_diff.py
More file actions
executable file
·129 lines (113 loc) · 5.56 KB
/
ref_diff.py
File metadata and controls
executable file
·129 lines (113 loc) · 5.56 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/env python
"""
Name: ref_diff.py
Purpose: Use tophat and deseq to compare RNA-seq results from two reference
genome versions and create ready to use read count lists for
downstream analysis. The tool is mainly designed to compare data
from GRCh37/hg19 and GRCh38/hg38 but can be generalized with little
effort.
Input: -Previous bam alignment
-Previous bam unmapped reads
-Raw reads in fastq format
-old indexed reference (bowtie2/tophat index)
-old indexed transcript (bowtie2/tophat index)
-new indexed reference (bowtie2/tophat index)
-new indexed transcript (bowtie2/tophat index)
Output: -New reference alignment
-New reference unmapped bam
-Differential analysis between previous reference and new reference
-Read counts (input to DESeq) from hg38 alignment
Usage: ref_diff.py [OPTIONS]
"""
import os
import re
import sys
import argparse
from subprocess import call
def main(fq1, fq2, tran_index, genome_index, out_dir, threads, old_bam, old_trans):
#softlink old bam to results directory
os.symlink(old_bam, out_dir + "/previous.bam")
old_bam_link = out_dir + "/previous.bam"
#run tophat alignment with new bam and ref
new_bam = run_tophat(fq1, fq2, tran_index, genome_index, out_dir, threads)
#name sort old alignment
sorted_old = sort_bam(old_bam_link, threads)
#name sort new alignment
sorted_new = sort_bam(new_bam, threads)
#htseq-count old alignment
old_counts = run_htseqcount(sorted_old, old_trans)
#htseq-count new alignment
new_counts = run_htseqcount(sorted_new, tran_index+'.gtf')
#run intersect counts script
old_i,new_i = intersect_counts(old_counts, new_counts, old_trans, tran_index+'.gtf')
#run deseq script
deseq_result = run_deseq(old_i, new_i, out_dir)
#run annotation script
annotate_result(deseq_result, old_i, new_i, tran_index+'.gtf')
def run_tophat(fq1, fq2, transcript_index, genome_index, out_dir, threads):
check_result(call(["tophat",
"--read-mismatches=3",
"--read-edit-dist=3",
"--max-multihits=20",
"--splice-mismatches=1",
"--output-dir="+out_dir,
"--transcriptome-index="+transcript_index,
"--coverage-search",
"--mate-inner-dist=50",
"--microexon-search",
"--mate-std-dev=50",
"--num-threads="+str(threads),
genome_index,
fq1,
fq2]))
return out_dir+'/accepted_hits.bam'
def run_htseqcount(bam, gtf):
counts = os.path.splitext(bam)[0] + '.counts.txt'
count_fh = open(counts, 'w')
check_result(call(["htseq-count","-f","bam","-s","no","-r","name", bam, gtf], stdout=count_fh))
count_fh.close()
return counts
def run_deseq(old, new, out_dir):
deseq_script = os.path.dirname(os.path.abspath(__file__)) + '/deseq.R'
check_result(call(["Rscript",deseq_script,out_dir,old,new,"results"]))
return out_dir + '/DESeq2.results.csv'
def intersect_counts(old, new, o_i, n_i):
intersect_script = os.path.dirname(os.path.abspath(__file__)) + '/intersect_gene_lists.pl'
check_result(call([intersect_script,"-o",old,"-n",new,"-g",o_i,"-G",n_i]))
return old + '.intersect', new + '.intersect'
def annotate_result(deseq, old, new, gtf):
annotate_script = os.path.dirname(os.path.abspath(__file__)) + '/intersect_results_annotate.pl'
output = os.path.splitext(deseq)[0] + '.annotated.csv'
out_fh = open(output, 'w')
check_result(call([annotate_script,"-d",deseq,"-h",old,"-H",new,"-G",gtf], stdout=out_fh))
out_fh.close()
def sort_bam(bam, threads):
sort_prefix = os.path.splitext(bam)[0] + '.name'
check_result(call(["samtools","sort","-n","-@",str(threads),bam,sort_prefix]))
return sort_prefix+'.bam'
def verify_file(file):
if not os.path.isfile(file):
raise IOError("The File %s does not exist!" % file)
def check_result(ret_code):
if ret_code != 0:
raise RuntimeError("Subprocess failed %s" % ret_code)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description = "Use tophat and deseq to compare RNA-seq results from two reference genome versions and create ready to use read count lists for downstream analysis. The tool is mainly designed to compare data from GRCh37/hg19 and GRCh38/hg38 but can be generalized with little effort.")
parser.add_argument('output_dir', help='Output directory')
parser.add_argument('--oldbam', '-b', help='Previous bam alignment', required=True)
parser.add_argument('--fq1', '-1',help='Fastq formatted reads, read 1', required=True)
parser.add_argument('--fq2', '-2', help='Fastq formatted reads, read 2', required=True)
parser.add_argument('--oldgtf', '-t', help='old transcriptome gtf file', required=True)
parser.add_argument('--newref', '-R', help='new indexed reference', required=True)
parser.add_argument('--newtrans', '-T', help='new indexed transcript', required=True)
parser.add_argument('--threads', help='number of threads for multithreaded steps [%(default)s]', default=1)
args = parser.parse_args()
os.mkdir(args.output_dir)
verify_file(args.oldbam)
verify_file(args.fq1)
verify_file(args.fq2)
verify_file(args.oldgtf)
verify_file(args.newtrans+'.gtf')
main(args.fq1, args.fq2, args.newtrans, args.newref, args.output_dir,
args.threads, args.oldbam, args.oldgtf)