-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathConvertFASTAtoGTF.py
More file actions
92 lines (73 loc) · 3.4 KB
/
ConvertFASTAtoGTF.py
File metadata and controls
92 lines (73 loc) · 3.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 12 10:43:35 2018
Converts FASTA sequences into GTF records
for use with the Cell Ranger pipeline.
@author: a.senabouth
"""
# Import argument-parsing modules
import sys, getopt
# Import Biopython modules
from Bio import SeqIO
def parseInput(input_file):
# Read in fasta file
fasta_records = SeqIO.to_dict(SeqIO.parse(input_file, "fasta"))
fasta_lines = []
gtf_lines = []
# Template strings
fasta_header_template = ">{chromosome} dna:chromosome chromosome:GRCh38:{chromosome}:1:{length}:1 REF"
gtf_template = """{chromosome}\thavana\tgene\t1\t{length}\t.\t+\t.\tgene_id "{chromosome}"; gene_name "{chromosome}"; gene_source "ensembl_havana"; gene_biotype "protein_coding";\n{chromosome}\thavana\ttranscript\t1\t{length}\t.\t+\t.\tgene_id "{chromosome}"; transcript_id "{chromosome}"; gene_name "{chromosome}"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "{chromosome}"; transcript_source "havana";\n{chromosome}\thavana\texon\t1\t{length}\t.\t+\t.\tgene_id "{chromosome}"; transcript_id "{chromosome}"; exon_number "1"; gene_name "{chromosome}"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "{chromosome}"; transcript_source "havana"; exon_id "{chromosome}_exon";"""
# Iterate through record, create new record
for header, record in fasta_records.items():
seq = str(record.seq.lower())
seq_length = len(seq)
new_header = fasta_header_template.format(chromosome = header, length = seq_length)
new_gtf = gtf_template.format(chromosome = header, length = seq_length)
# Add to fasta_lines
fasta_lines.append(new_header)
fasta_lines.append(seq)
# Add to gtf_lines
gtf_lines.append(new_gtf)
return(fasta_lines, gtf_lines)
def main(argv):
# Put the variables here
input_file = None
output_filename = None
# Start checks
try:
opts, args = getopt.getopt(argv,"hi:o:",["ifile=", "ofile="])
except getopt.GetoptError:
print 'ConvertFASTAtoGTF.py -i <inputfile> -o <outputfilename>'
sys.exit(2)
for opt, arg in opts:
if opt == '-h':
print 'ConvertFASTAtoGTF.py -i <inputfile> -o <outputfilename>'
sys.exit()
elif opt in ("-i", "--ifile"):
input_file = arg
elif opt in ("-o", "--ofile"):
output_filename = arg
if input_file and output_filename:
return (input_file, output_filename)
else:
return None, None
if __name__ == "__main__":
# Call function to sort out arguments
input_file, output_filename = main(sys.argv[1:])
if not any([input_file, output_filename]):
print ("Please specify an input,fasta output and gtf output files.")
sys.exit(2)
# Call function to parse the input file
fasta_lines, gtf_lines = parseInput(input_file)
# Write processed data out
fasta_output = output_filename + ".fa"
gtf_output = output_filename + ".gtf"
# Add lines
fasta_lines = [line + "\n" for line in fasta_lines]
with open(fasta_output, "w") as fasta_handle:
fasta_handle.writelines(fasta_lines)
gtf_lines = [line + "\n" for line in gtf_lines]
with open(gtf_output, "w") as gtf_handle:
gtf_handle.writelines(gtf_lines)
print "Conversion complete!"