Caeno_Scan/filter_og_structure.py at main · AndersenLab/Caeno_Scan · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#!/usr/bin/env python3
import csv
import time

# For a given number of genes in an orthogroup, classify the structure
def classify_structure(n_sp):
    if n_sp == 0:
        c_sp = "none"
    if n_sp == 1:
        c_sp = "one"
    if n_sp > 1:
        c_sp = "many"
    return(c_sp)

# For a given orthogroup file, classify the structure of each orthogroup
# and return a dictionary of orthogroups by id and structure
def classify_orthogroups(sp1, sp2, sp3, og_file):
    orthogroups_dict = {}
    with open(og_file, 'r') as orthogroups:
        for line in orthogroups:
            og_id = line.rstrip("\n").split(": ")[0]
            seqs = line.rstrip("\n").split(": ")[1].split(" ")
            #Count the number of genes for each species
            n_sp1 = 0
            n_sp2 = 0
            n_sp3 = 0
            for seq in seqs:
                if seq.startswith(sp1):
                    n_sp1 = n_sp1 + 1
                if seq.startswith(sp2):
                    n_sp2 = n_sp2 + 1
                if seq.startswith(sp3):
                    n_sp3 = n_sp3 + 1
            #Classify structure
            c_sp1 = classify_structure(n_sp1)
            c_sp2 = classify_structure(n_sp2)
            c_sp3 = classify_structure(n_sp3)
            og_struct = c_sp1 + ":" + c_sp2 + ":" + c_sp3
            orthogroups_dict[og_id] = {"id":og_id, sp1:n_sp1, sp2:n_sp2, sp3:n_sp3, "structure":og_struct}
    return(orthogroups_dict)

def filter_orthogroups(og_dict, structure):
    #Filters the orthogroups by structure (e.g. "one:none:none")
    filtered_ogs = {}
    for og in og_dict.values():
        if og["structure"] == structure:
            filtered_ogs[og["id"]] = og
    return(filtered_ogs)

if __name__ == "__main__":

    #Define the path to the orthogroups file
    orthogroups = "input_data/all_species/orthogroups/02.21.22_orthogroups/Orthogroups.txt"

    #Define the transcript prefixes for each species used in GFF
    sp1 = "Transcript" #elegans
    sp2 = "QX1410"
    sp3 = "transcript_CTROP"

    #Classify the orthogroups
    og_dicts = classify_orthogroups(sp1, sp2, sp3, orthogroups)

    #Filter the orthogroups by structure
    structure = "one:one:one"

    filtered_ogs = filter_orthogroups(og_dicts, structure)

    #Convert the dictionary to a csv file
    #create a file name with today's date
    ## save todays date as a string
    today = time.strftime("%Y%m%d-%H%M%S")

    file_name = today + "_orthogroup_structures" + ".csv"
    col_name=["id", sp1, sp2, sp3, "structure"]
    with open(file_name, 'w') as csvFile: #Change the name of the output file here
            wr = csv.DictWriter(csvFile, fieldnames=col_name)
            wr.writeheader()
            for ele in og_dicts.values():
                wr.writerow(ele)

    #Save the filtered orthogroups to a file
    file_name = today + "_filtered_orthogroups" + ".csv"
    col_name=["id", sp1, sp2, sp3, "structure"]
    with open(file_name, 'w') as csvFile: #Change the name of the output file here
            wr = csv.DictWriter(csvFile, fieldnames=col_name)
            wr.writeheader()
            for ele in filtered_ogs.values():
                wr.writerow(ele)

    # Save the filtered orthogroups to a list of IDs
    file_name = today + "_filtered_orthogroup_ids" + ".txt"
    with open(file_name, 'w') as txtFile: #Change the name of the output file here
        for ele in filtered_ogs.values():
            txtFile.write(ele["id"] + "\n")