-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscripting.py
More file actions
127 lines (105 loc) · 5.68 KB
/
scripting.py
File metadata and controls
127 lines (105 loc) · 5.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# Used to script actions embedding sequencer.
import modules.cluster_mapping
import modules.embedding_generation
import modules.file_io
import time
import pandas as pd
import os
def run_pipeline(input_path, hdf_file = "", output_type = "f", print_flag = False, output_path = "fasta_output"):
'''Manages the scripting of actions for the embedding sequencer pipeline.
Parameters:
input_path: Input of the pipeline, takes FASTA, directory, or CSV/TSV
hdf_file: Clustering mapping information as an HDF file
output_type: Determines the output of the pipeline (f - FASTA, a - ALN, t - TSV file)
print_flag: If True, prints progress updates within the pipeline.
output_path: Determines the output folder
'''
if print_flag: print("Starting Embedding Sequencer Pipeline...\n")
# timer
start_time = round(time.perf_counter(), 4)
# parse input, build input list, and determine input/output
input_type = modules.file_io.determine_input(input_path) # 0 - FASTA, 1 - directory, 2 - CSV/TSV
if input_type == 0:
input_df = modules.file_io.read_fasta(input_path)
elif input_type == 1:
input_df = modules.file_io.read_directory_fastas(input_path)
elif input_type == 2:
input_df = modules.file_io.read_csv_tsv(input_path)
else:
raise ValueError("Input is not an acceptable format. Please enter a path for a FASTA, directory of FASTAs, or CSV/TSV.")
# check input and output_type
modules.file_io.validate_input_df(input_df) # checks df for columns and non-empty
if print_flag: print(f"Successfully inputted query of {len(input_df)} proteins.\n")
if print_flag: print(input_df + "\n")
# extract clustering information from HDF file and build FAISS index
aggregated_embeddings, saved_pca, cluster_labels, _, indicative_pattern, _, model_version = modules.cluster_mapping.unpack_hdf(hdf_file)
if print_flag: print(f"Shape of Embeddings: {aggregated_embeddings.shape}")
faiss_index = modules.cluster_mapping.build_faiss_index(aggregated_embeddings)
if print_flag: print(f"Unpacked clustering information from {hdf_file} and built FAISS index.")
# establish/download model
model, converter_or_tokenizer = modules.embedding_generation.download_model(version = model_version)
if print_flag: print("Finished downloading and setting up protein language model.\n")
# start output list
output_data = []
num_neighbors = 50
if print_flag: print("Generating Embeddings Sequences for Query Proteins...")
# LOOP
count = 0
spacing = len(str(len(input_df)))
skipped = []
for _, row in input_df.iterrows():
count += 1
entry, name, sequence = row["Entry"], row["Entry Name"], row["Sequence"]
if len(sequence) > 100000:
skipped.append(name)
if print_flag: print(f" {(count):>{spacing}}" + f" - {name} - Sequence too long, skipping.")
continue
# generate embeddings
query_embeddings = modules.embedding_generation.generate_embeddings(sequence, model, converter_or_tokenizer, version = model_version)
# apply PCA dimensionality reduction
query_embeddings = modules.cluster_mapping.apply_pca(query_embeddings, saved_pca)
# if print_flag: print(f" {name} - {query_embeddings.shape}")
# perform ANN with N neighbors
faiss_similarity, faiss_indices = modules.cluster_mapping.search_faiss_index(query_embeddings, faiss_index, num_neighbors = num_neighbors)
# determine cluster-label sequence
query_sequence, outlier_dict = modules.cluster_mapping.build_faiss_sequence(faiss_similarity, faiss_indices, cluster_labels, num_neighbors = num_neighbors)
# calculate outlier percentage
# sequence_confidence = modules.cluster_mapping.calculate_confidence(outlier_dict, faiss_similarity)
# if print_flag: print(f" {name} - {len(outlier_dict)} outliers with {sequence_confidence}% confidence.")
# find repeat locations
pattern_indexes = modules.embedding_generation.find_pattern(query_sequence, indicative_pattern)
if pattern_indexes: # check if pattern indexes if empty
start_pos, end_pos = pattern_indexes[0] + 1, pattern_indexes[-1] + 24
else:
start_pos, end_pos = -1, -1
# save to output list
data_entry = {
"Entry" : entry,
"Entry Name" : name,
"Sequence" : query_sequence,
# "Sequence Confidence" : sequence_confidence,
"Repeat Locations" : [x + 1 for x in pattern_indexes], # indexes to residue count
"Number of Repeats" : len(pattern_indexes),
"Start" : start_pos,
"End" : end_pos
}
output_data.append(data_entry)
if print_flag: print(f" {(count):>{spacing}}" + f" - {name} - {query_sequence[:50]}...")
output_df = pd.DataFrame(output_data)
if print_flag: print("Finished Generating Embeddings.\n")
# if print_flag: print(output_df)
# output
if output_type == "f": # FASTA
modules.file_io.write_fastas_to_directory(output_df, output_directory = f"{output_path}/")
elif output_type == "a": # ALN
modules.file_io.write_alns_to_directory(input_df, output_df, output_directory = f"{output_path}/")
elif output_type == "t": # TSV
modules.file_io.write_tsv(output_df, output_directory = f"{output_path}/")
else:
raise ValueError("Invalid output type entered.")
end_time = round(time.perf_counter(), 4)
runtime = round(end_time - start_time, 4)
if print_flag:
print(f"Program Completed. Exiting with Total Runtime of {runtime} seconds.")
print(f"Skipped: {skipped}")
return