Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
226 changes: 156 additions & 70 deletions README.md

Large diffs are not rendered by default.

175 changes: 128 additions & 47 deletions reportree.py

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions scripts/alignment_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@
import datetime as datetime
from Bio import SeqIO, AlignIO, Align, Alphabet

version = "1.5.1"
last_updated = "2024-09-27"
version = "1.5.2"
last_updated = "2024-10-20"

# functions ----------

Expand Down
86 changes: 59 additions & 27 deletions scripts/metadata_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,12 @@
from pandas.api.types import is_datetime64_any_dtype as is_datetime
import datetime

version = "1.2.0"
last_updated = "2024-11-06"
version = "1.4.0"
last_updated = "2025-10-22"

# functions ----------

def partitions2metadata(partitions_name, partitions, mx_metadata, partitions2report, filters, log):
def partitions2metadata(partitions_name, partitions, mx_metadata, partitions2report, filters, update_clusters, log):
"""
create a combined matrix with metadata and partitions information

Expand Down Expand Up @@ -54,10 +54,9 @@ def partitions2metadata(partitions_name, partitions, mx_metadata, partitions2rep
mx_metadata.insert(index_no, "year_original", year_original)
index_no = mx_metadata.columns.get_loc("date")
mx_metadata["date"] = pandas.to_datetime(mx_metadata["date"], errors = "coerce")
mx_metadata["year"] = mx_metadata["date"].dt.year
mx_metadata["year"] = mx_metadata["date"].dt.year.astype("Int64")
year = mx_metadata.pop("year")
mx_metadata.insert(index_no + 1, "year", year)
index_no = mx_metadata.columns.get_loc("date")
if "iso_week_nr" not in mx_metadata.columns and "iso_year" not in mx_metadata.columns and "iso_week" not in mx_metadata.columns:
isoyear = mx_metadata["date"].dt.isocalendar().year
isoweek = mx_metadata["date"].dt.isocalendar().week
Expand Down Expand Up @@ -93,28 +92,51 @@ def partitions2metadata(partitions_name, partitions, mx_metadata, partitions2rep
a = mx_metadata.set_index(sample_column, drop = True)
b = mx_partitions.set_index(sample_column_part, drop = True)

if len(set(a.columns) & set(b.columns)) > 0:
b = b.add_prefix("subset_")
possible_subset = True
mx_partitions.columns = [sample_column_part] + b.columns.tolist()
mx_partitions.to_csv(partitions_name, index = False, header=True, sep ="\t")
if partitions2report == "all": # add all partitions
c = pandas.concat([a, b], axis=1)

else: # add specific set of partitions
required_partitions = partitions2report.split(",")
c = a
for column_name in required_partitions:
if column_name in b.columns:
c = pandas.concat([c,b[column_name]], axis = 1)
else:
print("\t\t" + column_name + " will not be reported because it was not found in the partitions table!!")
print("\t\t" + column_name + " will not be reported because it was not found in the partitions table!!", file = log)

if not update_clusters:
if len(set(a.columns) & set(b.columns)) > 0:
b = b.add_prefix("subset_")
possible_subset = True
mx_partitions.columns = [sample_column_part] + b.columns.tolist()
mx_partitions.to_csv(partitions_name, index = False, header=True, sep ="\t")
if partitions2report == "all": # add all partitions
c = pandas.concat([a, b], axis=1)
else: # add specific set of partitions
required_partitions = partitions2report.split(",")
c = a
for column_name in required_partitions:
if column_name in b.columns:
c = pandas.concat([c,b[column_name]], axis = 1)
else:
print("\t\t" + column_name + " will not be reported because it was not found in the partitions table!!")
print("\t\t" + column_name + " will not be reported because it was not found in the partitions table!!", file = log)
else:
shared_columns = list(set(a.columns) & set(b.columns))
new_columns = b.columns.difference(a.columns)
b_shared = b[shared_columns]
common_samples = a.index.intersection(b.index)
new_samples = b.index.difference(a.index)

a.loc[common_samples, shared_columns] = b_shared.loc[common_samples]
new_rows = b_shared.loc[new_samples]
a = pandas.concat([a, new_rows], axis=0)

c = a.copy()

if partitions2report == "all": # add all partitions
valid_cols = b.columns.intersection(new_columns)
c = pandas.concat([c, b.loc[b.index.intersection(c.index), valid_cols].reindex(c.index)], axis=1)
else:
required_partitions = partitions2report.split(",")
for column_name in required_partitions:
if column_name in b.columns and column_name not in shared_columns:
valid_rows = b.index.intersection(c.index)
c = pandas.concat([c, b.loc[valid_rows, [column_name]].reindex(c.index)], axis=1)
else:
print("\t\t" + column_name + " will not be reported because it was not found in the partitions table!!")
print("\t\t" + column_name + " will not be reported because it was not found in the partitions table!!", file=log)
else:
c = mx_metadata.set_index(mx_metadata.columns[0], drop = True)


# filtering table according to specifications
c_filtered = c.reset_index(drop = False)
c_filtered.rename(columns={c_filtered.columns[0]: mx_metadata.columns[0]}, inplace=True)
Expand Down Expand Up @@ -225,7 +247,8 @@ def partitions_summary(complete_metadata, partitions, partitions2report, summary
summary = {"partition": [], "cluster": [], "cluster_length": [], "samples": []} # dictionary for final dataframe
order_columns = ["partition", "cluster", "cluster_length", "samples"] # list with the order of columns in final dataframe

complete_metadata = complete_metadata.fillna("EMPTY")
str_cols = complete_metadata.select_dtypes(include=["object", "string"]).columns
complete_metadata[str_cols] = complete_metadata[str_cols].fillna("EMPTY")

if partitions2report == "all":
if isinstance(partitions, pandas.DataFrame):
Expand Down Expand Up @@ -335,7 +358,8 @@ def col_summary(main_column, complete_metadata, columns_summary_report, sample_c

absent_columns = []

complete_metadata = complete_metadata.fillna("EMPTY")
str_cols = complete_metadata.select_dtypes(include=["object", "string"]).columns
complete_metadata[str_cols] = complete_metadata[str_cols].fillna("EMPTY")
if main_column in complete_metadata.columns:
order_columns = [main_column]
summary = {main_column: []} # dictionary for final dataframe
Expand Down Expand Up @@ -707,6 +731,9 @@ def main():
they must be separated with commas (e.g 'country == Portugal,Spain,France'). When filters include more than one column, they must be separated with semicolon (e.g. \
'country == Portugal,Spain,France;date > 2018-01-01;date < 2022-01-01'). White spaces are important in this argument, so, do not leave spaces before and after \
commas/semicolons.")
parser.add_argument("--update-cluster-names", dest="update_clusters", required=False, action="store_true", help="[OPTIONAL] Activate update clusters mode. If you set this argument, and the metadata \
you provided has columns names that correspond to partition thresholds from a previous ReporTree run using the same analysis method as in the current run, we will provide updated \
cluster names in the *metadata_w_partitions.tsv, instead of adding new columns with the prefix subset. This argument is only applicable if you provide a partitions file.")
parser.add_argument("--frequency-matrix", dest="frequency_matrix", required=False, default="no", help="[OPTIONAL] Metadata column names for which a frequency matrix will be generated. This must \
be specified within quotation marks in the following format 'variable1,variable2'. Variable1 is the variable for which frequencies will be calculated (e.g. for \
'lineage,iso_week' the matrix reports the percentage of samples that correspond to each lineage per iso_week). If you want more than one matrix you can separate the \
Expand Down Expand Up @@ -744,14 +771,19 @@ def main():

# analysis of the partitions file

update_clusters = False
if args.partitions != "": # partitions table provided
print("Getting information from the partitions table: " + str(args.partitions))
print("Getting information from the partitions table: " + str(args.partitions), file = log)
partitions_name = args.partitions
partitions = pandas.read_table(args.partitions, dtype = str)
if args.update_clusters:
update_clusters = True
else:
partitions = args.partitions
partitions_name = ""
if args.update_clusters:
print("\nYou requested to update cluster names in metadata_w_partitions.tsv, but no partitions file was provided. So, I cannot do it :-(")


# adding partitions to metadata
Expand All @@ -764,7 +796,7 @@ def main():
if " " in col:
new_name = col.replace(" ", "_")
col_rename[new_name] = col
complete_metadata, possible_subset = partitions2metadata(partitions_name, partitions, mx_metadata, args.partitions2report, args.filter_column, log)
complete_metadata, possible_subset = partitions2metadata(partitions_name, partitions, mx_metadata, args.partitions2report, args.filter_column, update_clusters, log)
sample_column = complete_metadata.columns[0]


Expand Down
61 changes: 29 additions & 32 deletions scripts/partitioning_HC.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@

sys.setrecursionlimit(10000) # please increase this number, if you are getting the error "RecursionError: maximum recursion depth exceeded while calling a Python object"

version = "1.8.0"
last_updated = "2024-09-09"
version = "1.9.0"
last_updated = "2025-10-21"

# functions ----------

Expand Down Expand Up @@ -161,15 +161,9 @@ def filter_mx(matrix, mx, filters, matrix_type, log):
samples = mx[sample_column].tolist()
if matrix_type == "dist":
pairwise_dist_mx = matrix.set_index(matrix.columns[0], drop = True)
columns_interest = []

for sample in pairwise_dist_mx.columns:
if sample in samples:
columns_interest.append(sample)
columns_interest = [col for col in pairwise_dist_mx.columns if col in samples]
df = pairwise_dist_mx.loc[columns_interest, columns_interest].reset_index()

df = pairwise_dist_mx[columns_interest].loc[columns_interest]
df = df.reset_index(drop=False)

else:
df = matrix[matrix[matrix.columns[0]].isin(samples)]

Expand Down Expand Up @@ -410,17 +404,17 @@ def main():
if float(args.samples_called) == 0.0:
print("Keeping the sites/loci present in the loci list.")
print("Keeping the sites/loci present in the loci list.", file = log)
for col in allele_mx.columns[1:]:
if col not in loci2include:
allele_mx = allele_mx.drop(columns=col)
cols_to_keep = [allele_mx.columns[0]] + [col for col in allele_mx.columns[1:] if col in loci2include]
allele_mx = allele_mx[cols_to_keep]
else:
print("Keeping the sites/loci present in the loci list and those with information in >= " + str(float(args.samples_called) * 100) + "%% of the samples...")
print("Keeping the sites/loci present in the loci list and those with information in >= " + str(float(args.samples_called) * 100) + "%% of the samples...", file = log)
for col in allele_mx.columns[1:]:
if col not in loci2include:
values = allele_mx[col].values.tolist()
if (len(values)-values.count("0"))/len(values) < float(args.samples_called):
allele_mx = allele_mx.drop(columns=col)
cols_to_check = [col for col in allele_mx.columns[1:] if col not in loci2include]
subset = allele_mx[cols_to_check]
called_proportion = (subset != "0").sum() / len(subset)
columns_to_keep = called_proportion[called_proportion >= float(args.samples_called)].index.tolist()
final_cols = [col for col in allele_mx.columns if col == allele_mx.columns[0] or col in columns_to_keep or col in loci2include]
allele_mx = allele_mx[final_cols]
pos_t1 = len(allele_mx.columns[1:])
print("\tFrom the " + str(pos_t0) + " loci/positions, " + str(pos_t1) + " were kept in the matrix.")
print("\tFrom the " + str(pos_t0) + " loci/positions, " + str(pos_t1) + " were kept in the matrix.", file = log)
Expand All @@ -429,10 +423,12 @@ def main():
if float(args.samples_called) > 0.0:
print("Keeping the sites/loci with information in >= " + str(float(args.samples_called) * 100) + "%% of the samples...")
print("Keeping the sites/loci with information in >= " + str(float(args.samples_called) * 100) + "%% of the samples...", file = log)
for col in allele_mx.columns[1:]:
values = allele_mx[col].values.tolist()
if (len(values)-values.count("0"))/len(values) < float(args.samples_called):
allele_mx = allele_mx.drop(columns=col)
cols_to_check = allele_mx.columns[1:]
subset = allele_mx[cols_to_check]
called_proportion = (subset != "0").sum(axis=0) / len(subset)
columns_to_keep = called_proportion[called_proportion >= float(args.samples_called)].index.tolist()
final_cols = [allele_mx.columns[0]] + [col for col in cols_to_check if col in columns_to_keep]
allele_mx = allele_mx[final_cols]
pos_t1 = len(allele_mx.columns[1:])
print("\tFrom the " + str(pos_t0) + " loci/positions, " + str(pos_t1) + " were kept in the matrix.")
print("\tFrom the " + str(pos_t0) + " loci/positions, " + str(pos_t1) + " were kept in the matrix.", file = log)
Expand All @@ -448,24 +444,25 @@ def main():

# cleaning allele matrix (rows) ----------

if float(args.loci_called) > 0.0 and float(args.samples_called) < 1.0:
if float(args.loci_called) >= 0.0:
print("Cleaning the profile matrix using a threshold of >" + str(args.loci_called) + " alleles/positions called per sample...")
print("Cleaning the profile matrix using a threshold of >" + str(args.loci_called) + " alleles/positions called per sample...", file = log)

report_allele_mx = {}

len_schema = len(allele_mx.columns) - 1

missing_counts = allele_mx.isin(["0"]).sum(axis=1)
called_counts = len_schema - missing_counts

report_allele_mx = {}
report_allele_mx["samples"] = allele_mx[allele_mx.columns[0]]
report_allele_mx["missing"] = allele_mx.isin(["0"]).sum(axis=1)
report_allele_mx["called"] = len_schema - allele_mx.isin(["0"]).sum(axis=1)
report_allele_mx["pct_called"] = (len_schema - allele_mx.isin(["0"]).sum(axis=1)) / len_schema
report_allele_mx["missing"] = missing_counts
report_allele_mx["called"] = called_counts
report_allele_mx["pct_called"] = called_counts / len_schema

report_allele_df = pandas.DataFrame(data = report_allele_mx)
if float(args.loci_called) != 1.0:
flt_report = report_allele_df[report_allele_df["pct_called"] > float(args.loci_called)]
else:
flt_report = report_allele_df[report_allele_df["pct_called"] == float(args.loci_called)]

flt_report = report_allele_df[report_allele_df["pct_called"] >= float(args.loci_called)]

pass_samples = flt_report["samples"].values.tolist()
report_allele_df.to_csv(args.out + "_loci_report.tsv", index = False, header=True, sep ="\t")

Expand Down
58 changes: 29 additions & 29 deletions scripts/partitioning_grapetree.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@
grapetree = partitioning_grapetree_script.rsplit("/", 1)[0] + "/GrapeTree/grapetree.py"
python = sys.executable

version = "1.5.0"
last_updated = "2024-07-12"
version = "1.6.0"
last_updated = "2025-10-21"

# additional functions ----------

Expand Down Expand Up @@ -335,19 +335,18 @@ def main():
if float(args.samples_called) == 0.0:
print("Keeping the sites/loci present in the loci list.")
print("Keeping the sites/loci present in the loci list.", file = log)
for col in mx_allele.columns[1:]:
if col not in loci2include:
mx_allele = mx_allele.drop(columns=col)
cols_to_keep = [mx_allele.columns[0]] + [col for col in mx_allele.columns[1:] if col in loci2include]
mx_allele = mx_allele[cols_to_keep]
else:
print("Keeping the sites/loci present in the loci list and those with information in >= " + str(float(args.samples_called) * 100) + "%% of the samples...")
print("Keeping the sites/loci present in the loci list and those with information in >= " + str(float(args.samples_called) * 100) + "%% of the samples...", file = log)
for col in mx_allele.columns[1:]:
if col not in loci2include:
values = mx_allele[col].values.tolist()
if (len(values)-values.count("0"))/len(values) < float(args.samples_called):
mx_allele = mx_allele.drop(columns=col)
elif (len(values)-values.count(0))/len(values) < float(args.samples_called):
mx_allele = mx_allele.drop(columns=col)
cols_to_check = [col for col in mx_allele.columns[1:] if col not in loci2include]
subset = mx_allele[cols_to_check]
called_proportion = (subset != "0").sum() / len(subset)
columns_to_keep = called_proportion[called_proportion >= float(args.samples_called)].index.tolist()
final_cols = [col for col in mx_allele.columns if col == mx_allele.columns[0] or col in columns_to_keep or col in loci2include]
mx_allele = mx_allele[final_cols]

pos_t1 = len(mx_allele.columns[1:])
print("\tFrom the " + str(pos_t0) + " loci/positions, " + str(pos_t1) + " were kept in the matrix.")
print("\tFrom the " + str(pos_t0) + " loci/positions, " + str(pos_t1) + " were kept in the matrix.", file = log)
Expand All @@ -356,12 +355,12 @@ def main():
if float(args.samples_called) > 0.0:
print("Keeping the sites/loci with information in >= " + str(float(args.samples_called) * 100) + "%% of the samples...")
print("Keeping the sites/loci with information in >= " + str(float(args.samples_called) * 100) + "%% of the samples...", file = log)
for col in mx_allele.columns[1:]:
values = mx_allele[col].values.tolist()
if (len(values)-values.count("0"))/len(values) < float(args.samples_called):
mx_allele = mx_allele.drop(columns=col)
elif (len(values)-values.count(0))/len(values) < float(args.samples_called):
mx_allele = mx_allele.drop(columns=col)
cols_to_check = mx_allele.columns[1:]
subset = mx_allele[cols_to_check]
called_proportion = (subset != "0").sum(axis=0) / len(subset)
columns_to_keep = called_proportion[called_proportion >= float(args.samples_called)].index.tolist()
final_cols = [mx_allele.columns[0]] + [col for col in cols_to_check if col in columns_to_keep]
mx_allele = mx_allele[final_cols]
pos_t1 = len(mx_allele.columns[1:])
print("\tFrom the " + str(pos_t0) + " loci/positions, " + str(pos_t1) + " were kept in the matrix.")
print("\tFrom the " + str(pos_t0) + " loci/positions, " + str(pos_t1) + " were kept in the matrix.", file = log)
Expand All @@ -380,25 +379,26 @@ def main():

# cleaning allele matrix (rows) ----------

if float(args.loci_called) > 0.0:
if float(args.loci_called) >= 0.0:
print("Cleaning the profile matrix using a threshold of >" + str(args.loci_called) + " alleles/positions called...")
print("Cleaning the profile matrix using a threshold of >" + str(args.loci_called) + " alleles/positions called...", file = log)

allele_mx = pandas.read_table(allele_filename, dtype = str)
report_allele_mx = {}

len_schema = len(allele_mx.columns) - 1


missing_counts = allele_mx.isin(["0"]).sum(axis=1)
called_counts = len_schema - missing_counts

report_allele_mx = {}
report_allele_mx["samples"] = allele_mx[allele_mx.columns[0]]
report_allele_mx["missing"] = allele_mx.isin(["0"]).sum(axis=1)
report_allele_mx["called"] = len_schema - allele_mx.isin(["0"]).sum(axis=1)
report_allele_mx["pct_called"] = (len_schema - allele_mx.isin(["0"]).sum(axis=1)) / len_schema
report_allele_mx["missing"] = missing_counts
report_allele_mx["called"] = called_counts
report_allele_mx["pct_called"] = called_counts / len_schema

report_allele_df = pandas.DataFrame(data = report_allele_mx)
if float(args.loci_called) != 1.0:
flt_report = report_allele_df[report_allele_df["pct_called"] > float(args.loci_called)]
else:
flt_report = report_allele_df[report_allele_df["pct_called"] == float(args.loci_called)]

flt_report = report_allele_df[report_allele_df["pct_called"] >= float(args.loci_called)]

pass_samples = flt_report["samples"].values.tolist()

print("\tFrom the " + str(len(allele_mx[allele_mx.columns[0]].values.tolist())) + " samples, " + str(len(pass_samples)) + " were kept in the profile matrix.")
Expand Down
Loading