insapathogenomics · vmixao · Oct 22, 2025 · Oct 21, 2025 · Oct 21, 2025 · Oct 21, 2025
diff --git a/README.md b/README.md
diff --git a/reportree.py b/reportree.py
diff --git a/scripts/alignment_processing.py b/scripts/alignment_processing.py
@@ -16,8 +16,8 @@
 import datetime as datetime
 from Bio import SeqIO, AlignIO, Align, Alphabet
 
-version = "1.5.1"
-last_updated = "2024-09-27"
+version = "1.5.2"
+last_updated = "2024-10-20"
 
 # functions	----------
 

diff --git a/scripts/metadata_report.py b/scripts/metadata_report.py
@@ -15,12 +15,12 @@
 from pandas.api.types import is_datetime64_any_dtype as is_datetime
 import datetime
 
-version = "1.2.0"
-last_updated = "2024-11-06"
+version = "1.4.0"
+last_updated = "2025-10-22"
 
 # functions	----------
 
-def partitions2metadata(partitions_name, partitions, mx_metadata, partitions2report, filters, log):
+def partitions2metadata(partitions_name, partitions, mx_metadata, partitions2report, filters, update_clusters, log):
 	""" 
 	create a combined matrix with metadata and partitions information
 
@@ -54,10 +54,9 @@ def partitions2metadata(partitions_name, partitions, mx_metadata, partitions2rep
 			mx_metadata.insert(index_no, "year_original", year_original)
 			index_no = mx_metadata.columns.get_loc("date")
 		mx_metadata["date"] = pandas.to_datetime(mx_metadata["date"], errors = "coerce")
-		mx_metadata["year"] = mx_metadata["date"].dt.year
+		mx_metadata["year"] = mx_metadata["date"].dt.year.astype("Int64")
 		year = mx_metadata.pop("year")
 		mx_metadata.insert(index_no + 1, "year", year)
-		index_no = mx_metadata.columns.get_loc("date")
 		if "iso_week_nr" not in mx_metadata.columns and "iso_year" not in mx_metadata.columns and "iso_week" not in mx_metadata.columns:
 			isoyear = mx_metadata["date"].dt.isocalendar().year
 			isoweek = mx_metadata["date"].dt.isocalendar().week
@@ -93,28 +92,51 @@ def partitions2metadata(partitions_name, partitions, mx_metadata, partitions2rep
 		a = mx_metadata.set_index(sample_column, drop = True)
 		b = mx_partitions.set_index(sample_column_part, drop = True)
 
-		if len(set(a.columns) & set(b.columns)) > 0:
-			b = b.add_prefix("subset_")
-			possible_subset = True
-			mx_partitions.columns = [sample_column_part] + b.columns.tolist()
-			mx_partitions.to_csv(partitions_name, index = False, header=True, sep ="\t")
-		if partitions2report == "all": # add all partitions
-			c = pandas.concat([a, b], axis=1)
-
-		else: # add specific set of partitions
-			required_partitions = partitions2report.split(",")
-			c = a
-			for column_name in required_partitions:
-				if column_name in b.columns:
-					c = pandas.concat([c,b[column_name]], axis = 1)
-				else:
-					print("\t\t" + column_name + " will not be reported because it was not found in the partitions table!!")
-					print("\t\t" + column_name + " will not be reported because it was not found in the partitions table!!", file = log)
-
+		if not update_clusters:
+			if len(set(a.columns) & set(b.columns)) > 0:
+				b = b.add_prefix("subset_")
+				possible_subset = True
+				mx_partitions.columns = [sample_column_part] + b.columns.tolist()
+				mx_partitions.to_csv(partitions_name, index = False, header=True, sep ="\t")
+			if partitions2report == "all": # add all partitions
+				c = pandas.concat([a, b], axis=1)
+			else: # add specific set of partitions
+				required_partitions = partitions2report.split(",")
+				c = a
+				for column_name in required_partitions:
+					if column_name in b.columns:
+						c = pandas.concat([c,b[column_name]], axis = 1)
+					else:
+						print("\t\t" + column_name + " will not be reported because it was not found in the partitions table!!")
+						print("\t\t" + column_name + " will not be reported because it was not found in the partitions table!!", file = log)
+		else:
+			shared_columns = list(set(a.columns) & set(b.columns))
+			new_columns = b.columns.difference(a.columns)
+			b_shared = b[shared_columns]
+			common_samples = a.index.intersection(b.index)
+			new_samples = b.index.difference(a.index)
+
+			a.loc[common_samples, shared_columns] = b_shared.loc[common_samples]
+			new_rows = b_shared.loc[new_samples]
+			a = pandas.concat([a, new_rows], axis=0)
+
+			c = a.copy()
+
+			if partitions2report == "all":  # add all partitions
+				valid_cols = b.columns.intersection(new_columns)
+				c = pandas.concat([c, b.loc[b.index.intersection(c.index), valid_cols].reindex(c.index)], axis=1)
+			else:
+				required_partitions = partitions2report.split(",")
+				for column_name in required_partitions:
+					if column_name in b.columns and column_name not in shared_columns:
+						valid_rows = b.index.intersection(c.index)
+						c = pandas.concat([c, b.loc[valid_rows, [column_name]].reindex(c.index)], axis=1)
+					else:
+						print("\t\t" + column_name + " will not be reported because it was not found in the partitions table!!")
+						print("\t\t" + column_name + " will not be reported because it was not found in the partitions table!!", file=log)	
 	else:
 		c = mx_metadata.set_index(mx_metadata.columns[0], drop = True)
 
-
 	# filtering table according to specifications
 	c_filtered = c.reset_index(drop = False)
 	c_filtered.rename(columns={c_filtered.columns[0]: mx_metadata.columns[0]}, inplace=True)
@@ -225,7 +247,8 @@ def partitions_summary(complete_metadata, partitions, partitions2report, summary
 	summary = {"partition": [], "cluster": [], "cluster_length": [], "samples": []} # dictionary for final dataframe
 	order_columns = ["partition", "cluster", "cluster_length", "samples"] # list with the order of columns in final dataframe
 
-	complete_metadata = complete_metadata.fillna("EMPTY")
+	str_cols = complete_metadata.select_dtypes(include=["object", "string"]).columns
+	complete_metadata[str_cols] = complete_metadata[str_cols].fillna("EMPTY")
 
 	if partitions2report == "all":
 		if isinstance(partitions, pandas.DataFrame):
@@ -335,7 +358,8 @@ def col_summary(main_column, complete_metadata, columns_summary_report, sample_c
 
 	absent_columns = []
 
-	complete_metadata = complete_metadata.fillna("EMPTY")
+	str_cols = complete_metadata.select_dtypes(include=["object", "string"]).columns
+	complete_metadata[str_cols] = complete_metadata[str_cols].fillna("EMPTY")
 	if main_column in complete_metadata.columns:
 		order_columns = [main_column]
 		summary = {main_column: []} # dictionary for final dataframe
@@ -707,6 +731,9 @@ def main():
 						they must be separated with commas (e.g 'country == Portugal,Spain,France'). When filters include more than one column, they must be separated with semicolon (e.g. \
 						'country == Portugal,Spain,France;date > 2018-01-01;date < 2022-01-01'). White spaces are important in this argument, so, do not leave spaces before and after \
 						commas/semicolons.")
+	parser.add_argument("--update-cluster-names", dest="update_clusters", required=False, action="store_true", help="[OPTIONAL] Activate update clusters mode. If you set this argument, and the metadata \
+					 	you provided has columns names that correspond to partition thresholds from a previous ReporTree run using the same analysis method as in the current run, we will provide updated \
+						cluster names in the *metadata_w_partitions.tsv, instead of adding new columns with the prefix subset. This argument is only applicable if you provide a partitions file.")
 	parser.add_argument("--frequency-matrix", dest="frequency_matrix", required=False, default="no", help="[OPTIONAL] Metadata column names for which a frequency matrix will be generated. This must \
 						be specified within quotation marks in the following format 'variable1,variable2'. Variable1 is the variable for which frequencies will be calculated (e.g. for \
 						'lineage,iso_week' the matrix reports the percentage of samples that correspond to each lineage per iso_week). If you want more than one matrix you can separate the \
@@ -744,14 +771,19 @@ def main():
 
 	# analysis of the partitions file
 
+	update_clusters = False
 	if args.partitions != "": # partitions table provided
 		print("Getting information from the partitions table: " + str(args.partitions))
 		print("Getting information from the partitions table: " + str(args.partitions), file = log)
 		partitions_name = args.partitions
 		partitions = pandas.read_table(args.partitions, dtype = str)
+		if args.update_clusters:
+			update_clusters = True
 	else:
 		partitions = args.partitions
 		partitions_name = ""
+		if args.update_clusters:
+			print("\nYou requested to update cluster names in metadata_w_partitions.tsv, but no partitions file was provided. So, I cannot do it :-(")
 
 
 	# adding partitions to metadata
@@ -764,7 +796,7 @@ def main():
 		if " " in col:
 			new_name = col.replace(" ", "_")
 			col_rename[new_name] = col
-	complete_metadata, possible_subset = partitions2metadata(partitions_name, partitions, mx_metadata, args.partitions2report, args.filter_column, log)
+	complete_metadata, possible_subset = partitions2metadata(partitions_name, partitions, mx_metadata, args.partitions2report, args.filter_column, update_clusters, log)
 	sample_column = complete_metadata.columns[0]
 
 

diff --git a/scripts/partitioning_HC.py b/scripts/partitioning_HC.py
@@ -20,8 +20,8 @@
 
 sys.setrecursionlimit(10000) # please increase this number, if you are getting the error "RecursionError: maximum recursion depth exceeded while calling a Python object" 
 
-version = "1.8.0"
-last_updated = "2024-09-09"
+version = "1.9.0"
+last_updated = "2025-10-21"
 
 # functions	----------
 
@@ -161,15 +161,9 @@ def filter_mx(matrix, mx, filters, matrix_type, log):
 	samples = mx[sample_column].tolist()
 	if matrix_type == "dist":
 		pairwise_dist_mx = matrix.set_index(matrix.columns[0], drop = True)
-		columns_interest = []
-
-		for sample in pairwise_dist_mx.columns:
-			if sample in samples:
-				columns_interest.append(sample)
+		columns_interest = [col for col in pairwise_dist_mx.columns if col in samples]
+		df = pairwise_dist_mx.loc[columns_interest, columns_interest].reset_index()
 
-		df = pairwise_dist_mx[columns_interest].loc[columns_interest]
-		df = df.reset_index(drop=False)
-
 	else:
 		df = matrix[matrix[matrix.columns[0]].isin(samples)]
 
@@ -410,17 +404,17 @@ def main():
 			if float(args.samples_called) == 0.0:	
 				print("Keeping the sites/loci present in the loci list.")
 				print("Keeping the sites/loci present in the loci list.", file = log)			
-				for col in allele_mx.columns[1:]:
-					if col not in loci2include:
-						allele_mx = allele_mx.drop(columns=col)
+				cols_to_keep = [allele_mx.columns[0]] + [col for col in allele_mx.columns[1:] if col in loci2include]
+				allele_mx = allele_mx[cols_to_keep]
 			else:
 				print("Keeping the sites/loci present in the loci list and those with information in >= " + str(float(args.samples_called) * 100) + "%% of the samples...")
 				print("Keeping the sites/loci present in the loci list and those with information in >= " + str(float(args.samples_called) * 100) + "%% of the samples...", file = log)
-				for col in allele_mx.columns[1:]:
-					if col not in loci2include:
-						values = allele_mx[col].values.tolist()
-						if (len(values)-values.count("0"))/len(values) < float(args.samples_called):
-							allele_mx = allele_mx.drop(columns=col)
+				cols_to_check = [col for col in allele_mx.columns[1:] if col not in loci2include]
+				subset = allele_mx[cols_to_check]
+				called_proportion = (subset != "0").sum() / len(subset)
+				columns_to_keep = called_proportion[called_proportion >= float(args.samples_called)].index.tolist()
+				final_cols = [col for col in allele_mx.columns if col == allele_mx.columns[0] or col in columns_to_keep or col in loci2include]
+				allele_mx = allele_mx[final_cols]
 			pos_t1 = len(allele_mx.columns[1:])
 			print("\tFrom the " + str(pos_t0) + " loci/positions, " + str(pos_t1) + " were kept in the matrix.")
 			print("\tFrom the " + str(pos_t0) + " loci/positions, " + str(pos_t1) + " were kept in the matrix.", file = log)
@@ -429,10 +423,12 @@ def main():
 			if float(args.samples_called) > 0.0:
 				print("Keeping the sites/loci with information in >= " + str(float(args.samples_called) * 100) + "%% of the samples...")
 				print("Keeping the sites/loci with information in >= " + str(float(args.samples_called) * 100) + "%% of the samples...", file = log)
-				for col in allele_mx.columns[1:]:
-					values = allele_mx[col].values.tolist()
-					if (len(values)-values.count("0"))/len(values) < float(args.samples_called):
-						allele_mx = allele_mx.drop(columns=col)
+				cols_to_check = allele_mx.columns[1:]
+				subset = allele_mx[cols_to_check]
+				called_proportion = (subset != "0").sum(axis=0) / len(subset)
+				columns_to_keep = called_proportion[called_proportion >= float(args.samples_called)].index.tolist()
+				final_cols = [allele_mx.columns[0]] + [col for col in cols_to_check if col in columns_to_keep]
+				allele_mx = allele_mx[final_cols]
 				pos_t1 = len(allele_mx.columns[1:])
 				print("\tFrom the " + str(pos_t0) + " loci/positions, " + str(pos_t1) + " were kept in the matrix.")
 				print("\tFrom the " + str(pos_t0) + " loci/positions, " + str(pos_t1) + " were kept in the matrix.", file = log)
@@ -448,24 +444,25 @@ def main():
 
 		# cleaning allele matrix (rows)	----------
 
-		if float(args.loci_called) > 0.0 and float(args.samples_called) < 1.0:
+		if float(args.loci_called) >= 0.0:
 			print("Cleaning the profile matrix using a threshold of >" + str(args.loci_called) + " alleles/positions called per sample...")
 			print("Cleaning the profile matrix using a threshold of >" + str(args.loci_called) + " alleles/positions called per sample...", file = log)
 
-			report_allele_mx = {}
-
 			len_schema = len(allele_mx.columns) - 1
+
+			missing_counts = allele_mx.isin(["0"]).sum(axis=1)
+			called_counts = len_schema - missing_counts
 
+			report_allele_mx = {}		
 			report_allele_mx["samples"] = allele_mx[allele_mx.columns[0]]
-			report_allele_mx["missing"] = allele_mx.isin(["0"]).sum(axis=1)
-			report_allele_mx["called"] = len_schema - allele_mx.isin(["0"]).sum(axis=1)
-			report_allele_mx["pct_called"] = (len_schema - allele_mx.isin(["0"]).sum(axis=1)) / len_schema
+			report_allele_mx["missing"] = missing_counts
+			report_allele_mx["called"] = called_counts
+			report_allele_mx["pct_called"] = called_counts / len_schema
 
 			report_allele_df = pandas.DataFrame(data = report_allele_mx)
-			if float(args.loci_called) != 1.0:
-				flt_report = report_allele_df[report_allele_df["pct_called"] > float(args.loci_called)]
-			else:
-				flt_report = report_allele_df[report_allele_df["pct_called"] == float(args.loci_called)]
+
+			flt_report = report_allele_df[report_allele_df["pct_called"] >= float(args.loci_called)]
+
 			pass_samples = flt_report["samples"].values.tolist()
 			report_allele_df.to_csv(args.out + "_loci_report.tsv", index = False, header=True, sep ="\t")
 

diff --git a/scripts/partitioning_grapetree.py b/scripts/partitioning_grapetree.py
@@ -25,8 +25,8 @@
 grapetree = partitioning_grapetree_script.rsplit("/", 1)[0] + "/GrapeTree/grapetree.py"
 python = sys.executable
 
-version = "1.5.0"
-last_updated = "2024-07-12"
+version = "1.6.0"
+last_updated = "2025-10-21"
 
 # additional functions	----------
 
@@ -335,19 +335,18 @@ def main():
 		if float(args.samples_called) == 0.0:
 			print("Keeping the sites/loci present in the loci list.")
 			print("Keeping the sites/loci present in the loci list.", file = log)
-			for col in mx_allele.columns[1:]:
-				if col not in loci2include:
-					mx_allele = mx_allele.drop(columns=col)
+			cols_to_keep = [mx_allele.columns[0]] + [col for col in mx_allele.columns[1:] if col in loci2include]
+			mx_allele = mx_allele[cols_to_keep]
 		else:
 			print("Keeping the sites/loci present in the loci list and those with information in >= " + str(float(args.samples_called) * 100) + "%% of the samples...")
 			print("Keeping the sites/loci present in the loci list and those with information in >= " + str(float(args.samples_called) * 100) + "%% of the samples...", file = log)
-			for col in mx_allele.columns[1:]:
-				if col not in loci2include:
-					values = mx_allele[col].values.tolist()
-					if (len(values)-values.count("0"))/len(values) < float(args.samples_called):
-						mx_allele = mx_allele.drop(columns=col)
-					elif (len(values)-values.count(0))/len(values) < float(args.samples_called):
-						mx_allele = mx_allele.drop(columns=col)
+			cols_to_check = [col for col in mx_allele.columns[1:] if col not in loci2include]
+			subset = mx_allele[cols_to_check]
+			called_proportion = (subset != "0").sum() / len(subset)
+			columns_to_keep = called_proportion[called_proportion >= float(args.samples_called)].index.tolist()
+			final_cols = [col for col in mx_allele.columns if col == mx_allele.columns[0] or col in columns_to_keep or col in loci2include]
+			mx_allele = mx_allele[final_cols]
+
 		pos_t1 = len(mx_allele.columns[1:])
 		print("\tFrom the " + str(pos_t0) + " loci/positions, " + str(pos_t1) + " were kept in the matrix.")
 		print("\tFrom the " + str(pos_t0) + " loci/positions, " + str(pos_t1) + " were kept in the matrix.", file = log)
@@ -356,12 +355,12 @@ def main():
 		if float(args.samples_called) > 0.0:
 			print("Keeping the sites/loci with information in >= " + str(float(args.samples_called) * 100) + "%% of the samples...")
 			print("Keeping the sites/loci with information in >= " + str(float(args.samples_called) * 100) + "%% of the samples...", file = log)
-			for col in mx_allele.columns[1:]:
-				values = mx_allele[col].values.tolist()
-				if (len(values)-values.count("0"))/len(values) < float(args.samples_called):
-					mx_allele = mx_allele.drop(columns=col)
-				elif (len(values)-values.count(0))/len(values) < float(args.samples_called):
-					mx_allele = mx_allele.drop(columns=col)	
+			cols_to_check = mx_allele.columns[1:]
+			subset = mx_allele[cols_to_check]
+			called_proportion = (subset != "0").sum(axis=0) / len(subset)
+			columns_to_keep = called_proportion[called_proportion >= float(args.samples_called)].index.tolist()
+			final_cols = [mx_allele.columns[0]] + [col for col in cols_to_check if col in columns_to_keep]
+			mx_allele = mx_allele[final_cols]
 			pos_t1 = len(mx_allele.columns[1:])
 			print("\tFrom the " + str(pos_t0) + " loci/positions, " + str(pos_t1) + " were kept in the matrix.")
 			print("\tFrom the " + str(pos_t0) + " loci/positions, " + str(pos_t1) + " were kept in the matrix.", file = log)
@@ -380,25 +379,26 @@ def main():
 
 	# cleaning allele matrix (rows)	----------
 
-	if float(args.loci_called) > 0.0:
+	if float(args.loci_called) >= 0.0:
 		print("Cleaning the profile matrix using a threshold of >" + str(args.loci_called) + " alleles/positions called...")
 		print("Cleaning the profile matrix using a threshold of >" + str(args.loci_called) + " alleles/positions called...", file = log)
 
 		allele_mx = pandas.read_table(allele_filename, dtype = str)
-		report_allele_mx = {}
-
 		len_schema = len(allele_mx.columns) - 1
-
+
+		missing_counts = allele_mx.isin(["0"]).sum(axis=1)
+		called_counts = len_schema - missing_counts
+
+		report_allele_mx = {}		
 		report_allele_mx["samples"] = allele_mx[allele_mx.columns[0]]
-		report_allele_mx["missing"] = allele_mx.isin(["0"]).sum(axis=1)
-		report_allele_mx["called"] = len_schema - allele_mx.isin(["0"]).sum(axis=1)
-		report_allele_mx["pct_called"] = (len_schema - allele_mx.isin(["0"]).sum(axis=1)) / len_schema
+		report_allele_mx["missing"] = missing_counts
+		report_allele_mx["called"] = called_counts
+		report_allele_mx["pct_called"] = called_counts / len_schema
 
 		report_allele_df = pandas.DataFrame(data = report_allele_mx)
-		if float(args.loci_called) != 1.0:
-			flt_report = report_allele_df[report_allele_df["pct_called"] > float(args.loci_called)]
-		else:
-			flt_report = report_allele_df[report_allele_df["pct_called"] == float(args.loci_called)]
+
+		flt_report = report_allele_df[report_allele_df["pct_called"] >= float(args.loci_called)]
+
 		pass_samples = flt_report["samples"].values.tolist()
 
 		print("\tFrom the " + str(len(allele_mx[allele_mx.columns[0]].values.tolist())) + " samples, " + str(len(pass_samples)) + " were kept in the profile matrix.")