diff --git a/bin/mut_profile.py b/bin/mut_profile.py index 2c8ff4a1..82c06503 100755 --- a/bin/mut_profile.py +++ b/bin/mut_profile.py @@ -5,12 +5,14 @@ import click import json import pandas as pd +import numpy as np -from utils import contexts_formatted +from utils import contexts_formatted, contexts_formatted_sigprofiler from utils import filter_maf -def compute_mutation_matrix(sample_name, mutations_file, mutation_matrix, method = 'unique', pseudocount = 0): +def compute_mutation_matrix(sample_name, mutations_file, mutation_matrix, method = 'unique', pseudocount = 0, + sigprofiler = False, per_sample = False): """ Compute mutational profile from the input data ***Remember to add some pseudocounts to the computation*** @@ -37,10 +39,8 @@ def compute_mutation_matrix(sample_name, mutations_file, mutation_matrix, method annotated_minimal_maf.columns = ["SAMPLE_ID", "CONTEXT_MUT", "MUT_ID"] # count the mutations per sample and per context - counts_x_sample_context_long = annotated_minimal_maf.groupby(by = ["CONTEXT_MUT"])["MUT_ID"].count().reset_index() - counts_x_sample_matrix = counts_x_sample_context_long.set_index("CONTEXT_MUT") - counts_x_sample_matrix.columns = [sample_name] - + counts_x_sample_context_long = annotated_minimal_maf.groupby(by = ["SAMPLE_ID", "CONTEXT_MUT"])["MUT_ID"].count().reset_index() + counts_x_sample_context_long.columns = ["SAMPLE_ID", "CONTEXT_MUT", "COUNT"] elif method == 'multiple': # make sure to count each mutation only once (avoid annotation issues) @@ -48,9 +48,13 @@ def compute_mutation_matrix(sample_name, mutations_file, mutation_matrix, method annotated_minimal_maf.columns = ["SAMPLE_ID", "CONTEXT_MUT", "MUT_ID", "ALT_DEPTH"] # count the mutations per sample and per context - counts_x_sample_context_long = annotated_minimal_maf.groupby(by = ["CONTEXT_MUT"])["ALT_DEPTH"].sum().reset_index() - counts_x_sample_matrix = counts_x_sample_context_long.set_index("CONTEXT_MUT") - counts_x_sample_matrix.columns = [sample_name] + 
counts_x_sample_context_long = annotated_minimal_maf.groupby(by = ["SAMPLE_ID", "CONTEXT_MUT"])["ALT_DEPTH"].sum().reset_index() + counts_x_sample_context_long.columns = ["SAMPLE_ID", "CONTEXT_MUT", "COUNT"] + + # here we group the counts of all the samples + counts_x_sample_matrix = counts_x_sample_context_long.groupby(by = ["CONTEXT_MUT"])["COUNT"].sum().reset_index() + counts_x_sample_matrix.columns = ["CONTEXT_MUT", sample_name] + counts_x_sample_matrix = counts_x_sample_matrix.set_index("CONTEXT_MUT") counts_x_sample_matrix = pd.concat( (empty_matrix, counts_x_sample_matrix) , axis = 1) counts_x_sample_matrix = counts_x_sample_matrix.fillna(0) @@ -64,8 +68,38 @@ def compute_mutation_matrix(sample_name, mutations_file, mutation_matrix, method index = True, sep = "\t") - -def compute_mutation_profile(sample_name, mutation_matrix_file, trinucleotide_counts_file, json_output, plot): + if sigprofiler: + counts_x_sample_matrix.index = contexts_formatted_sigprofiler + counts_x_sample_matrix.index.name = "CONTEXT_MUT" + counts_x_sample_matrix = counts_x_sample_matrix.reset_index().sort_values(by = "CONTEXT_MUT") + counts_x_sample_matrix.to_csv(f"{mutation_matrix}.single.sigprofiler", + header = True, + index = False, + sep = "\t") + + if per_sample: + counts_x_sample_matrix = counts_x_sample_context_long.pivot(index = "CONTEXT_MUT", columns = "SAMPLE_ID", values = "COUNT") + counts_x_sample_matrix = pd.concat( (empty_matrix, counts_x_sample_matrix) , axis = 1) + counts_x_sample_matrix = counts_x_sample_matrix.fillna(0) + counts_x_sample_matrix.index.name = "CONTEXT_MUT" + + counts_x_sample_matrix.to_csv(f"{mutation_matrix}.per_sample", + header = True, + index = True, + sep = "\t") + + if sigprofiler: + counts_x_sample_matrix.index = contexts_formatted_sigprofiler + counts_x_sample_matrix.index.name = "CONTEXT_MUT" + counts_x_sample_matrix = counts_x_sample_matrix.reset_index().sort_values(by = "CONTEXT_MUT") + 
counts_x_sample_matrix.to_csv(f"{mutation_matrix}.per_sample.sigprofiler", + header = True, + index = False, + sep = "\t") + + +def compute_mutation_profile(sample_name, mutation_matrix_file, trinucleotide_counts_file, json_output, plot, + wgs = False, wgs_trinucleotide_counts = False, sigprofiler = False): """ Compute mutational profile from the input data @@ -79,6 +113,7 @@ def compute_mutation_profile(sample_name, mutation_matrix_file, trinucleotide_co # Load your mutation matrix mutation_matrix = pd.read_csv(mutation_matrix_file, sep = "\t", header = 0) mutation_matrix = mutation_matrix.set_index("CONTEXT_MUT") + total_mutations = np.sum(mutation_matrix[sample_name]) # Load the trinucleotide counts trinucleotide_counts = pd.read_csv(trinucleotide_counts_file, sep = "\t", header = 0) @@ -133,6 +168,38 @@ def compute_mutation_profile(sample_name, mutation_matrix_file, trinucleotide_co ymax = max_freq, output_f = f'{sample_name}.profile.pdf') + if wgs: + if not wgs_trinucleotide_counts: + print("Invalid wgs_trinucleotide_counts, provide a correct file: ", wgs_trinucleotide_counts) + sys.exit("Invalid wgs_trinucleotide_counts, provide a correct file.") + + mut_probability["CONTEXT"] = mut_probability["CONTEXT_MUT"].apply( lambda x : x[:3]) + ref_trinuc32 = pd.read_csv(wgs_trinucleotide_counts, + sep = "\t", header = 0) + + profile_trinuc_merge = mut_probability.merge(ref_trinuc32, on = "CONTEXT") + profile_trinuc_merge["MUTS_WGS"] = profile_trinuc_merge[sample_name] * profile_trinuc_merge["COUNT"] + profile_trinuc_merge["SAMPLE_MUTS_WGS"] = profile_trinuc_merge["MUTS_WGS"] / np.sum(profile_trinuc_merge["MUTS_WGS"]) * total_mutations + profile_trinuc_clean = profile_trinuc_merge[["CONTEXT_MUT", "SAMPLE_MUTS_WGS"]].set_index("CONTEXT_MUT") + profile_trinuc_clean.index.name = "CONTEXT_MUT" + profile_trinuc_clean = profile_trinuc_clean.reindex(contexts_formatted) + profile_trinuc_clean.columns = [sample_name] + + profile_trinuc_clean.to_csv(f"{json_output}.matrix.WGS", 
+ header = True, + index = True, + sep = "\t") + if sigprofiler: + profile_trinuc_clean.index = contexts_formatted_sigprofiler + profile_trinuc_clean.index.name = "CONTEXT_MUT" + profile_trinuc_clean = profile_trinuc_clean.reset_index().sort_values(by = "CONTEXT_MUT") + profile_trinuc_clean.to_csv(f"{json_output}.matrix.WGS.sigprofiler", + header = True, + index = False, + sep = "\t") + + + @click.command() @@ -142,24 +209,31 @@ def compute_mutation_profile(sample_name, mutation_matrix_file, trinucleotide_co @click.option('--out_matrix', type=click.Path(), help='Output mutation matrix file') @click.option('--method', type=click.Choice(['unique', 'multiple']), default='unique') @click.option('--pseud', type=float, default=0.5) +@click.option('--per_sample', is_flag=True, help='Create a column for each sample in the input') @click.option('--mutation_matrix', type=click.Path(exists=True), help='Mutation matrix file (for profile mode)') @click.option('--trinucleotide_counts', type=click.Path(exists=True), help='Trinucleotide counts file (for profile mode)') @click.option('--out_profile', type=click.Path(), help='JSON output file (for profile mode)') @click.option('--plot', is_flag=True, help='Generate plot and save as PDF') +@click.option('--wgs', is_flag=True, help='Store matrix of mutation counts at WGS level') +@click.option('--wgs_trinucleotide_counts', type=click.Path(exists=True), help='Trinucleotide counts file of the WGS (for profile mode if WGS active)') + + +@click.option('--sigprofiler', is_flag=True, help='Store the index column using the SigProfiler format') -def main(mode, sample_name, mut_file, out_matrix, method, pseud, mutation_matrix, trinucleotide_counts, out_profile, plot): +def main(mode, sample_name, mut_file, out_matrix, method, pseud, sigprofiler, per_sample, mutation_matrix, + trinucleotide_counts, out_profile, plot, wgs, wgs_trinucleotide_counts): # TODO # add additional mode to normalize mutation counts for the genomic trinucleotide level if 
mode == 'matrix': click.echo(f"Running in matrix mode...") click.echo(f"Using the method: {method}") click.echo(f"Using the pseudocount: {pseud}") - compute_mutation_matrix(sample_name, mut_file, out_matrix, method, pseud) + compute_mutation_matrix(sample_name, mut_file, out_matrix, method, pseud, sigprofiler, per_sample) elif mode == 'profile': click.echo(f"Running in profile mode...") - compute_mutation_profile(sample_name, mutation_matrix, trinucleotide_counts, out_profile, plot) + compute_mutation_profile(sample_name, mutation_matrix, trinucleotide_counts, out_profile, plot, wgs, wgs_trinucleotide_counts, sigprofiler) click.echo("Profile computation completed.") else: diff --git a/bin/mutprof_3compute_mutability.py b/bin/mutprof_3compute_mutability.py index 8ffc7780..1f96e5fb 100755 --- a/bin/mutprof_3compute_mutability.py +++ b/bin/mutprof_3compute_mutability.py @@ -3,6 +3,8 @@ import sys import click import pandas as pd +import numpy as np +import concurrent.futures from bgreference import hg38 @@ -76,7 +78,7 @@ def generate_all_sites_with_context(input_bedfile): -def compute_mutabilities(sample_name, mutation_matrix, depths_file, mut_profile_file, regions_bedfile, out_mutability): +def compute_mutabilities(sample_name, depths_file, mut_profile_file, regions_bedfile, out_mutability, adjusted = True): """ Compute mutational profile from the input data ***Remember to add some pseudocounts to the computation*** @@ -101,14 +103,18 @@ def compute_mutabilities(sample_name, mutation_matrix, depths_file, mut_profile_ coverage_samples = list(coverage_info_all_samples.columns[2:]) # Either load or compute a dataframe with all sites annotated - all_sites_to_be_included_annotated = generate_all_sites_with_context(regions_bedfile) + # all_sites_to_be_included_annotated = generate_all_sites_with_context(regions_bedfile) + all_sites_to_be_included_annotated = pd.read_csv(regions_bedfile, sep="\t", header=0, usecols = ["CHROM", "POS", "REF", "ALT", "GENE", "CONTEXT_MUT"], + 
dtype={"CHROM": str}) + + print("All loaded") # make sure that all the files have the chr prefix in the files - if not all_sites_to_be_included_annotated["CHROM"].iloc[0].startswith("chr"): - all_sites_to_be_included_annotated["CHROM"] = "chr" + all_sites_to_be_included_annotated["CHROM"] + if not str(all_sites_to_be_included_annotated["CHROM"].iloc[0]).startswith("chr"): + all_sites_to_be_included_annotated["CHROM"] = "chr" + all_sites_to_be_included_annotated["CHROM"].astype(str) - if not coverage_info_all_samples["CHROM"].iloc[0].startswith("chr"): - coverage_info_all_samples["CHROM"] = "chr" + coverage_info_all_samples["CHROM"] + if not str(coverage_info_all_samples["CHROM"].iloc[0]).startswith("chr"): + coverage_info_all_samples["CHROM"] = "chr" + coverage_info_all_samples["CHROM"].astype(str) if sample_name == "all_samples": @@ -124,6 +130,11 @@ def compute_mutabilities(sample_name, mutation_matrix, depths_file, mut_profile_ on = "CONTEXT_MUT", suffixes = ("_coverage", "_probability"), how = 'left') + print("Coverage and probability merged") + del all_sites_context_n_coverage + del coverage_info_all_samples + del all_sites_to_be_included_annotated + print("Remove unnecessary objects") # if sample_name != "all_samples": @@ -131,16 +142,30 @@ def compute_mutabilities(sample_name, mutation_matrix, depths_file, mut_profile_ # coverage_and_probability[f"{samp}_adjusted_probability"] = coverage_and_probability[f"{samp}_adjusted_probability"] * norm_mut_burden_sample list_of_samples_columns = [f"{sample_name}_adjusted_probability" ] - all_samples_mutability_per_site = coverage_and_probability[ ['CHROM', 'POS', 'REF', 'ALT'] + list_of_samples_columns ].fillna(0) + all_samples_mutability_per_site = coverage_and_probability[ ['CHROM', 'POS', 'REF', 'ALT', 'GENE'] + list_of_samples_columns ].fillna(0) all_samples_mutability_per_site = all_samples_mutability_per_site.sort_values( by = ['CHROM', 'POS', 'REF', 'ALT']).reset_index(drop = True) - sample_mutability_info = 
all_samples_mutability_per_site[['CHROM', 'POS', 'REF', 'ALT', f"{sample_name}_adjusted_probability"]] - print(sample_mutability_info[sample_mutability_info[f"{sample_name}_adjusted_probability"] > 0].head(50)) - sample_mutability_info.to_csv(out_mutability, - # f"{samp}.mutability_per_site.tsv", - sep = "\t", - header = False, - index = False) + sample_mutability_info = all_samples_mutability_per_site[['CHROM', 'POS', 'REF', 'ALT', 'GENE', f"{sample_name}_adjusted_probability"]] + + sample_mutability_info_store = sample_mutability_info.drop_duplicates(subset=['CHROM', 'POS', 'REF', 'ALT'], + keep='first').drop("GENE", axis = 1) + print("Mutabilities computed") + + # print(sample_mutability_info_store[sample_mutability_info_store[f"{sample_name}_adjusted_probability"] > 0].head(50)) + sample_mutability_info_store.to_csv(out_mutability, + sep = "\t", + header = False, + index = False) + print("mutability 1 stored") + + sample_mutability_info.to_csv(f"{out_mutability}.with_gene", + sep = "\t", + header = True, + index = False) + print("mutability 1 with gene stored") + + return sample_mutability_info + # else: @@ -173,6 +198,89 @@ def compute_mutabilities(sample_name, mutation_matrix, depths_file, mut_profile_ # header = False, # index = False) +def process_gene_chunk(chunk, sample_name, mutation_matrix_per_sample_gene_int): + processed_data_chunk = pd.DataFrame() + + for gene, gene_specific in chunk.groupby("GENE"): + if gene in mutation_matrix_per_sample_gene_int.index: + correction_factor = mutation_matrix_per_sample_gene_int.loc[gene, "COUNT"] + else: + correction_factor = 0 + + gene_specific[f"{sample_name}_adjusted_probability"] *= correction_factor + processed_data_chunk = pd.concat((processed_data_chunk, + gene_specific[['CHROM', 'POS', 'REF', 'ALT', f"{sample_name}_adjusted_probability"]] + ), + axis=0) + + return processed_data_chunk + + +def parallel_process_chunks(sample_name, mutation_matrix_per_sample_gene_int, sample_mutability_info_int, 
chunk_size=200): + + # Define the list of genes to process in chunks + genes_mutability = pd.unique(sample_mutability_info_int["GENE"]) + genes_mutated = pd.unique(mutation_matrix_per_sample_gene_int.index.values) + common_genes = list(set(genes_mutability) & set(genes_mutated)) + gene_chunks = np.array_split(common_genes, len(common_genes) // chunk_size + 1) + + with concurrent.futures.ProcessPoolExecutor() as executor: + chunks = [sample_mutability_info_int[sample_mutability_info_int["GENE"].isin(gene_chunk)] + for gene_chunk in gene_chunks] + print("chunks mutability created") + + chunks_matrix = [mutation_matrix_per_sample_gene_int[mutation_matrix_per_sample_gene_int.index.isin(gene_chunk)] + for gene_chunk in gene_chunks] + print("chunks matrix created") + + print("execution ready to start") + results = list(executor.map(process_gene_chunk, chunks, [sample_name] * len(chunks), chunks_matrix)) + print("execution finished") + + processed_data = pd.DataFrame() + for result in results: + processed_data = pd.concat((processed_data, result), axis=0) + + return processed_data + + +def adjust_mutabilities(sample_name, mutation_matrix_file, mutability_info, out_mutability_adjusted): + + # Load mutation profiles file + mutation_matrix = pd.read_csv(mutation_matrix_file, sep="\t", header=0) + print("mutation matrix loaded") + + mutation_matrix_per_sample_gene = mutation_matrix.groupby(by = ["SYMBOL"])["MUT_ID"].count().reset_index() + print("group by worked") + mutation_matrix_per_sample_gene.columns = ["SYMBOL", "COUNT"] + mutation_matrix_per_sample_gene = mutation_matrix_per_sample_gene.fillna(0) + mutation_matrix_per_sample_gene = mutation_matrix_per_sample_gene.set_index("SYMBOL") + mutation_matrix_per_sample_gene.index.name = "SYMBOL" + mutation_matrix_per_sample_gene = mutation_matrix_per_sample_gene / mutation_matrix_per_sample_gene.min() + print("normalization with minimum worked") + + + # Parallel processing + corrected_data = 
parallel_process_chunks(sample_name, mutation_matrix_per_sample_gene, mutability_info) + + + print("Corrected mutabilities") + del mutability_info + print("Removing second table") + corrected_data = corrected_data.sort_values(by = ['CHROM', 'POS', 'REF', 'ALT', f"{sample_name}_adjusted_probability"], + ascending = (True, True, True, True, False) + ) + print("Corrected data sorted") + + corrected_data = corrected_data.drop_duplicates(subset=['CHROM', 'POS', 'REF', 'ALT'], keep='first') + print("Corrected data duplicates removed") + + corrected_data.to_csv(out_mutability_adjusted, + sep = "\t", + header = False, + index = False) + + @click.command() @click.option('--sample_name', type=str, help='Name of the sample being processed.') @@ -181,6 +289,7 @@ def compute_mutabilities(sample_name, mutation_matrix, depths_file, mut_profile_ @click.option('--profile', type=click.Path(exists=True), help='Input mutational profile dataframe file.') @click.option('--bedfile', type=click.Path(exists=True), help='BED file of the regions.') @click.option('--out_mutability', type=click.Path(), help='Output mutability file.') +@click.option('--adjust_local_rate', is_flag=True, help='Generate an additional file with the mutabilities adjusted by the local mutation rate of each gene.') # @click.option('--json_filters', type=click.Path(exists=True), help='Input mutation filtering criteria file') # @click.option('--method', type=click.Choice(['unique', 'multiple']), default='unique') # @click.option('--pseud', type=float, default=0.5) @@ -188,12 +297,15 @@ def compute_mutabilities(sample_name, mutation_matrix, depths_file, mut_profile_ # @click.option('--trinucleotide_counts', type=click.Path(exists=True), help='Trinucleotide counts file (for profile mode)') # @click.option('--plot', is_flag=True, help='Generate plot and save as PDF') + # def main(mode, sample_name, mut_file, out_matrix, json_filters, method, pseud, mutation_matrix, trinucleotide_counts, out_profile, plot): -def 
main(sample_name, mutation_matrix, depths, profile, bedfile, out_mutability): +def main(sample_name, mutation_matrix, depths, profile, bedfile, out_mutability, adjust_local_rate): click.echo(f"Computing the mutabilities...") # click.echo(f"Using the pseudocount: {pseud}") - compute_mutabilities(sample_name, mutation_matrix, depths, profile, bedfile, out_mutability) + mutabilities_with_gene = compute_mutabilities(sample_name, depths, profile, bedfile, out_mutability) click.echo("Mutabilities computed.") + if adjust_local_rate: + adjust_mutabilities(sample_name, mutation_matrix, mutabilities_with_gene, f"{out_mutability}.adjusted") if __name__ == '__main__': main() diff --git a/bin/utils.py b/bin/utils.py index 38d305c3..8addab79 100644 --- a/bin/utils.py +++ b/bin/utils.py @@ -92,6 +92,14 @@ def filter_maf(maf_df, filter_criteria): contexts_no_change = ['ACA', 'ACC', 'ACG', 'ACT', 'CCA', 'CCC', 'CCG', 'CCT', 'GCA', 'GCC', 'GCG', 'GCT', 'TCA', 'TCC', 'TCG', 'TCT', 'ATA', 'ATC', 'ATG', 'ATT', 'CTA', 'CTC', 'CTG', 'CTT', 'GTA', 'GTC', 'GTG', 'GTT', 'TTA', 'TTC', 'TTG', 'TTT'] +contexts_formatted_sigprofiler = ['A[C>A]A', 'A[C>A]C', 'A[C>A]G', 'A[C>A]T', 'C[C>A]A', 'C[C>A]C', 'C[C>A]G', 'C[C>A]T', 'G[C>A]A', 'G[C>A]C', 'G[C>A]G', 'G[C>A]T', 'T[C>A]A', 'T[C>A]C', 'T[C>A]G', 'T[C>A]T', + 'A[C>G]A', 'A[C>G]C', 'A[C>G]G', 'A[C>G]T', 'C[C>G]A', 'C[C>G]C', 'C[C>G]G', 'C[C>G]T', 'G[C>G]A', 'G[C>G]C', 'G[C>G]G', 'G[C>G]T', 'T[C>G]A', 'T[C>G]C', 'T[C>G]G', 'T[C>G]T', + 'A[C>T]A', 'A[C>T]C', 'A[C>T]G', 'A[C>T]T', 'C[C>T]A', 'C[C>T]C', 'C[C>T]G', 'C[C>T]T', 'G[C>T]A', 'G[C>T]C', 'G[C>T]G', 'G[C>T]T', 'T[C>T]A', 'T[C>T]C', 'T[C>T]G', 'T[C>T]T', + 'A[T>A]A', 'A[T>A]C', 'A[T>A]G', 'A[T>A]T', 'C[T>A]A', 'C[T>A]C', 'C[T>A]G', 'C[T>A]T', 'G[T>A]A', 'G[T>A]C', 'G[T>A]G', 'G[T>A]T', 'T[T>A]A', 'T[T>A]C', 'T[T>A]G', 'T[T>A]T', + 'A[T>C]A', 'A[T>C]C', 'A[T>C]G', 'A[T>C]T', 'C[T>C]A', 'C[T>C]C', 'C[T>C]G', 'C[T>C]T', 'G[T>C]A', 'G[T>C]C', 'G[T>C]G', 'G[T>C]T', 'T[T>C]A', 'T[T>C]C', 
'T[T>C]G', 'T[T>C]T', + 'A[T>G]A', 'A[T>G]C', 'A[T>G]G', 'A[T>G]T', 'C[T>G]A', 'C[T>G]C', 'C[T>G]G', 'C[T>G]T', 'G[T>G]A', 'G[T>G]C', 'G[T>G]G', 'G[T>G]T', 'T[T>G]A', 'T[T>G]C', 'T[T>G]G', 'T[T>G]T'] + + # subs = [''.join(z) for z in itertools.product('CT', 'ACGT') if z[0] != z[1]] # flanks = [''.join(z) for z in itertools.product('ACGT', repeat=2)] # contexts_unformatted = sorted([(a, b) for a, b in itertools.product(subs, flanks)], key=lambda x: (x[0], x[1])) diff --git a/conf/base.config b/conf/base.config index 034ac0e5..5f9db70a 100644 --- a/conf/base.config +++ b/conf/base.config @@ -16,7 +16,7 @@ process { time = { check_max( 4.h * task.attempt, 'time' ) } errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' } - maxRetries = 1 + maxRetries = 3 maxErrors = '-1' // Process-specific resource requirements @@ -62,4 +62,7 @@ process { withName:CUSTOM_DUMPSOFTWAREVERSIONS { cache = false } + withLabel:process_low_fixed_cpus { + cpus = { check_max( 2 , 'cpus' ) } + } } diff --git a/conf/modules.config b/conf/modules.config index bf5d3746..d801e46c 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -84,9 +84,32 @@ process { ] } + withName: 'SUBSET_MUTABILITY' { + ext.filters = { ['"FILTER" : "notcontains n_rich"', + ['"VAF" : "le ', "${params.germline_threshold}", '"'].join('').trim(), + '"TYPE" : "SNV"', + ['"DEPTH" : "ge ', "${params.depth_threshold}", '"'].join('').trim(), + '"FILTER": "notcontains no_pileup_support"', + '"FILTER": "notcontains low_mappability"', + '"FILTER": "notcontains repetitive_variant"', + '"FILTER": "notcontains other_sample_SNP"'].join(',\t').trim() + } + ext.output_fmt = { ['"header": true', + '"columns": ["SAMPLE_ID", "MUT_ID", "ALT_DEPTH", "SYMBOL"]', + '"colnames": ["SAMPLE_ID", "MUT_ID", "ALT_DEPTH", "SYMBOL"]' + ].join(',\t').trim() + } + + publishDir = [ + enabled : false + ] + } + withName: 'COMPUTE_MATRIX' { ext.args = '--method unique \ - --pseud 0.5' + --pseud 0.5 \ + --per_sample \ + 
--sigprofiler' } withName: 'COMPUTE_TRINUCLEOTIDE' { @@ -94,7 +117,11 @@ process { } withName: 'COMPUTE_PROFILE' { - ext.args = '--plot' + ext.args = "--plot --wgs --sigprofiler --wgs_trinucleotide_counts ${params.wgs_trinuc_counts} " + } + + withName: 'COMPUTE_MUTABILITY' { + ext.args = "--adjust_local_rate" } withName: 'MUTABILITY_BGZIPTABIX' { diff --git a/conf/test.config b/conf/test.config index a4b7ce38..0b9d9511 100644 --- a/conf/test.config +++ b/conf/test.config @@ -16,9 +16,9 @@ params { // TODO: add a real small test // // Limit resources so that this can run on GitHub Actions - // max_cpus = 2 - // max_memory = '6.GB' - // max_time = '6.h' + max_cpus = 16 + max_memory = '60.GB' + max_time = '6.h' // Input data // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets @@ -30,5 +30,5 @@ params { annotated_depth = "/workspace/datasets/transfer/raquel_to_ferriol/deepCSA_test/kidney_annotated_depths_fake_duplexome.tsv.gz" bedf = "/workspace/datasets/transfer/raquel_to_ferriol/deepCSA_test/kidneypanel4oncodrivefml.filtered.lowcomp.filtered.unmappable.bed5.header.bed" - + bedf_annotated = "/workspace/datasets/transfer/raquel_to_ferriol/deepCSA_test/kidney.target_bed.tab.compact.tsv" } diff --git a/conf/test_duplexome.config b/conf/test_duplexome.config index 28d7a894..b1083cbf 100644 --- a/conf/test_duplexome.config +++ b/conf/test_duplexome.config @@ -25,5 +25,6 @@ params { annotated_depth = "/workspace/datasets/transfer/raquel_to_ferriol/deepCSA_test/kidney_annotated_depths_duplexome.tsv.gz" bedf = "/workspace/datasets/prominent/metadata/regions/data/oncodrivefml/xgen-exome-hyb-panel-v2-targets-hg38.oncodrivefml.bed5.bed" + bedf_annotated = "/workspace/datasets/prominent/metadata/regions/data/possible_sites/exome.annotation_summary.omega.tsv.gz" } diff --git a/modules/local/bbgtools/oncodrivefml/main.nf b/modules/local/bbgtools/oncodrivefml/main.nf index 4a172b8a..ce489a4f 100644 --- a/modules/local/bbgtools/oncodrivefml/main.nf +++ 
b/modules/local/bbgtools/oncodrivefml/main.nf @@ -6,7 +6,7 @@ process ONCODRIVEFML { // container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? // 'https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE': // 'biocontainers/YOUR-TOOL-HERE' }" - container 'docker.io/ferriolcalvet/oncodrivefml:mutabilities' + container 'docker.io/ferriolcalvet/oncodrivefml:latest' input: tuple val(meta), path(mutations), path(mutabilities), path(mutabilities_ind) diff --git a/modules/local/compute_mutability/main.nf b/modules/local/compute_mutability/main.nf index c7a1d1c8..cc9bb619 100644 --- a/modules/local/compute_mutability/main.nf +++ b/modules/local/compute_mutability/main.nf @@ -1,7 +1,7 @@ process COMPUTE_MUTABILITY { tag "$meta.id" - label 'process_single' + label 'process_low_fixed_cpus' label 'process_high_memory' // // conda "YOUR-TOOL-HERE" @@ -15,8 +15,14 @@ process COMPUTE_MUTABILITY { tuple val(meta2), path(depths) output: - tuple val(meta), path("*.mutability_per_site.tsv") , emit: mutability - path "versions.yml" , emit: versions + // TODO revise this to see which one is outputed and why + tuple val(meta), path("*.mutability_per_site.tsv") , emit: mutability_not_adjusted + tuple val(meta), path("*.mutability_per_site.tsv.adjusted") , emit: mutability + path "versions.yml" , emit: versions + + // tuple val(meta), path("*.mutability_per_site.tsv") , emit: mutability + // tuple val(meta), path("*.mutability_per_site.tsv.adjusted") , optional:true , emit: mutability_adjusted + // path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -31,7 +37,7 @@ process COMPUTE_MUTABILITY { --mutation_matrix ${matrix} \\ --depths ${depths} \\ --profile ${mut_profile} \\ - --bedfile ${params.bedf} \\ + --bedfile ${params.bedf_annotated} \\ --out_mutability ${prefix}.mutability_per_site.tsv \\ ${args} cat <<-END_VERSIONS > versions.yml diff --git a/modules/local/compute_profile/main.nf 
b/modules/local/compute_profile/main.nf index 4ce0b872..a30f4bd0 100644 --- a/modules/local/compute_profile/main.nf +++ b/modules/local/compute_profile/main.nf @@ -13,9 +13,11 @@ process COMPUTE_PROFILE { tuple val(meta), path(matrix), path(trinucleotide) output: - tuple val(meta), path("*.profile.tsv") , emit: profile - tuple val(meta), path("*.pdf") , emit: plots, optional:true - path "versions.yml" , emit: versions + tuple val(meta), path("*.profile.tsv") , emit: profile + tuple val(meta), path("*.pdf") , optional:true , emit: plots + tuple val(meta), path("*.matrix.WGS") , optional:true , emit: wgs + tuple val(meta), path("*.matrix.WGS.sigprofiler") , optional:true , emit: wgs_sigprofiler + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when diff --git a/modules/local/mutation_matrix/main.nf b/modules/local/mutation_matrix/main.nf index c96674f8..8da85512 100644 --- a/modules/local/mutation_matrix/main.nf +++ b/modules/local/mutation_matrix/main.nf @@ -13,8 +13,11 @@ process COMPUTE_MATRIX { tuple val(meta), path(mut_files) output: - tuple val(meta), path("*.matrix.tsv") , emit: matrix - path "versions.yml" , emit: versions + tuple val(meta), path("*.matrix.tsv") , emit: matrix + tuple val(meta), path("*.single.sigprofiler") , optional:true, emit: single_sigprof + tuple val(meta), path("*.per_sample") , optional:true, emit: per_sample + tuple val(meta), path("*.per_sample.sigprofiler") , optional:true, emit: per_sample_sigprof + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when diff --git a/modules/local/sig_matrix_concat/main.nf b/modules/local/sig_matrix_concat/main.nf new file mode 100644 index 00000000..452003df --- /dev/null +++ b/modules/local/sig_matrix_concat/main.nf @@ -0,0 +1,52 @@ +process MATRIX_CONCAT { + tag "$meta.id" + label 'process_high' + + // // conda "YOUR-TOOL-HERE" + // container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ // 'https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE': + // 'biocontainers/YOUR-TOOL-HERE' }" + container 'docker.io/ferriolcalvet/bgreference' + + input: + tuple val(meta), path(matrix_files) + + output: + tuple val(meta), path("*.wgs.tsv") , emit: wgs_tsv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: "" + def prefix = task.ext.prefix ?: "${meta.id}" + """ + #for file in ${matrix_files}; do + for file in *.WGS.sigprofiler; do + if [ -s ${prefix}.final_matrix.wgs.tsv ]; then + paste \$file <(cut -f 2- ${prefix}.final_matrix.wgs.tsv) > final_matrix.tsv.tmp; + else + mv \$file final_matrix.tsv.tmp; + fi + mv final_matrix.tsv.tmp ${prefix}.final_matrix.wgs.tsv; + done; + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.final_matrix.wgs.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ +} diff --git a/modules/local/sig_matrix_concat/meta.yml b/modules/local/sig_matrix_concat/meta.yml new file mode 100644 index 00000000..e69de29b diff --git a/modules/local/signatures/sigprofiler/assignment/main.nf b/modules/local/signatures/sigprofiler/assignment/main.nf index 0d2e30ed..9f6e6d88 100644 --- a/modules/local/signatures/sigprofiler/assignment/main.nf +++ b/modules/local/signatures/sigprofiler/assignment/main.nf @@ -1,47 +1,18 @@ -// TODO nf-core: If in doubt look at other nf-core/modules to see how we are doing things! 
:) -// https://github.com/nf-core/modules/tree/master/modules/nf-core/ -// You can also ask for help via your pull request or on the #modules channel on the nf-core Slack workspace: -// https://nf-co.re/join -// TODO nf-core: A module file SHOULD only define input and output files as command-line parameters. -// All other parameters MUST be provided using the "task.ext" directive, see here: -// https://www.nextflow.io/docs/latest/process.html#ext -// where "task.ext" is a string. -// Any parameters that need to be evaluated in the context of a particular sample -// e.g. single-end/paired-end data MUST also be defined and evaluated appropriately. -// TODO nf-core: Software that can be piped together SHOULD be added to separate module files -// unless there is a run-time, storage advantage in implementing in this way -// e.g. it's ok to have a single module for bwa to output BAM instead of SAM: -// bwa mem | samtools view -B -T ref.fasta -// TODO nf-core: Optional inputs are not currently supported by Nextflow. However, using an empty -// list (`[]`) instead of a file can be used to work around this issue. - process SIGPROFILERASSIGNMENT { tag "$meta.id" - label 'process_single' + label 'process_medium' + - // TODO nf-core: List required Conda package(s). - // Software MUST be pinned to channel (i.e. "bioconda"), version (i.e. "1.10"). - // For Conda, the build (i.e. "h9402c20_2") must be EXCLUDED to support installation on different operating systems. - // TODO nf-core: See section in main README for further information regarding finding and adding container addresses to the section below. - conda "YOUR-TOOL-HERE" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE': - 'biocontainers/YOUR-TOOL-HERE' }" + container 'docker.io/ferriolcalvet/sigprofilerassignment' input: - // TODO nf-core: Where applicable all sample-specific information e.g. 
"id", "single_end", "read_group" - // MUST be provided as an input via a Groovy Map called "meta". - // This information may not be required in some instances e.g. indexing reference genome files: - // https://github.com/nf-core/modules/blob/master/modules/nf-core/bwa/index/main.nf - // TODO nf-core: Where applicable please provide/convert compressed files as input/output - // e.g. "*.fastq.gz" and NOT "*.fastq", "*.bam" and NOT "*.sam" etc. - tuple val(meta), path(bam) + tuple val(meta), path(matrix) + path(reference_signatures) output: - // TODO nf-core: Named file extensions MUST be emitted for ALL output channels - tuple val(meta), path("*.bam"), emit: bam - // TODO nf-core: List additional required output channels/values here - path "versions.yml" , emit: versions + tuple val(meta), path("**.pdf"), emit: plots + tuple val(meta), path("**.txt"), emit: stats + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -49,23 +20,15 @@ process SIGPROFILERASSIGNMENT { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - // TODO nf-core: Where possible, a command MUST be provided to obtain the version number of the software e.g. 1.10 - // If the software is unable to output a version number on the command-line then it can be manually specified - // e.g. https://github.com/nf-core/modules/blob/master/modules/nf-core/homer/annotatepeaks/main.nf - // Each software used MUST provide the software name and version number in the YAML version file (versions.yml) - // TODO nf-core: It MUST be possible to pass additional parameters to the tool as a command-line string via the "task.ext.args" directive - // TODO nf-core: If the tool supports multi-threading then you MUST provide the appropriate parameter - // using the Nextflow "task" variable e.g. 
"--threads $task.cpus" - // TODO nf-core: Please replace the example samtools command below with your module's command - // TODO nf-core: Please indent the command appropriately (4 spaces!!) to help with readability ;) + // python -c "from SigProfilerAssignment import Analyzer as Analyze; Analyze.cosmic_fit('${matrix}', 'output', input_type='matrix', context_type='96', + // collapse_to_SBS96=True, cosmic_version=3.4, exome=False, + // genome_build="GRCh38", signature_database='${reference_signatures}', + // exclude_signature_subgroups=None, export_probabilities=False, + // export_probabilities_per_mutation=False, make_plots=True, + // sample_reconstruction_plots=False, verbose=False)" """ - samtools \\ - sort \\ - $args \\ - -@ $task.cpus \\ - -o ${prefix}.bam \\ - -T $prefix \\ - $bam + #python -c "from SigProfilerAssignment import Analyzer as Analyze; Analyze.cosmic_fit('${matrix}', 'output', input_type='matrix', context_type='96', signature_database='${reference_signatures}', genome_build='GRCh38', sample_reconstruction_plots= 'pdf', exclude_signature_subgroups=['Chemotherapy_signatures','Immunosuppressants_signatures'])" + python -c "from SigProfilerAssignment import Analyzer as Analyze; Analyze.cosmic_fit('${matrix}', 'output', input_type='matrix', context_type='96', genome_build='GRCh38', exclude_signature_subgroups=['Chemotherapy_signatures','Immunosuppressants_signatures'])" cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -76,12 +39,8 @@ process SIGPROFILERASSIGNMENT { stub: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - // TODO nf-core: A stub section should mimic the execution of the original module as best as possible - // Have a look at the following examples: - // Simple example: https://github.com/nf-core/modules/blob/818474a292b4860ae8ff88e149fbcda68814114d/modules/nf-core/bcftools/annotate/main.nf#L47-L63 - // Complex example: 
https://github.com/nf-core/modules/blob/818474a292b4860ae8ff88e149fbcda68814114d/modules/nf-core/bedtools/split/main.nf#L38-L54 """ - touch ${prefix}.bam + touch ${prefix}.pdf cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/nextflow.config b/nextflow.config index 82c14ab9..e907d31d 100644 --- a/nextflow.config +++ b/nextflow.config @@ -19,10 +19,14 @@ params { // depths annotated_depth = "/workspace/datasets/transfer/raquel_to_ferriol/deepCSA_test/kidney_annotated_depths_fake_duplexome.tsv.gz" - wgs_trinucleotide_counts = "https://raw.githubusercontent.com/weghornlab/SigNet/main/signet/data/real_data/3mer_WG_hg38.txt" + + // wgs_trinucleotide_counts = "https://raw.githubusercontent.com/weghornlab/SigNet/main/signet/data/real_data/3mer_WG_hg38.txt" + cosmic_ref_signatures = "/workspace/datasets/transfer/raquel_to_ferriol/deepCSA_test/COSMIC_v3.4_SBS_GRCh38.txt" + wgs_trinuc_counts = "/workspace/datasets/transfer/raquel_to_ferriol/deepCSA_test/WGS_trinucleotide_counts.txt" // oncodrivefml bedf = "/workspace/datasets/transfer/raquel_to_ferriol/deepCSA_test/kidneypanel4oncodrivefml.filtered.lowcomp.filtered.unmappable.bed5.header.bed" + bedf_annotated = "/workspace/datasets/transfer/raquel_to_ferriol/deepCSA_test/kidney.target_bed.tab.compact.tsv" // oncodrive3d datasets3d = "/workspace/projects/clustering_3d/clustering_3d/datasets" @@ -74,8 +78,8 @@ params { // Max resource options // Defaults only, expecting to be overwritten - max_memory = '128.GB' - max_cpus = 16 + max_memory = '256.GB' + max_cpus = 56 max_time = '240.h' // Schema validation default options diff --git a/subworkflows/local/mutationprofile/main.nf b/subworkflows/local/mutationprofile/main.nf index 2327dfd7..5bc01375 100644 --- a/subworkflows/local/mutationprofile/main.nf +++ b/subworkflows/local/mutationprofile/main.nf @@ -4,6 +4,7 @@ // include { INTERSECT_BED as BED_INTERSECTNONPA } from '../../modules/local/bedtools/intersect/main' include { SUBSET_MAF as SUBSET_MUTPROFILE } 
from '../../../modules/local/subsetmaf/main' +include { SUBSET_MAF as SUBSET_MUTABILITY } from '../../../modules/local/subsetmaf/main' include { COMPUTE_MATRIX as COMPUTEMATRIX } from '../../../modules/local/mutation_matrix/main' include { COMPUTE_PROFILE as COMPUTEPROFILE } from '../../../modules/local/compute_profile/main' @@ -43,31 +44,44 @@ workflow MUTATIONAL_PROFILE { // .set { all_beds } SUBSET_MUTPROFILE(mutations) + SUBSET_MUTABILITY(mutations) COMPUTEMATRIX(SUBSET_MUTPROFILE.out.mutations) + COMPUTEMATRIX.out.per_sample_sigprof + .join( Channel.of([ [ id: "all_samples" ], [] ]) ) + .map{ it -> [ it[0], it[1]] } + .set{ sigprofiler_matrix } + COMPUTETRINUC(depth) COMPUTETRINUC.out.trinucleotides.flatten().map{ it -> [ [id : it.name.tokenize('.')[0]] , it] }.set{ named_trinucleotides } - COMPUTEMATRIX.out.matrix .join(named_trinucleotides) .set{ matrix_n_trinucleotide } COMPUTEPROFILE(matrix_n_trinucleotide) - COMPUTEMATRIX.out.matrix + SUBSET_MUTABILITY.out.mutations .join(COMPUTEPROFILE.out.profile) - .set{ matrix_n_profile } + .set{ mutations_n_profile } - COMPUTEMUTABILITY( matrix_n_profile, depth.first() ) + COMPUTEMUTABILITY( mutations_n_profile, depth.first() ) MUTABILITY_BGZIPTABIX( COMPUTEMUTABILITY.out.mutability ) + sigprofiler_empty = Channel.of([]) + sigprofiler_empty + .concat(COMPUTEPROFILE.out.wgs_sigprofiler) + .set{ sigprofiler_wgs } + emit: - profile = COMPUTEPROFILE.out.profile // channel: [ val(meta), file(profile) ] - mutability = MUTABILITY_BGZIPTABIX.out.gz_tbi // channel: [ val(meta), file(mutabilities), file(mutabilities_index) ] - versions = ch_versions // channel: [ versions.yml ] + profile = COMPUTEPROFILE.out.profile // channel: [ val(meta), file(profile) ] + mutability = MUTABILITY_BGZIPTABIX.out.gz_tbi // channel: [ val(meta), file(mutabilities), file(mutabilities_index) ] + matrix_sigprof = sigprofiler_matrix + trinucleotides = named_trinucleotides + wgs_sigprofiler = sigprofiler_wgs + versions = ch_versions // channel: [ 
versions.yml ] } diff --git a/subworkflows/local/signatures/main.nf b/subworkflows/local/signatures/main.nf index 30fe50e4..138fc8ba 100644 --- a/subworkflows/local/signatures/main.nf +++ b/subworkflows/local/signatures/main.nf @@ -1,21 +1,28 @@ -include { COMPUTEDEPTHS } from '../../modules/local/computedepths/main' +include { SIGPROFILERASSIGNMENT } from '../../../modules/local/signatures/sigprofiler/assignment/main' +include { MATRIX_CONCAT } from '../../../modules/local/sig_matrix_concat/main' -workflow DEPTH_ANALYSIS{ + +workflow SIGNATURES { take: // inputs - bam_list + matrix + reference_signatures main: // actual code ch_versions = Channel.empty() - COMPUTEDEPTHS(bam_list) + matrix.map{ it -> it[1] }.collect().map{ it -> [[ id:"all_samples" ], it]}.set{ matrix_all_samples } + + MATRIX_CONCAT(matrix_all_samples) + + SIGPROFILERASSIGNMENT(MATRIX_CONCAT.out.wgs_tsv, reference_signatures) // PROCESSDEPTHSTABLE() // PLOTDEPTHS() emit: - depths = COMPUTEDEPTHS.out.depths // channel: [ val(meta), file(depths) ] - versions = ch_versions // channel: [ versions.yml ] + plots = SIGPROFILERASSIGNMENT.out.plots // channel: [ val(meta), file(plots) ] + versions = ch_versions // channel: [ versions.yml ] } diff --git a/workflows/deepcsa.nf b/workflows/deepcsa.nf index 8fd6c93a..72d641e3 100644 --- a/workflows/deepcsa.nf +++ b/workflows/deepcsa.nf @@ -46,7 +46,7 @@ include { ONCODRIVE3D_ANALYSIS as ONCODRIVE3D } from '../subworkflows/lo // include { OMEGA_ANALYSIS as OMEGA } from '../subworkflows/local/omega/main' // include { DEPTH_ANALYSIS as ONCODRIVECLUSTL } from '../subworkflows/local/depthanalysis/main' -// include { DEPTH_ANALYSIS as SIGNATURES } from '../subworkflows/local/depthanalysis/main' +include { SIGNATURES as SIGNATURES } from '../subworkflows/local/signatures/main' // include { DEPTH_ANALYSIS as MUTRATE } from '../subworkflows/local/depthanalysis/main' @@ -169,7 +169,8 @@ workflow DEEPCSA { // ONCODRIVECLUSTL() // Signature Analysis - // SIGNATURES() + 
SIGNATURES(MUTPROFILE.out.wgs_sigprofiler, params.cosmic_ref_signatures) + ch_versions = ch_versions.mix(SIGNATURES.out.versions) // Mutation Rate // MUTRATE()