From 92ba2b040c799dc0f6822c26548a553d3c5b68ed Mon Sep 17 00:00:00 2001 From: FerriolCalvet Date: Wed, 5 Feb 2025 13:58:38 +0100 Subject: [PATCH 1/3] update filters and add missing params to schema --- bin/filter_cohort.py | 48 ++++++--------------------------- conf/modules.config | 4 +-- modules/local/filtermaf/main.nf | 6 ++--- nextflow.config | 4 +++ nextflow_schema.json | 38 ++++++++++++++++++++++++++ 5 files changed, 54 insertions(+), 46 deletions(-) diff --git a/bin/filter_cohort.py b/bin/filter_cohort.py index 7f4b6d9b..6a6ddeef 100755 --- a/bin/filter_cohort.py +++ b/bin/filter_cohort.py @@ -7,8 +7,9 @@ maf_df_file = sys.argv[1] samp_name = sys.argv[2] -repetitive_variant_treshold = int(sys.argv[3]) +repetitive_variant_threshold = int(sys.argv[3]) somatic_vaf_boundary = float(sys.argv[4]) +n_rich_cohort_proportion = float(sys.argv[5]) maf_df = pd.read_csv(maf_df_file, compression='gzip', header = 0, sep='\t', na_values = custom_na_values) @@ -16,50 +17,16 @@ -# def correct_vaf(maf): - -# """ -# Computes VAF_CORRECTED for the subset of variants satisfying 0 < VAF < 0.2 -# Returns the input MAF with two new columns: -# VAF_CORRECTED with new corrected VAF else it copies the VAF -# IS_VAF_CORRECTED with a boolean that indicates whether the VAF has been corrected -# """ - -# # TODO revise the 0.2 VAF threshold to see if it can be kept across datasets -# df = maf[(0 < maf['VAF']) & (maf['VAF'] < 0.2)][['SAMPLE_ID', 'MUT_ID', 'VAF', 'DEPTH']] -# df = df.sort_values('DEPTH') -# N = df.shape[0] -# df['VAF_ROLLING_MEAN'] = df['VAF'].rolling(N // 25).mean() -# df['VAF_ROLLING_STD'] = df['VAF'].rolling(N // 25).std() -# stable_mean = df['VAF_ROLLING_MEAN'].values[-1] -# stable_std = df['VAF_ROLLING_STD'].values[-1] -# df['VAF_CORRECTED'] = df.apply(lambda r: (r['VAF'] - r['VAF_ROLLING_MEAN']) * (stable_std / r['VAF_ROLLING_STD']) + stable_mean, axis=1) -# df = maf.merge(df[['VAF_CORRECTED', 'MUT_ID', 'SAMPLE_ID']], -# on=['MUT_ID', 'SAMPLE_ID'], -# how='outer') -# df['IS_VAF_CORRECTED'] = ~df['VAF_CORRECTED'].isnull() -# df.loc[~df['IS_VAF_CORRECTED'], 'VAF_CORRECTED'] = df[~df['IS_VAF_CORRECTED']]['VAF'].values -# return df - - - -# ####### -# ### Add a corrected VAF column -# ####### -# maf_df = correct_vaf(maf_df) -# print("VAF corrected") - - ####### ### Filter repetitive variants ####### -# TODO revise these numbers, the repetitive_variant_treshold is the boundary at which we start considering a mutation as "repetitive" +# TODO revise these numbers, the repetitive_variant_threshold is the boundary at which we start considering a mutation as "repetitive" max_samples = len(pd.unique(maf_df["SAMPLE_ID"])) -n_samples = list(range(repetitive_variant_treshold, max_samples + 1)) +n_samples = list(range(repetitive_variant_threshold, max_samples + 1)) if len(n_samples) == 0: print("Not enough samples to identify potential repetitive variants!") @@ -73,7 +40,7 @@ maf_df_f_somatic["count"] = 1 maf_df_f_somatic_pivot = maf_df_f_somatic.groupby("MUT_ID")["count"].sum().reset_index() - repetitive_variants = maf_df_f_somatic_pivot[maf_df_f_somatic_pivot["count"] >= repetitive_variant_treshold]["MUT_ID"] + repetitive_variants = maf_df_f_somatic_pivot[maf_df_f_somatic_pivot["count"] >= repetitive_variant_threshold]["MUT_ID"] maf_df["repetitive_variant"] = maf_df["MUT_ID"].isin(repetitive_variants) @@ -95,6 +62,7 @@ print("Not enough samples to identify cohort_n_rich mutations!") else: + proportion_of_samples = (max_samples * n_rich_cohort_proportion) // 1 # work with already filtered df + somatic only to explore potential artifacts # take only variant and sample info from the df @@ -105,7 +73,7 @@ ].agg({'SAMPLE_ID' : len, 'VAF_Ns' : min}) n_rich_vars_df = n_rich_vars_df.rename({'SAMPLE_ID' : 'frequency', 'VAF_Ns' : 'VAF_Ns_threshold'}, axis = 'columns') - n_rich_vars = list(n_rich_vars_df[n_rich_vars_df['frequency'] >= 2].index) + n_rich_vars = list(n_rich_vars_df[n_rich_vars_df['frequency'] >= proportion_of_samples].index) maf_df["cohort_n_rich"] = maf_df["MUT_ID"].isin(n_rich_vars) @@ -132,7 +100,7 @@ maf_df['frequency'] = maf_df['frequency'].fillna(0) maf_df['VAF_Ns_threshold'] = maf_df['VAF_Ns_threshold'].fillna(1.1) - maf_df["cohort_n_rich_threshold"] = maf_df["VAF_Ns"] > maf_df['VAF_Ns_threshold'] + maf_df["cohort_n_rich_threshold"] = maf_df["VAF_Ns"] >= maf_df['VAF_Ns_threshold'] maf_df["FILTER"] = maf_df[["FILTER","cohort_n_rich_threshold"]].apply(lambda x: add_filter(x["FILTER"], x["cohort_n_rich_threshold"], "cohort_n_rich_threshold"), axis = 1 diff --git a/conf/modules.config b/conf/modules.config index fc388046..e444d07b 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -316,7 +316,7 @@ process { ['"VAF_AM" : "le ', "${params.germline_threshold}", '"'].join('').trim(), ['"vd_VAF" : "le ', "${params.germline_threshold}", '"'].join('').trim(), ['"DEPTH" : "ge ', "${params.mutation_depth_threshold}", '"'].join('').trim(), - '"FILTER" : ["notcontains NM20", "notcontains n_rich", "notcontains cohort_n_rich_threshold", "notcontains no_pileup_support", "notcontains low_mappability" ]', + '"FILTER" : ["notcontains NM20", "notcontains p8", "notcontains n_rich", "notcontains cohort_n_rich_threshold", "notcontains cohort_n_rich", "notcontains no_pileup_support", "notcontains low_mappability" ]', '"VAF_distorted_expanded_sq" : false', ].join(',\t').trim() } @@ -336,7 +336,7 @@ process { ext.filters = { [['"VAF" : "le ', "${params.germline_threshold}", '"'].join('').trim(), ['"vd_VAF" : "le ', "${params.germline_threshold}", '"'].join('').trim(), ['"DEPTH" : "ge ', "${params.mutation_depth_threshold}", '"'].join('').trim(), - '"FILTER" : ["notcontains NM20", "notcontains n_rich", "notcontains cohort_n_rich_threshold", "notcontains no_pileup_support", "notcontains low_mappability" ]', + '"FILTER" : ["notcontains NM20", "notcontains p8", "notcontains n_rich", "notcontains cohort_n_rich_threshold", "notcontains cohort_n_rich", "notcontains no_pileup_support", "notcontains low_mappability" ]', '"VAF_distorted_expanded_sq" : false', ].join(',\t').trim() } diff --git a/modules/local/filtermaf/main.nf b/modules/local/filtermaf/main.nf index 5b7a2f35..d91f6fa6 100644 --- a/modules/local/filtermaf/main.nf +++ b/modules/local/filtermaf/main.nf @@ -20,12 +20,12 @@ process FILTER_BATCH { task.ext.when == null || task.ext.when script: - def args = task.ext.args ?: "" def prefix = task.ext.prefix ?: "${meta.id}" def repetitive_variant = task.ext.repetitive_variant ?: "${params.repetitive_variant_thres}" def germline_threshold = task.ext.germline_threshold ?: "${params.germline_threshold}" + def proportion_samples_nrich = task.ext.prop_samples_nrich ?: "${params.prop_samples_nrich}" """ - filter_cohort.py ${maf} ${prefix} ${repetitive_variant} ${germline_threshold} + filter_cohort.py ${maf} ${prefix} ${repetitive_variant} ${germline_threshold} ${proportion_samples_nrich} cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -34,8 +34,6 @@ process FILTER_BATCH { """ stub: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" """ touch all_samples.cohort.filtered.tsv.gz diff --git a/nextflow.config b/nextflow.config index 4566de2f..44927fa3 100644 --- a/nextflow.config +++ b/nextflow.config @@ -21,6 +21,9 @@ params { custom_groups_file = null custom_groups_separator = 'tab' + customize_annotation = false + custom_annotation_tsv = '' + use_custom_bedfile = false custom_bedfile = null use_custom_minimum_depth = 0 @@ -76,6 +79,7 @@ params { mutation_depth_threshold = 40 repetitive_variant_thres = 5 + prop_samples_nrich = 0.1 cosmic_ref_signatures = "COSMIC_v3.4_SBS_GRCh38.txt" wgs_trinuc_counts = "assets/trinucleotide_counts/trinuc_counts.homo_sapiens.tsv" diff --git a/nextflow_schema.json b/nextflow_schema.json index f64d2643..9fca6621 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -188,6 +188,30 @@ } } }, + "custom_annotation_configs": { + "title": "Custom annotation configurations", + "type": "object", + "fa_icon": "fas fa-dna", + "description": "BED-like file to customize the annotation of variants falling in certain regions.", + "properties": { + "customize_annotation": { + "type": "boolean", + "description": "Do you want to use a custom BED file?", + "fa_icon": "fas fa-book", + "help_text": "Do you want to use a custom BED file?" + }, + "custom_annotation_tsv": { + "type": "string", + "format": "file-path", + "exists": true, + "mimetype": "text/plain", + "default": null, + "description": "TSV file with the following columns: chromosome start end gene_name impactful_mutations neutral_impact new_impact", + "help_text": "TSV file with the following columns: chromosome start end gene_name impactful_mutations neutral_impact new_impact", + "fa_icon": "far fa-file-code" + } + } + }, "hotspot_bedfile_configs": { "title": "Definition of hotspot regions configuration", "type": "object", @@ -425,6 +449,13 @@ "default": 5, "fa_icon": "fas fa-book", "help_text": "" + }, + "prop_samples_nrich": { + "type": "number", + "description": "Any variant that is n_rich in this proportion of samples is flagged as cohort_n_rich variant.", + "default": 0.1, + "fa_icon": "fas fa-book", + "help_text": "" } } }, @@ -545,6 +576,13 @@ "help_text": "", "default": "--no_stats --cache --offline --symbol --protein --canonical --af_gnomadg --af_gnomade" }, + "vep_params_panel": { + "type": "string", + "description": "Additional parameters for running Ensembl VEP for the panel annotation.", + "fa_icon": "fas fa-book", + "help_text": "", + "default": "--no_stats --cache --offline --symbol --protein --canonical" + }, "download_cache": { "type": "boolean", "description": "Do you want to download the cache? Activate it only if necessary.", From d47f4de4aaef9130eb90aa37135f336faa517705 Mon Sep 17 00:00:00 2001 From: FerriolCalvet Date: Wed, 5 Feb 2025 15:23:54 +0100 Subject: [PATCH 2/3] upd implementation of cohort_n_rich - define number of samples at at least 2 --- bin/filter_cohort.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bin/filter_cohort.py b/bin/filter_cohort.py index 6a6ddeef..a9717c8f 100755 --- a/bin/filter_cohort.py +++ b/bin/filter_cohort.py @@ -62,7 +62,8 @@ print("Not enough samples to identify cohort_n_rich mutations!") else: - proportion_of_samples = (max_samples * n_rich_cohort_proportion) // 1 + number_of_samples = max(2, (max_samples * n_rich_cohort_proportion) // 1) + print(f"flagging mutations that are n_rich in at least: {number_of_samples} samples as cohort_n_rich") # work with already filtered df + somatic only to explore potential artifacts # take only variant and sample info from the df @@ -73,7 +74,7 @@ ].agg({'SAMPLE_ID' : len, 'VAF_Ns' : min}) n_rich_vars_df = n_rich_vars_df.rename({'SAMPLE_ID' : 'frequency', 'VAF_Ns' : 'VAF_Ns_threshold'}, axis = 'columns') - n_rich_vars = list(n_rich_vars_df[n_rich_vars_df['frequency'] >= proportion_of_samples].index) + n_rich_vars = list(n_rich_vars_df[n_rich_vars_df['frequency'] >= number_of_samples].index) maf_df["cohort_n_rich"] = maf_df["MUT_ID"].isin(n_rich_vars) From 3934583a889897037a15118fbb6bb77ab301d258 Mon Sep 17 00:00:00 2001 From: FerriolCalvet Date: Fri, 7 Feb 2025 11:04:04 +0100 Subject: [PATCH 3/3] apply not_covered filter to somatic mutations --- conf/modules.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index e444d07b..e76c7707 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -316,7 +316,7 @@ process { ['"VAF_AM" : "le ', "${params.germline_threshold}", '"'].join('').trim(), ['"vd_VAF" : "le ', "${params.germline_threshold}", '"'].join('').trim(), ['"DEPTH" : "ge ', "${params.mutation_depth_threshold}", '"'].join('').trim(), - '"FILTER" : ["notcontains NM20", "notcontains p8", "notcontains n_rich", "notcontains cohort_n_rich_threshold", "notcontains cohort_n_rich", "notcontains no_pileup_support", "notcontains low_mappability" ]', + '"FILTER" : ["notcontains NM20", "notcontains p8", "notcontains n_rich", "notcontains cohort_n_rich_threshold", "notcontains cohort_n_rich", "notcontains no_pileup_support", "notcontains low_mappability", "notcontains not_covered" ]', '"VAF_distorted_expanded_sq" : false', ].join(',\t').trim() } @@ -336,7 +336,7 @@ process { ext.filters = { [['"VAF" : "le ', "${params.germline_threshold}", '"'].join('').trim(), ['"vd_VAF" : "le ', "${params.germline_threshold}", '"'].join('').trim(), ['"DEPTH" : "ge ', "${params.mutation_depth_threshold}", '"'].join('').trim(), - '"FILTER" : ["notcontains NM20", "notcontains p8", "notcontains n_rich", "notcontains cohort_n_rich_threshold", "notcontains cohort_n_rich", "notcontains no_pileup_support", "notcontains low_mappability" ]', + '"FILTER" : ["notcontains NM20", "notcontains p8", "notcontains n_rich", "notcontains cohort_n_rich_threshold", "notcontains cohort_n_rich", "notcontains no_pileup_support", "notcontains low_mappability", "notcontains not_covered" ]', '"VAF_distorted_expanded_sq" : false', ].join(',\t').trim() }