From 92ba2b040c799dc0f6822c26548a553d3c5b68ed Mon Sep 17 00:00:00 2001
From: FerriolCalvet <ferriolcalvet@gmail.com>
Date: Wed, 5 Feb 2025 13:58:38 +0100
Subject: [PATCH 1/3] update filters and add missing params to schema

---
 bin/filter_cohort.py            | 48 ++++++---------------------------
 conf/modules.config             |  4 +--
 modules/local/filtermaf/main.nf |  6 ++---
 nextflow.config                 |  4 +++
 nextflow_schema.json            | 38 ++++++++++++++++++++++++++
 5 files changed, 54 insertions(+), 46 deletions(-)

diff --git a/bin/filter_cohort.py b/bin/filter_cohort.py
index 7f4b6d9b..6a6ddeef 100755
--- a/bin/filter_cohort.py
+++ b/bin/filter_cohort.py
@@ -7,8 +7,9 @@
 
 maf_df_file = sys.argv[1]
 samp_name = sys.argv[2]
-repetitive_variant_treshold = int(sys.argv[3])
+repetitive_variant_threshold = int(sys.argv[3])
 somatic_vaf_boundary = float(sys.argv[4])
+n_rich_cohort_proportion = float(sys.argv[5])
 
 maf_df = pd.read_csv(maf_df_file, compression='gzip', header = 0, sep='\t', na_values = custom_na_values)
 
@@ -16,50 +17,16 @@
 
 
 
-# def correct_vaf(maf):
-
-#     """
-#     Computes VAF_CORRECTED for the subset of variants satisfying 0 < VAF < 0.2
-#     Returns the input MAF with two new columns:
-#         VAF_CORRECTED with new corrected VAF else it copies the VAF
-#         IS_VAF_CORRECTED with a boolean that indicates whether the VAF has been corrected
-#     """
-
-#     # TODO revise the 0.2 VAF threshold to see if it can be kept across datasets
-#     df = maf[(0 < maf['VAF']) & (maf['VAF'] < 0.2)][['SAMPLE_ID', 'MUT_ID', 'VAF', 'DEPTH']]
-#     df = df.sort_values('DEPTH')
-#     N  = df.shape[0]
-#     df['VAF_ROLLING_MEAN'] = df['VAF'].rolling(N // 25).mean()
-#     df['VAF_ROLLING_STD'] = df['VAF'].rolling(N // 25).std()
-#     stable_mean = df['VAF_ROLLING_MEAN'].values[-1]
-#     stable_std  = df['VAF_ROLLING_STD'].values[-1]
-#     df['VAF_CORRECTED'] = df.apply(lambda r: (r['VAF'] - r['VAF_ROLLING_MEAN']) * (stable_std / r['VAF_ROLLING_STD']) + stable_mean, axis=1)
-#     df = maf.merge(df[['VAF_CORRECTED', 'MUT_ID', 'SAMPLE_ID']],
-#                                     on=['MUT_ID', 'SAMPLE_ID'],
-#                                     how='outer')
-#     df['IS_VAF_CORRECTED'] = ~df['VAF_CORRECTED'].isnull()
-#     df.loc[~df['IS_VAF_CORRECTED'], 'VAF_CORRECTED'] = df[~df['IS_VAF_CORRECTED']]['VAF'].values
-#     return df
-
-
-
-# #######
-# ###  Add a corrected VAF column
-# #######
-# maf_df = correct_vaf(maf_df)
-# print("VAF corrected")
-
-
 
 
 #######
 ###  Filter repetitive variants
 #######
 
-# TODO revise these numbers, the repetitive_variant_treshold is the boundary at which we start considering a mutation as "repetitive"
+# TODO revise these numbers, the repetitive_variant_threshold is the boundary at which we start considering a mutation as "repetitive"
 max_samples = len(pd.unique(maf_df["SAMPLE_ID"]))
 
-n_samples = list(range(repetitive_variant_treshold, max_samples + 1))
+n_samples = list(range(repetitive_variant_threshold, max_samples + 1))
 if len(n_samples) == 0:
     print("Not enough samples to identify potential repetitive variants!")
 
@@ -73,7 +40,7 @@
     maf_df_f_somatic["count"] = 1
     maf_df_f_somatic_pivot = maf_df_f_somatic.groupby("MUT_ID")["count"].sum().reset_index()
 
-    repetitive_variants = maf_df_f_somatic_pivot[maf_df_f_somatic_pivot["count"] >= repetitive_variant_treshold]["MUT_ID"]
+    repetitive_variants = maf_df_f_somatic_pivot[maf_df_f_somatic_pivot["count"] >= repetitive_variant_threshold]["MUT_ID"]
 
     maf_df["repetitive_variant"] = maf_df["MUT_ID"].isin(repetitive_variants)
 
@@ -95,6 +62,7 @@
     print("Not enough samples to identify cohort_n_rich mutations!")
 
 else:
+    proportion_of_samples = (max_samples * n_rich_cohort_proportion) // 1
 
     # work with already filtered df + somatic only to explore potential artifacts
     # take only variant and sample info from the df
@@ -105,7 +73,7 @@
                                         ].agg({'SAMPLE_ID' : len, 'VAF_Ns' : min})
     n_rich_vars_df = n_rich_vars_df.rename({'SAMPLE_ID' : 'frequency', 'VAF_Ns' : 'VAF_Ns_threshold'}, axis = 'columns')
 
-    n_rich_vars = list(n_rich_vars_df[n_rich_vars_df['frequency'] >= 2].index)
+    n_rich_vars = list(n_rich_vars_df[n_rich_vars_df['frequency'] >= proportion_of_samples].index)
 
     maf_df["cohort_n_rich"] = maf_df["MUT_ID"].isin(n_rich_vars)
 
@@ -132,7 +100,7 @@
     maf_df['frequency'] = maf_df['frequency'].fillna(0)
     maf_df['VAF_Ns_threshold'] = maf_df['VAF_Ns_threshold'].fillna(1.1)
 
-    maf_df["cohort_n_rich_threshold"] = maf_df["VAF_Ns"] > maf_df['VAF_Ns_threshold']
+    maf_df["cohort_n_rich_threshold"] = maf_df["VAF_Ns"] >= maf_df['VAF_Ns_threshold']
 
     maf_df["FILTER"] = maf_df[["FILTER","cohort_n_rich_threshold"]].apply(lambda x: add_filter(x["FILTER"], x["cohort_n_rich_threshold"], "cohort_n_rich_threshold"),
                                                                         axis = 1
diff --git a/conf/modules.config b/conf/modules.config
index fc388046..e444d07b 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -316,7 +316,7 @@ process {
                                 ['"VAF_AM" : "le ', "${params.germline_threshold}", '"'].join('').trim(),
                                 ['"vd_VAF" : "le ', "${params.germline_threshold}", '"'].join('').trim(),
                                 ['"DEPTH" : "ge ', "${params.mutation_depth_threshold}", '"'].join('').trim(),
-                                '"FILTER" : ["notcontains NM20", "notcontains n_rich", "notcontains cohort_n_rich_threshold", "notcontains no_pileup_support", "notcontains low_mappability" ]',
+                                '"FILTER" : ["notcontains NM20", "notcontains p8", "notcontains n_rich", "notcontains cohort_n_rich_threshold", "notcontains cohort_n_rich", "notcontains no_pileup_support", "notcontains low_mappability" ]',
                                 '"VAF_distorted_expanded_sq" : false',
                                 ].join(',\t').trim()
                             }
@@ -336,7 +336,7 @@ process {
             ext.filters     = { [['"VAF" : "le ', "${params.germline_threshold}", '"'].join('').trim(),
                                 ['"vd_VAF" : "le ', "${params.germline_threshold}", '"'].join('').trim(),
                                 ['"DEPTH" : "ge ', "${params.mutation_depth_threshold}", '"'].join('').trim(),
-                                '"FILTER" : ["notcontains NM20", "notcontains n_rich", "notcontains cohort_n_rich_threshold", "notcontains no_pileup_support",  "notcontains low_mappability" ]',
+                                '"FILTER" : ["notcontains NM20", "notcontains p8", "notcontains n_rich", "notcontains cohort_n_rich_threshold", "notcontains cohort_n_rich", "notcontains no_pileup_support", "notcontains low_mappability" ]',
                                 '"VAF_distorted_expanded_sq" : false',
                                 ].join(',\t').trim()
                             }
diff --git a/modules/local/filtermaf/main.nf b/modules/local/filtermaf/main.nf
index 5b7a2f35..d91f6fa6 100644
--- a/modules/local/filtermaf/main.nf
+++ b/modules/local/filtermaf/main.nf
@@ -20,12 +20,12 @@ process FILTER_BATCH {
     task.ext.when == null || task.ext.when
 
     script:
-    def args = task.ext.args ?: ""
     def prefix = task.ext.prefix ?: "${meta.id}"
     def repetitive_variant = task.ext.repetitive_variant ?: "${params.repetitive_variant_thres}"
     def germline_threshold = task.ext.germline_threshold ?: "${params.germline_threshold}"
+    def proportion_samples_nrich = task.ext.prop_samples_nrich ?: "${params.prop_samples_nrich}"
     """
-    filter_cohort.py ${maf} ${prefix} ${repetitive_variant} ${germline_threshold}
+    filter_cohort.py ${maf} ${prefix} ${repetitive_variant} ${germline_threshold} ${proportion_samples_nrich}
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
@@ -34,8 +34,6 @@ process FILTER_BATCH {
     """
 
     stub:
-    def args = task.ext.args ?: ''
-    def prefix = task.ext.prefix ?: "${meta.id}"
     """
     touch all_samples.cohort.filtered.tsv.gz
 
diff --git a/nextflow.config b/nextflow.config
index 4566de2f..44927fa3 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -21,6 +21,9 @@ params {
     custom_groups_file          = null
     custom_groups_separator     = 'tab'
 
+    customize_annotation        = false
+    custom_annotation_tsv       = ''
+
     use_custom_bedfile          = false
     custom_bedfile              = null
     use_custom_minimum_depth    = 0
@@ -76,6 +79,7 @@ params {
     mutation_depth_threshold   = 40
 
     repetitive_variant_thres   = 5
+    prop_samples_nrich          = 0.1
 
     cosmic_ref_signatures      = "COSMIC_v3.4_SBS_GRCh38.txt"
     wgs_trinuc_counts          = "assets/trinucleotide_counts/trinuc_counts.homo_sapiens.tsv"
diff --git a/nextflow_schema.json b/nextflow_schema.json
index f64d2643..9fca6621 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -188,6 +188,30 @@
                 }
             }
         },
+        "custom_annotation_configs": {
+            "title": "Custom annotation configurations",
+            "type": "object",
+            "fa_icon": "fas fa-dna",
+            "description": "BED-like file to customize the annotation of variants falling in certain regions.",
+            "properties": {
+                "customize_annotation": {
+                    "type": "boolean",
+                    "description": "Do you want to use a custom BED file?",
+                    "fa_icon": "fas fa-book",
+                    "help_text": "Do you want to use a custom BED file?"
+                },
+                "custom_annotation_tsv": {
+                    "type": "string",
+                    "format": "file-path",
+                    "exists": true,
+                    "mimetype": "text/plain",
+                    "default": null,
+                    "description": "TSV file with the following columns: chromosome  start   end gene_name    impactful_mutations neutral_impact new_impact",
+                    "help_text": "TSV file with the following columns: chromosome  start   end gene_name    impactful_mutations neutral_impact new_impact",
+                    "fa_icon": "far fa-file-code"
+                }
+            }
+        },
         "hotspot_bedfile_configs": {
             "title": "Definition of hotspot regions configuration",
             "type": "object",
@@ -425,6 +449,13 @@
                     "default": 5,
                     "fa_icon": "fas fa-book",
                     "help_text": ""
+                },
+                "prop_samples_nrich": {
+                    "type": "number",
+                    "description": "Any variant that is n_rich in this proportion of samples is flagged as cohort_n_rich variant.",
+                    "default": 0.1,
+                    "fa_icon": "fas fa-book",
+                    "help_text": ""
                 }
             }
         },
@@ -545,6 +576,13 @@
                     "help_text": "",
                     "default": "--no_stats --cache --offline --symbol --protein --canonical --af_gnomadg --af_gnomade"
                 },
+                "vep_params_panel": {
+                    "type": "string",
+                    "description": "Additional parameters for running Ensembl VEP for the panel annotation.",
+                    "fa_icon": "fas fa-book",
+                    "help_text": "",
+                    "default": "--no_stats --cache --offline --symbol --protein --canonical"
+                },
                 "download_cache": {
                     "type": "boolean",
                     "description": "Do you want to download the cache? Activate it only if necessary.",

From d47f4de4aaef9130eb90aa37135f336faa517705 Mon Sep 17 00:00:00 2001
From: FerriolCalvet <ferriolcalvet@gmail.com>
Date: Wed, 5 Feb 2025 15:23:54 +0100
Subject: [PATCH 2/3] upd implementation of cohort_n_rich - define number of
 samples at at least 2

---
 bin/filter_cohort.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/bin/filter_cohort.py b/bin/filter_cohort.py
index 6a6ddeef..a9717c8f 100755
--- a/bin/filter_cohort.py
+++ b/bin/filter_cohort.py
@@ -62,7 +62,8 @@
     print("Not enough samples to identify cohort_n_rich mutations!")
 
 else:
-    proportion_of_samples = (max_samples * n_rich_cohort_proportion) // 1
+    number_of_samples = max(2, (max_samples * n_rich_cohort_proportion) // 1)
+    print(f"flagging mutations that are n_rich in at least: {number_of_samples} samples as cohort_n_rich")
 
     # work with already filtered df + somatic only to explore potential artifacts
     # take only variant and sample info from the df
@@ -73,7 +74,7 @@
                                         ].agg({'SAMPLE_ID' : len, 'VAF_Ns' : min})
     n_rich_vars_df = n_rich_vars_df.rename({'SAMPLE_ID' : 'frequency', 'VAF_Ns' : 'VAF_Ns_threshold'}, axis = 'columns')
 
-    n_rich_vars = list(n_rich_vars_df[n_rich_vars_df['frequency'] >= proportion_of_samples].index)
+    n_rich_vars = list(n_rich_vars_df[n_rich_vars_df['frequency'] >= number_of_samples].index)
 
     maf_df["cohort_n_rich"] = maf_df["MUT_ID"].isin(n_rich_vars)
 

From 3934583a889897037a15118fbb6bb77ab301d258 Mon Sep 17 00:00:00 2001
From: FerriolCalvet <ferriolcalvet@gmail.com>
Date: Fri, 7 Feb 2025 11:04:04 +0100
Subject: [PATCH 3/3] apply not_covered filter to somatic mutations

---
 conf/modules.config | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index e444d07b..e76c7707 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -316,7 +316,7 @@ process {
                                 ['"VAF_AM" : "le ', "${params.germline_threshold}", '"'].join('').trim(),
                                 ['"vd_VAF" : "le ', "${params.germline_threshold}", '"'].join('').trim(),
                                 ['"DEPTH" : "ge ', "${params.mutation_depth_threshold}", '"'].join('').trim(),
-                                '"FILTER" : ["notcontains NM20", "notcontains p8", "notcontains n_rich", "notcontains cohort_n_rich_threshold", "notcontains cohort_n_rich", "notcontains no_pileup_support", "notcontains low_mappability" ]',
+                                '"FILTER" : ["notcontains NM20", "notcontains p8", "notcontains n_rich", "notcontains cohort_n_rich_threshold", "notcontains cohort_n_rich", "notcontains no_pileup_support", "notcontains low_mappability", "notcontains not_covered" ]',
                                 '"VAF_distorted_expanded_sq" : false',
                                 ].join(',\t').trim()
                             }
@@ -336,7 +336,7 @@ process {
             ext.filters     = { [['"VAF" : "le ', "${params.germline_threshold}", '"'].join('').trim(),
                                 ['"vd_VAF" : "le ', "${params.germline_threshold}", '"'].join('').trim(),
                                 ['"DEPTH" : "ge ', "${params.mutation_depth_threshold}", '"'].join('').trim(),
-                                '"FILTER" : ["notcontains NM20", "notcontains p8", "notcontains n_rich", "notcontains cohort_n_rich_threshold", "notcontains cohort_n_rich", "notcontains no_pileup_support", "notcontains low_mappability" ]',
+                                '"FILTER" : ["notcontains NM20", "notcontains p8", "notcontains n_rich", "notcontains cohort_n_rich_threshold", "notcontains cohort_n_rich", "notcontains no_pileup_support", "notcontains low_mappability", "notcontains not_covered" ]',
                                 '"VAF_distorted_expanded_sq" : false',
                                 ].join(',\t').trim()
                             }