From 8b772625ea114f1568c0a12cd9c8590a08abd130 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ferran=20Mui=C3=B1os?=
 <70538316+koszulordie@users.noreply.github.com>
Date: Tue, 18 Mar 2025 17:13:58 +0100
Subject: [PATCH 1/7] update mutated genomes from VAF module

mutgenomes_driver_priority.py command-line has a new option --recoded-genes that accepts comma-separated gene names for which all consequence types will be recoded to "missense"
---
 modules/local/mutated_genomes_from_vaf/main.nf | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/modules/local/mutated_genomes_from_vaf/main.nf b/modules/local/mutated_genomes_from_vaf/main.nf
index 4971942a..71056e44 100644
--- a/modules/local/mutated_genomes_from_vaf/main.nf
+++ b/modules/local/mutated_genomes_from_vaf/main.nf
@@ -21,7 +21,8 @@ process MUTATED_GENOMES_FROM_VAF {
     mutgenomes_driver_priority.py \\
                 --sample ${prefix} \\
                 --somatic-mutations-file ${mutations} \\
-                --omega-file ${omegas} ;
+                --omega-file ${omegas} \\
+                --recoded-genes TERT ;
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":

From e6e5da1a7a6a3ed8423c1c8508ff9806449edb92 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ferran=20Mui=C3=B1os?=
 <70538316+koszulordie@users.noreply.github.com>
Date: Tue, 18 Mar 2025 17:28:08 +0100
Subject: [PATCH 2/7] update mutgenomes_driver_priority.py to allow csqn type
 recoding

---
 bin/mutgenomes_driver_priority.py | 186 +++++++++++++-----------------
 1 file changed, 80 insertions(+), 106 deletions(-)

diff --git a/bin/mutgenomes_driver_priority.py b/bin/mutgenomes_driver_priority.py
index 4dded253..d5cddae9 100755
--- a/bin/mutgenomes_driver_priority.py
+++ b/bin/mutgenomes_driver_priority.py
@@ -12,6 +12,7 @@
 from utils import inclusion_exclusion
 from utils_impacts import broadimpact_grouping_dict
 
+
 def compute_prior(alt_read_count, depth):
 
     mean_p = (1 + alt_read_count.sum()) / depth.sum()
@@ -72,21 +73,34 @@ def compute_covered_genomes(alt_read_count, depth):
     return res
 
 
-def snv_am(sample, somatic_mutations_file, omega_file, chosen_impacts=['missense', 'truncating']):
+def recode_csqn_type(df, gene, new_csqn_type):
+    
+    df['canonical_Consequence_broader'] = df.apply(lambda s: \
+        new_csqn_type if s['canonical_SYMBOL'] == gene \
+        else s['canonical_Consequence_broader'], 
+        axis=1)
+    return df
+    
+
+def snv_am(sample, somatic_mutations_file, omega_file, recoded_genes=[]):
 
     # parse mutations
     somatic_mutations = pd.read_csv(somatic_mutations_file, sep='\t', low_memory=False)
-    consequences_of_interest = [ broadimpact_grouping_dict[csqn] for csqn in chosen_impacts]
-    consequences_of_interest = set([item for sublist in consequences_of_interest for item in sublist])
-
+    
+    # recode csqn type
+    for gene in recoded_genes:
+        somatic_mutations = recode_csqn_type(somatic_mutations, gene, 'missense')
+        print(gene, somatic_mutations[somatic_mutations['canonical_SYMBOL'] == gene]['canonical_Consequence_broader'].unique())
+    
     mutations = somatic_mutations[(somatic_mutations['TYPE'] == 'SNV')
-                                    & (somatic_mutations['Consequence_broader'].isin(consequences_of_interest))
+                                    & (somatic_mutations['canonical_Consequence_broader'].isin(['missense', 'nonsense']))
                                     ].reset_index(drop = True)
+    mutations['VAF_AM'] = mutations.apply(lambda r: r['ALT_DEPTH_AM'] / r['DEPTH_AM'], axis=1)
 
-    gene_chr_dict = {x: y  for x, y in somatic_mutations[['GENE', 'CHROM']].drop_duplicates().values}
+    gene_chr_dict = {x: y  for x, y in somatic_mutations[['canonical_SYMBOL', 'CHROM']].drop_duplicates().values}
 
     # parse dN/dS excess
-    excess_dict = compute_excess(omega_file, chosen_impacts_omega = chosen_impacts)
+    excess_dict = compute_excess(omega_file)
     genes_with_omega = list(excess_dict.keys())
 
     # select mutations
@@ -97,53 +111,32 @@ def snv_am(sample, somatic_mutations_file, omega_file, chosen_impacts=['missense
     for suffix in ['TOTAL', 'UPPER', 'MEAN', 'LOWER']:
         res_dict[f'GENOMES_SNV_AM_{suffix}'] = []
 
-    genes_with_mutations_n_omega = sorted(set(genes_with_omega).intersection(set(mutations['GENE'].unique())))
+    genes_with_mutations_n_omega = sorted(set(genes_with_omega).intersection(set(mutations['canonical_SYMBOL'].unique())))
 
     for gene in genes_with_mutations_n_omega:
 
-        for csqn in chosen_impacts:
+        for csqn in ['missense', 'nonsense']:
 
-            # total
+            df_dict = {}
 
-            df_all = mutations[(mutations['GENE'] == gene) & (mutations['Consequence_broader'].isin(broadimpact_grouping_dict[csqn]))]
+            df_all = mutations[(mutations['canonical_SYMBOL'] == gene) & (mutations['canonical_Consequence_broader'] == csqn)]
             df_all = df_all.sort_values(by=['VAF_AM'], ascending=False)
             df_all.reset_index(drop=True, inplace=True)
+            df_dict['TOTAL'] = df_all
 
-            # upper
-
-            ind = int(excess_dict[gene][csqn][2] * df_all.shape[0])
-            try:
-                assert(ind >= 1)
-                df_upper = df_all.loc[:ind - 1].copy()
-            except:
-                df_upper = pd.DataFrame(columns=df_all.columns)
-
-            # mean
-
-            ind = int(excess_dict[gene][csqn][1] * df_all.shape[0])
-            try:
-                assert(ind >= 1)
-                df_mean = df_all.loc[:ind - 1].copy()
-            except:
-                df_mean = pd.DataFrame(columns=df_all.columns)
-
-            # lower
-
-            ind = int(excess_dict[gene][csqn][0] * df_all.shape[0])
-            try:
-                assert(ind >= 1)
-                df_lower = df_all.loc[:ind - 1].copy()
-            except:
-                df_lower = pd.DataFrame(columns=df_all.columns)
+            for i, bound in enumerate(['LOWER', 'MEAN', 'UPPER']):
+                ind = int(excess_dict[gene][csqn][i] * df_all.shape[0])
+                try:
+                    assert(ind >= 1)
+                    dg = df_all.loc[:ind - 1].copy()
+                except:
+                    dg = pd.DataFrame(columns=df_all.columns)
+                df_dict[bound] = dg
 
             res_dict['gene'] = res_dict['gene'] + [gene]
             res_dict['impact'] = res_dict['impact'] + [csqn]
             res_dict['chr'] = res_dict['chr'] + [gene_chr_dict[gene]]
 
-            # summarize in a dictionary
-
-            df_dict = {'TOTAL': df_all, 'UPPER': df_upper, 'MEAN': df_mean, 'LOWER': df_lower}
-
             for suffix in ['TOTAL', 'UPPER', 'MEAN', 'LOWER']:
 
                 if df_dict[suffix].shape[0] == 0:
@@ -156,17 +149,17 @@ def snv_am(sample, somatic_mutations_file, omega_file, chosen_impacts=['missense
                     res_dict[f'GENOMES_SNV_AM_{suffix}'] = res_dict[f'GENOMES_SNV_AM_{suffix}'] + [res['GENOMES']]
 
     df = pd.DataFrame(res_dict)
-    all_columnss = list(df.columns)
     df['SAMPLE'] = sample
-    return df[['SAMPLE'] + all_columnss]
-
+    return df
+    
 
 def indel_am(sample, somatic_mutations_file):
 
     # parse mutations
     somatic_mutations = pd.read_csv(somatic_mutations_file, sep='\t', low_memory=False)
     mutations = somatic_mutations[(somatic_mutations['TYPE'].isin(['INSERTION', 'DELETION']))].reset_index(drop = True)
-    gene_chr_dict = {x: y  for x, y in somatic_mutations[['GENE', 'CHROM']].drop_duplicates().values}
+    mutations['VAF_AM'] = mutations.apply(lambda r: r['ALT_DEPTH_AM'] / r['DEPTH_AM'], axis=1)
+    gene_chr_dict = {x: y  for x, y in somatic_mutations[['canonical_SYMBOL', 'CHROM']].drop_duplicates().values}
 
     # select mutations
 
@@ -175,8 +168,8 @@ def indel_am(sample, somatic_mutations_file):
     res_dict['chr'] = []
     res_dict['impact'] = []
     res_dict['GENOMES_INDEL_AM_TOTAL'] = []
-
-    for gene in mutations['GENE'].unique():
+        
+    for gene in mutations['canonical_SYMBOL'].unique():
 
         res_dict['gene'] = res_dict['gene'] + [gene]
         res_dict['chr'] = res_dict['chr'] + [gene_chr_dict[gene]]
@@ -184,7 +177,7 @@ def indel_am(sample, somatic_mutations_file):
 
         # total
 
-        df_all = mutations[(mutations['GENE'] == gene)]
+        df_all = mutations[(mutations['canonical_SYMBOL'] == gene)]
         df_all = df_all.sort_values(by=['VAF_AM'], ascending=False)
         df_all.reset_index(drop=True, inplace=True)
 
@@ -193,40 +186,39 @@ def indel_am(sample, somatic_mutations_file):
         df_dict = {'TOTAL': df_all}
 
         if df_dict['TOTAL'].shape[0] == 0:
-
             res_dict[f'GENOMES_INDEL_AM_TOTAL'] = res_dict[f'GENOMES_INDEL_AM_TOTAL'] + [0.]
-
         else:
-
             depth = df_dict['TOTAL']['DEPTH_AM'].astype(np.float32).values
             alt_read_count = df_dict['TOTAL']['ALT_DEPTH_AM'].astype(np.float32).values
             res = compute_covered_genomes(alt_read_count, depth)
             res_dict[f'GENOMES_INDEL_AM_TOTAL'] = res_dict[f'GENOMES_INDEL_AM_TOTAL'] + [res['GENOMES']]
 
     df = pd.DataFrame(res_dict)
-    all_columnss = list(df.columns)
     df['SAMPLE'] = sample
-    return df[['SAMPLE'] + all_columnss]
+    return df
+    
 
-
-def snv_nd(sample, somatic_mutations_file, omega_file, chosen_impacts=['missense', 'truncating']):
+def snv_nd(sample, somatic_mutations_file, omega_file, recoded_genes=[]):
 
     # parse mutations
     somatic_mutations = pd.read_csv(somatic_mutations_file, sep='\t', low_memory=False)
-    consequences_of_interest = [ broadimpact_grouping_dict[csqn] for csqn in chosen_impacts]
-    consequences_of_interest = set([item for sublist in consequences_of_interest for item in sublist])
+
+    # recode csqn type
+    for gene in recoded_genes:
+        somatic_mutations = recode_csqn_type(somatic_mutations, gene, 'missense')
 
     mutations = somatic_mutations[
         (somatic_mutations['TYPE'] == 'SNV')
-        & (somatic_mutations['Consequence_broader'].isin(consequences_of_interest))
+        & (somatic_mutations['canonical_Consequence_broader'].isin(['missense', 'nonsense']))
         ].reset_index(drop = True)
+    mutations['VAF_ND'] = mutations.apply(lambda r: r['ALT_DEPTH_ND'] / r['DEPTH_ND'], axis=1)
 
-    gene_chr_dict = { x : y  for x, y in somatic_mutations[['GENE', 'CHROM']].drop_duplicates().values }
+    gene_chr_dict = { x : y  for x, y in somatic_mutations[['canonical_SYMBOL', 'CHROM']].drop_duplicates().values }
 
     mutations = mutations[mutations['ALT_DEPTH_ND'] > 0]
 
     # parse dN/dS excess
-    excess_dict = compute_excess(omega_file, chosen_impacts_omega = chosen_impacts)
+    excess_dict = compute_excess(omega_file)
     genes_with_omega = list(excess_dict.keys())
 
     # select mutations
@@ -238,53 +230,32 @@ def snv_nd(sample, somatic_mutations_file, omega_file, chosen_impacts=['missense
     for suffix in ['TOTAL', 'UPPER', 'MEAN', 'LOWER']:
         res_dict[f'GENOMES_SNV_ND_{suffix}'] = []
 
-    genes_with_mutations_n_omega = sorted(set(genes_with_omega).intersection(set(mutations['GENE'].unique())))
-
+    genes_with_mutations_n_omega = sorted(set(genes_with_omega).intersection(set(mutations['canonical_SYMBOL'].unique())))
+    
     for gene in genes_with_mutations_n_omega:
 
-        for csqn in chosen_impacts:
+        for csqn in ['missense', 'nonsense']:
 
-            # total
+            df_dict = {}
 
-            df_all = mutations[(mutations['GENE'] == gene) & (mutations['Consequence_broader'].isin(broadimpact_grouping_dict[csqn]))]
+            df_all = mutations[(mutations['canonical_SYMBOL'] == gene) & (mutations['canonical_Consequence_broader'] == csqn)]
             df_all = df_all.sort_values(by=['VAF_ND'], ascending=False)
             df_all.reset_index(drop=True, inplace=True)
+            df_dict['TOTAL'] = df_all
 
-            # upper
-
-            ind = int(excess_dict[gene][csqn][2] * df_all.shape[0])
-            try:
-                assert(ind >= 1)
-                df_upper = df_all.loc[:ind - 1].copy()
-            except:
-                df_upper = pd.DataFrame(columns=df_all.columns)
-
-            # mean
-
-            ind = int(excess_dict[gene][csqn][1] * df_all.shape[0])
-            try:
-                assert(ind >= 1)
-                df_mean = df_all.loc[:ind - 1].copy()
-            except:
-                df_mean = pd.DataFrame(columns=df_all.columns)
-
-            # lower
-
-            ind = int(excess_dict[gene][csqn][0] * df_all.shape[0])
-            try:
-                assert(ind >= 1)
-                df_lower = df_all.loc[:ind - 1].copy()
-            except:
-                df_lower = pd.DataFrame(columns=df_all.columns)
+            for i, bound in enumerate(['LOWER', 'MEAN', 'UPPER']):
+                ind = int(excess_dict[gene][csqn][i] * df_all.shape[0])
+                try:
+                    assert(ind >= 1)
+                    dg = df_all.loc[:ind - 1].copy()
+                except:
+                    dg = pd.DataFrame(columns=df_all.columns)
+                df_dict[bound] = dg
 
             res_dict['gene'] = res_dict['gene'] + [gene]
             res_dict['impact'] = res_dict['impact'] + [csqn]
             res_dict['chr'] = res_dict['chr'] + [gene_chr_dict[gene]]
 
-            # summarize in a dictionary
-
-            df_dict = {'TOTAL': df_all, 'UPPER': df_upper, 'MEAN': df_mean, 'LOWER': df_lower}
-
             for suffix in ['TOTAL', 'UPPER', 'MEAN', 'LOWER']:
 
                 if df_dict[suffix].shape[0] == 0:
@@ -299,28 +270,31 @@ def snv_nd(sample, somatic_mutations_file, omega_file, chosen_impacts=['missense
                     res_dict[f'GENOMES_SNV_ND_{suffix}'] = res_dict[f'GENOMES_SNV_ND_{suffix}'] + [res['GENOMES']]
 
     df = pd.DataFrame(res_dict)
-    all_columnss = list(df.columns)
     df['SAMPLE'] = sample
-    return df[['SAMPLE'] + all_columnss]
-
+    return df
+    
 
 @click.command()
 @click.option('--sample')
 @click.option('--somatic-mutations-file')
 @click.option('--omega-file')
-def run_all(sample, somatic_mutations_file, omega_file):
+@click.option('--recoded-genes', default='', type=str)
+def run_all(sample, somatic_mutations_file, omega_file, recoded_genes):
     """
-    intent:
-    script that combines the 3 covered epithelium functions,
-    merges the output in a standardized way, so that we just
-    get one output in the mutated_genomes_from_vaf/main.nf
+    intent: 
+    script that combines the 3 covered epithelium functions, 
+    merges the output in a standardized way, so that we just 
+    get one output in the mutated_genomes_from_vaf/main.nf 
     nextflow module
-
+    
     """
 
-    df_snv_am = snv_am(sample, somatic_mutations_file, omega_file)
+    # recodification of csqn type is only applied to SNVs
+    recoded_genes = recoded_genes.split(',')
+
+    df_snv_am = snv_am(sample, somatic_mutations_file, omega_file, recoded_genes=recoded_genes)
     df_indel_am = indel_am(sample, somatic_mutations_file)
-    df_snv_nd = snv_nd(sample, somatic_mutations_file, omega_file)
+    df_snv_nd = snv_nd(sample, somatic_mutations_file, omega_file, recoded_genes=recoded_genes)
 
     all_dataframes = [df_snv_am, df_indel_am, df_snv_nd]
     df_merged = reduce(lambda  left, right: pd.merge(left, right, on=['chr', 'gene', 'impact', 'SAMPLE'], how='outer'), all_dataframes)

From e9297b3ca9ddce522dee7d13036c9056ff555051 Mon Sep 17 00:00:00 2001
From: FerriolCalvet <ferriolcalvet@gmail.com>
Date: Wed, 19 Mar 2025 20:38:05 +0100
Subject: [PATCH 3/7] add nextflow handling of recoding genes

- not tested
---
 modules/local/mutated_genomes_from_vaf/main.nf | 3 ++-
 nextflow.config                                | 1 +
 nextflow_schema.json                           | 7 ++++++-
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/modules/local/mutated_genomes_from_vaf/main.nf b/modules/local/mutated_genomes_from_vaf/main.nf
index 71056e44..703a1b15 100644
--- a/modules/local/mutated_genomes_from_vaf/main.nf
+++ b/modules/local/mutated_genomes_from_vaf/main.nf
@@ -17,12 +17,13 @@ process MUTATED_GENOMES_FROM_VAF {
 
     script:
     def prefix = task.ext.prefix ?: "${meta.id}"
+    def recode = task.ext.recode_list ? "--recoded-genes ${task.ext.recode_list}" : ""
     """
     mutgenomes_driver_priority.py \\
                 --sample ${prefix} \\
                 --somatic-mutations-file ${mutations} \\
                 --omega-file ${omegas} \\
-                --recoded-genes TERT ;
+                ${recode} ;
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
diff --git a/nextflow.config b/nextflow.config
index 397867d0..0ed82967 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -53,6 +53,7 @@ params {
     signatures                  = false
     mutationrate                = false
     mutated_cells_vaf           = false
+    mutepi_genes_to_recode      = null
     expected_mutated_cells      = false
     dnds                        = false
 
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 689ea911..59d2451c 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -319,9 +319,14 @@
                 },
                 "mutated_cells_vaf": {
                     "type": "boolean",
-                    "description": "Do you want to run mutated_cells_reads using the VAFs?",
+                    "description": "Do you want to compute the proportion of mutated cells using the VAFs?",
                     "fa_icon": "fas fa-book"
                 },
+                "mutepi_genes_to_recode": {
+                    "type": "string",
+                    "description": "Do you want to consider all mutations in a gene for the mutated epithelium from VAF?",
+                    "fa_icon": "fas fa-book"
+                },                
                 "expected_mutated_cells": {
                     "type": "boolean",
                     "description": "Do you want to run expected_mutated_cells?",

From e6eec4972b7f6a5c6d3cfb0b29691a228bcbea24 Mon Sep 17 00:00:00 2001
From: FerriolCalvet <ferriolcalvet@gmail.com>
Date: Wed, 19 Mar 2025 20:43:44 +0100
Subject: [PATCH 4/7] fix missing param definition in modules.config

---
 conf/modules.config | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/conf/modules.config b/conf/modules.config
index f44daad1..df29d849 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -496,6 +496,9 @@ process {
                 publishDir       = [
                         enabled : false
                 ]
+            }            
+            withName: 'MUTATEDGENOMESFROMVAFAM' {
+                ext.recode     = "${params.mutepi_genes_to_recode}"// "TERTpromoter"
             }
         }
 

From 1467e497b73bdeb5980ffcd2972d37d8177a1884 Mon Sep 17 00:00:00 2001
From: FerriolCalvet <ferriolcalvet@gmail.com>
Date: Thu, 20 Mar 2025 10:17:02 +0100
Subject: [PATCH 5/7] update driver priority code

---
 bin/mutgenomes_driver_priority.py | 93 ++++++++++++++++---------------
 1 file changed, 49 insertions(+), 44 deletions(-)

diff --git a/bin/mutgenomes_driver_priority.py b/bin/mutgenomes_driver_priority.py
index d5cddae9..4b726d8e 100755
--- a/bin/mutgenomes_driver_priority.py
+++ b/bin/mutgenomes_driver_priority.py
@@ -75,32 +75,31 @@ def compute_covered_genomes(alt_read_count, depth):
 
 def recode_csqn_type(df, gene, new_csqn_type):
     
-    df['canonical_Consequence_broader'] = df.apply(lambda s: \
-        new_csqn_type if s['canonical_SYMBOL'] == gene \
-        else s['canonical_Consequence_broader'], 
+    df['Consequence_broader'] = df.apply(lambda s: \
+        new_csqn_type if s['GENE'] == gene \
+        else s['Consequence_broader'], 
         axis=1)
     return df
-    
-
-def snv_am(sample, somatic_mutations_file, omega_file, recoded_genes=[]):
+def snv_am(sample, somatic_mutations_file, omega_file, chosen_impacts=['missense', 'truncating'], recoded_genes=[]):
 
     # parse mutations
     somatic_mutations = pd.read_csv(somatic_mutations_file, sep='\t', low_memory=False)
-    
+    consequences_of_interest = [ broadimpact_grouping_dict[csqn] for csqn in chosen_impacts]
+    consequences_of_interest = set([item for sublist in consequences_of_interest for item in sublist])
+
     # recode csqn type
     for gene in recoded_genes:
         somatic_mutations = recode_csqn_type(somatic_mutations, gene, 'missense')
-        print(gene, somatic_mutations[somatic_mutations['canonical_SYMBOL'] == gene]['canonical_Consequence_broader'].unique())
+        print(gene, somatic_mutations[somatic_mutations['GENE'] == gene]['Consequence_broader'].unique())
     
     mutations = somatic_mutations[(somatic_mutations['TYPE'] == 'SNV')
-                                    & (somatic_mutations['canonical_Consequence_broader'].isin(['missense', 'nonsense']))
+                                    & (somatic_mutations['Consequence_broader'].isin(consequences_of_interest))
                                     ].reset_index(drop = True)
-    mutations['VAF_AM'] = mutations.apply(lambda r: r['ALT_DEPTH_AM'] / r['DEPTH_AM'], axis=1)
 
-    gene_chr_dict = {x: y  for x, y in somatic_mutations[['canonical_SYMBOL', 'CHROM']].drop_duplicates().values}
+    gene_chr_dict = {x: y  for x, y in somatic_mutations[['GENE', 'CHROM']].drop_duplicates().values}
 
     # parse dN/dS excess
-    excess_dict = compute_excess(omega_file)
+    excess_dict = compute_excess(omega_file, chosen_impacts_omega = chosen_impacts)
     genes_with_omega = list(excess_dict.keys())
 
     # select mutations
@@ -111,11 +110,11 @@ def snv_am(sample, somatic_mutations_file, omega_file, recoded_genes=[]):
     for suffix in ['TOTAL', 'UPPER', 'MEAN', 'LOWER']:
         res_dict[f'GENOMES_SNV_AM_{suffix}'] = []
 
-    genes_with_mutations_n_omega = sorted(set(genes_with_omega).intersection(set(mutations['canonical_SYMBOL'].unique())))
+    genes_with_mutations_n_omega = sorted(set(genes_with_omega).intersection(set(mutations['GENE'].unique())))
 
     for gene in genes_with_mutations_n_omega:
 
-        for csqn in ['missense', 'nonsense']:
+        for csqn in chosen_impacts:
 
             df_dict = {}
 
@@ -149,17 +148,17 @@ def snv_am(sample, somatic_mutations_file, omega_file, recoded_genes=[]):
                     res_dict[f'GENOMES_SNV_AM_{suffix}'] = res_dict[f'GENOMES_SNV_AM_{suffix}'] + [res['GENOMES']]
 
     df = pd.DataFrame(res_dict)
+    all_columnss = list(df.columns)
     df['SAMPLE'] = sample
-    return df
-    
+    return df[['SAMPLE'] + all_columnss]
+
 
 def indel_am(sample, somatic_mutations_file):
 
     # parse mutations
     somatic_mutations = pd.read_csv(somatic_mutations_file, sep='\t', low_memory=False)
     mutations = somatic_mutations[(somatic_mutations['TYPE'].isin(['INSERTION', 'DELETION']))].reset_index(drop = True)
-    mutations['VAF_AM'] = mutations.apply(lambda r: r['ALT_DEPTH_AM'] / r['DEPTH_AM'], axis=1)
-    gene_chr_dict = {x: y  for x, y in somatic_mutations[['canonical_SYMBOL', 'CHROM']].drop_duplicates().values}
+    gene_chr_dict = {x: y  for x, y in somatic_mutations[['GENE', 'CHROM']].drop_duplicates().values}
 
     # select mutations
 
@@ -168,8 +167,8 @@ def indel_am(sample, somatic_mutations_file):
     res_dict['chr'] = []
     res_dict['impact'] = []
     res_dict['GENOMES_INDEL_AM_TOTAL'] = []
-        
-    for gene in mutations['canonical_SYMBOL'].unique():
+
+    for gene in mutations['GENE'].unique():
 
         res_dict['gene'] = res_dict['gene'] + [gene]
         res_dict['chr'] = res_dict['chr'] + [gene_chr_dict[gene]]
@@ -177,7 +176,7 @@ def indel_am(sample, somatic_mutations_file):
 
         # total
 
-        df_all = mutations[(mutations['canonical_SYMBOL'] == gene)]
+        df_all = mutations[(mutations['GENE'] == gene)]
         df_all = df_all.sort_values(by=['VAF_AM'], ascending=False)
         df_all.reset_index(drop=True, inplace=True)
 
@@ -186,39 +185,45 @@ def indel_am(sample, somatic_mutations_file):
         df_dict = {'TOTAL': df_all}
 
         if df_dict['TOTAL'].shape[0] == 0:
+
             res_dict[f'GENOMES_INDEL_AM_TOTAL'] = res_dict[f'GENOMES_INDEL_AM_TOTAL'] + [0.]
+
         else:
+
             depth = df_dict['TOTAL']['DEPTH_AM'].astype(np.float32).values
             alt_read_count = df_dict['TOTAL']['ALT_DEPTH_AM'].astype(np.float32).values
             res = compute_covered_genomes(alt_read_count, depth)
             res_dict[f'GENOMES_INDEL_AM_TOTAL'] = res_dict[f'GENOMES_INDEL_AM_TOTAL'] + [res['GENOMES']]
 
     df = pd.DataFrame(res_dict)
+    all_columnss = list(df.columns)
     df['SAMPLE'] = sample
-    return df
-    
+    return df[['SAMPLE'] + all_columnss]
+
 
-def snv_nd(sample, somatic_mutations_file, omega_file, recoded_genes=[]):
+def snv_nd(sample, somatic_mutations_file, omega_file, chosen_impacts=['missense', 'truncating'], recoded_genes=[]):
 
     # parse mutations
     somatic_mutations = pd.read_csv(somatic_mutations_file, sep='\t', low_memory=False)
+    consequences_of_interest = [ broadimpact_grouping_dict[csqn] for csqn in chosen_impacts]
+    consequences_of_interest = set([item for sublist in consequences_of_interest for item in sublist])
 
     # recode csqn type
     for gene in recoded_genes:
         somatic_mutations = recode_csqn_type(somatic_mutations, gene, 'missense')
-
+        print(gene, somatic_mutations[somatic_mutations['GENE'] == gene]['Consequence_broader'].unique())
+    
     mutations = somatic_mutations[
         (somatic_mutations['TYPE'] == 'SNV')
-        & (somatic_mutations['canonical_Consequence_broader'].isin(['missense', 'nonsense']))
+        & (somatic_mutations['Consequence_broader'].isin(consequences_of_interest))
         ].reset_index(drop = True)
-    mutations['VAF_ND'] = mutations.apply(lambda r: r['ALT_DEPTH_ND'] / r['DEPTH_ND'], axis=1)
-
-    gene_chr_dict = { x : y  for x, y in somatic_mutations[['canonical_SYMBOL', 'CHROM']].drop_duplicates().values }
+    
+    gene_chr_dict = { x : y  for x, y in somatic_mutations[['GENE', 'CHROM']].drop_duplicates().values }
 
     mutations = mutations[mutations['ALT_DEPTH_ND'] > 0]
 
     # parse dN/dS excess
-    excess_dict = compute_excess(omega_file)
+    excess_dict = compute_excess(omega_file, chosen_impacts_omega = chosen_impacts)
     genes_with_omega = list(excess_dict.keys())
 
     # select mutations
@@ -230,15 +235,15 @@ def snv_nd(sample, somatic_mutations_file, omega_file, recoded_genes=[]):
     for suffix in ['TOTAL', 'UPPER', 'MEAN', 'LOWER']:
         res_dict[f'GENOMES_SNV_ND_{suffix}'] = []
 
-    genes_with_mutations_n_omega = sorted(set(genes_with_omega).intersection(set(mutations['canonical_SYMBOL'].unique())))
-    
+    genes_with_mutations_n_omega = sorted(set(genes_with_omega).intersection(set(mutations['GENE'].unique())))
+
     for gene in genes_with_mutations_n_omega:
 
-        for csqn in ['missense', 'nonsense']:
+        for csqn in chosen_impacts:
 
             df_dict = {}
 
-            df_all = mutations[(mutations['canonical_SYMBOL'] == gene) & (mutations['canonical_Consequence_broader'] == csqn)]
+            df_all = mutations[(mutations['GENE'] == gene) & (mutations['Consequence_broader'].isin(broadimpact_grouping_dict[csqn]))]
             df_all = df_all.sort_values(by=['VAF_ND'], ascending=False)
             df_all.reset_index(drop=True, inplace=True)
             df_dict['TOTAL'] = df_all
@@ -270,9 +275,10 @@ def snv_nd(sample, somatic_mutations_file, omega_file, recoded_genes=[]):
                     res_dict[f'GENOMES_SNV_ND_{suffix}'] = res_dict[f'GENOMES_SNV_ND_{suffix}'] + [res['GENOMES']]
 
     df = pd.DataFrame(res_dict)
+    all_columnss = list(df.columns)
     df['SAMPLE'] = sample
-    return df
-    
+    return df[['SAMPLE'] + all_columnss]
+
 
 @click.command()
 @click.option('--sample')
@@ -281,20 +287,19 @@ def snv_nd(sample, somatic_mutations_file, omega_file, recoded_genes=[]):
 @click.option('--recoded-genes', default='', type=str)
 def run_all(sample, somatic_mutations_file, omega_file, recoded_genes):
     """
-    intent: 
-    script that combines the 3 covered epithelium functions, 
-    merges the output in a standardized way, so that we just 
-    get one output in the mutated_genomes_from_vaf/main.nf 
+    intent:
+    script that combines the 3 covered epithelium functions,
+    merges the output in a standardized way, so that we just
+    get one output in the mutated_genomes_from_vaf/main.nf
     nextflow module
-    
-    """
 
+    """
     # recodification of csqn type is only applied to SNVs
     recoded_genes = recoded_genes.split(',')
 
-    df_snv_am = snv_am(sample, somatic_mutations_file, omega_file, recoded_genes=recoded_genes)
+    df_snv_am = snv_am(sample, somatic_mutations_file, omega_file, recoded_genes= recoded_genes)
     df_indel_am = indel_am(sample, somatic_mutations_file)
-    df_snv_nd = snv_nd(sample, somatic_mutations_file, omega_file, recoded_genes=recoded_genes)
+    df_snv_nd = snv_nd(sample, somatic_mutations_file, omega_file, recoded_genes= recoded_genes)
 
     all_dataframes = [df_snv_am, df_indel_am, df_snv_nd]
     df_merged = reduce(lambda  left, right: pd.merge(left, right, on=['chr', 'gene', 'impact', 'SAMPLE'], how='outer'), all_dataframes)

From b64c7506de5484a14c9b3cc57776b6a1fd502628 Mon Sep 17 00:00:00 2001
From: FerriolCalvet <ferriolcalvet@gmail.com>
Date: Thu, 20 Mar 2025 10:29:30 +0100
Subject: [PATCH 6/7] fix missing update

-not tested
---
 bin/mutgenomes_driver_priority.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/mutgenomes_driver_priority.py b/bin/mutgenomes_driver_priority.py
index 4b726d8e..11e04b99 100755
--- a/bin/mutgenomes_driver_priority.py
+++ b/bin/mutgenomes_driver_priority.py
@@ -118,7 +118,7 @@ def snv_am(sample, somatic_mutations_file, omega_file, chosen_impacts=['missense
 
             df_dict = {}
 
-            df_all = mutations[(mutations['canonical_SYMBOL'] == gene) & (mutations['canonical_Consequence_broader'] == csqn)]
+            df_all = mutations[(mutations['GENE'] == gene) & (mutations['Consequence_broader'] == csqn)]
             df_all = df_all.sort_values(by=['VAF_AM'], ascending=False)
             df_all.reset_index(drop=True, inplace=True)
             df_dict['TOTAL'] = df_all

From ddaa08778e1ef4cdf573f0bae1d08e6f5570f80a Mon Sep 17 00:00:00 2001
From: FerriolCalvet <ferriolcalvet@gmail.com>
Date: Thu, 20 Mar 2025 16:27:25 +0100
Subject: [PATCH 7/7] update filtering in modules.config

- update handling of genes to recode param
---
 bin/mutgenomes_summary_tables.py | 6 +++---
 conf/modules.config              | 6 ++----
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/bin/mutgenomes_summary_tables.py b/bin/mutgenomes_summary_tables.py
index f9b8566b..db070392 100755
--- a/bin/mutgenomes_summary_tables.py
+++ b/bin/mutgenomes_summary_tables.py
@@ -50,7 +50,7 @@ def gather(metadata_file):
 
     for label in ['GENOMES_SNV_AM', 'GENOMES_SNV_ND']:
         for k in ['LOWER', 'MEAN', 'UPPER', 'TOTAL']:
-            
+
             df_all_annotated[f'CELLS_DOUBLE_HIT_{'_'.join(label.split('_')[1:])}_{k}'] = df_all_annotated[f'{label}_{k}'].values
 
             df_all_annotated[f'CELLS_SINGLE_HIT_{'_'.join(label.split('_')[1:])}_{k}'] = df_all_annotated.apply(
@@ -80,13 +80,13 @@ def gather(metadata_file):
 
     for label in ['GENOMES_SNV_AM', 'GENOMES_SNV_ND']:
         for k in ['LOWER', 'MEAN', 'UPPER', 'TOTAL']:
-        
+
             df_gene_annotated[f'CELLS_DOUBLE_HIT_{'_'.join(label.split('_')[1:])}_{k}'] = df_gene_annotated[f'{label}_{k}'].values
 
             df_gene_annotated[f'CELLS_SINGLE_HIT_{'_'.join(label.split('_')[1:])}_{k}'] = df_gene_annotated.apply(
                 lambda r: r[f'{label}_{k}'] if ((r['chr'] == 'chrX') and (r['SEX'] == 'M')) else 2 * r[f'{label}_{k}'],
                 axis=1)
-    
+
     # indels
 
     label = 'GENOMES_INDEL_AM'
diff --git a/conf/modules.config b/conf/modules.config
index df29d849..bbd598c0 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -486,19 +486,17 @@ process {
     if (params.mutated_cells_vaf) {
         if (params.all_duplex_counts){
             withName: 'SUBSETMUTEPIVAFAM' {
-                ext.filters     = { ['"canonical_Protein_affecting": "protein_affecting"'].join(',\t').trim() }
                 ext.output_fmt  = { ['"header": true',
                                         '"columns": ["CHROM", "canonical_SYMBOL", "canonical_Consequence_broader", "VAF_AM", "ALT_DEPTH_AM", "DEPTH_AM", "canonical_Protein_affecting", "TYPE", "VAF_ND", "ALT_DEPTH_ND", "DEPTH_ND", "VAF", "ALT_DEPTH", "DEPTH", "MUT_ID"]',
                                         '"colnames": ["CHROM", "GENE", "Consequence_broader", "VAF_AM", "ALT_DEPTH_AM", "DEPTH_AM", "Protein_affecting", "TYPE", "VAF_ND", "ALT_DEPTH_ND", "DEPTH_ND", "VAF", "ALT_DEPTH", "DEPTH", "MUT_ID"]'
                                     ].join(',\t').trim()
                                 }
-
                 publishDir       = [
                         enabled : false
                 ]
-            }            
+            }
             withName: 'MUTATEDGENOMESFROMVAFAM' {
-                ext.recode     = "${params.mutepi_genes_to_recode}"// "TERTpromoter"
+                ext.recode_list     = "${params.mutepi_genes_to_recode}"// "TERTpromoter"
             }
         }