bbglab · FerriolCalvet · Mar 20, 2025 · Mar 18, 2025 · Mar 18, 2025 · Mar 19, 2025
diff --git a/bin/mutgenomes_driver_priority.py b/bin/mutgenomes_driver_priority.py
@@ -12,6 +12,7 @@
 from utils import inclusion_exclusion
 from utils_impacts import broadimpact_grouping_dict
 
+
 def compute_prior(alt_read_count, depth):
 
     mean_p = (1 + alt_read_count.sum()) / depth.sum()
@@ -72,13 +73,25 @@ def compute_covered_genomes(alt_read_count, depth):
     return res
 
 
-def snv_am(sample, somatic_mutations_file, omega_file, chosen_impacts=['missense', 'truncating']):
+def recode_csqn_type(df, gene, new_csqn_type):
+
+    df['Consequence_broader'] = df.apply(lambda s: \
+        new_csqn_type if s['GENE'] == gene \
+        else s['Consequence_broader'], 
+        axis=1)
+    return df
+def snv_am(sample, somatic_mutations_file, omega_file, chosen_impacts=['missense', 'truncating'], recoded_genes=[]):
 
     # parse mutations
     somatic_mutations = pd.read_csv(somatic_mutations_file, sep='\t', low_memory=False)
     consequences_of_interest = [ broadimpact_grouping_dict[csqn] for csqn in chosen_impacts]
     consequences_of_interest = set([item for sublist in consequences_of_interest for item in sublist])
 
+    # recode csqn type
+    for gene in recoded_genes:
+        somatic_mutations = recode_csqn_type(somatic_mutations, gene, 'missense')
+        print(gene, somatic_mutations[somatic_mutations['GENE'] == gene]['Consequence_broader'].unique())
+
     mutations = somatic_mutations[(somatic_mutations['TYPE'] == 'SNV')
                                     & (somatic_mutations['Consequence_broader'].isin(consequences_of_interest))
                                     ].reset_index(drop = True)
@@ -103,47 +116,26 @@ def snv_am(sample, somatic_mutations_file, omega_file, chosen_impacts=['missense
 
         for csqn in chosen_impacts:
 
-            # total
+            df_dict = {}
 
-            df_all = mutations[(mutations['GENE'] == gene) & (mutations['Consequence_broader'].isin(broadimpact_grouping_dict[csqn]))]
+            df_all = mutations[(mutations['GENE'] == gene) & (mutations['Consequence_broader'] == csqn)]
             df_all = df_all.sort_values(by=['VAF_AM'], ascending=False)
             df_all.reset_index(drop=True, inplace=True)
+            df_dict['TOTAL'] = df_all
 
-            # upper
-
-            ind = int(excess_dict[gene][csqn][2] * df_all.shape[0])
-            try:
-                assert(ind >= 1)
-                df_upper = df_all.loc[:ind - 1].copy()
-            except:
-                df_upper = pd.DataFrame(columns=df_all.columns)
-
-            # mean
-
-            ind = int(excess_dict[gene][csqn][1] * df_all.shape[0])
-            try:
-                assert(ind >= 1)
-                df_mean = df_all.loc[:ind - 1].copy()
-            except:
-                df_mean = pd.DataFrame(columns=df_all.columns)
-
-            # lower
-
-            ind = int(excess_dict[gene][csqn][0] * df_all.shape[0])
-            try:
-                assert(ind >= 1)
-                df_lower = df_all.loc[:ind - 1].copy()
-            except:
-                df_lower = pd.DataFrame(columns=df_all.columns)
+            for i, bound in enumerate(['LOWER', 'MEAN', 'UPPER']):
+                ind = int(excess_dict[gene][csqn][i] * df_all.shape[0])
+                try:
+                    assert(ind >= 1)
+                    dg = df_all.loc[:ind - 1].copy()
+                except:
+                    dg = pd.DataFrame(columns=df_all.columns)
+                df_dict[bound] = dg
 
             res_dict['gene'] = res_dict['gene'] + [gene]
             res_dict['impact'] = res_dict['impact'] + [csqn]
             res_dict['chr'] = res_dict['chr'] + [gene_chr_dict[gene]]
 
-            # summarize in a dictionary
-
-            df_dict = {'TOTAL': df_all, 'UPPER': df_upper, 'MEAN': df_mean, 'LOWER': df_lower}
-
             for suffix in ['TOTAL', 'UPPER', 'MEAN', 'LOWER']:
 
                 if df_dict[suffix].shape[0] == 0:
@@ -209,18 +201,23 @@ def indel_am(sample, somatic_mutations_file):
     return df[['SAMPLE'] + all_columnss]
 
 
-def snv_nd(sample, somatic_mutations_file, omega_file, chosen_impacts=['missense', 'truncating']):
+def snv_nd(sample, somatic_mutations_file, omega_file, chosen_impacts=['missense', 'truncating'], recoded_genes=[]):
 
     # parse mutations
     somatic_mutations = pd.read_csv(somatic_mutations_file, sep='\t', low_memory=False)
     consequences_of_interest = [ broadimpact_grouping_dict[csqn] for csqn in chosen_impacts]
     consequences_of_interest = set([item for sublist in consequences_of_interest for item in sublist])
 
+    # recode csqn type
+    for gene in recoded_genes:
+        somatic_mutations = recode_csqn_type(somatic_mutations, gene, 'missense')
+        print(gene, somatic_mutations[somatic_mutations['GENE'] == gene]['Consequence_broader'].unique())
+
     mutations = somatic_mutations[
         (somatic_mutations['TYPE'] == 'SNV')
         & (somatic_mutations['Consequence_broader'].isin(consequences_of_interest))
         ].reset_index(drop = True)
-
+    
     gene_chr_dict = { x : y  for x, y in somatic_mutations[['GENE', 'CHROM']].drop_duplicates().values }
 
     mutations = mutations[mutations['ALT_DEPTH_ND'] > 0]
@@ -244,47 +241,26 @@ def snv_nd(sample, somatic_mutations_file, omega_file, chosen_impacts=['missense
 
         for csqn in chosen_impacts:
 
-            # total
+            df_dict = {}
 
             df_all = mutations[(mutations['GENE'] == gene) & (mutations['Consequence_broader'].isin(broadimpact_grouping_dict[csqn]))]
             df_all = df_all.sort_values(by=['VAF_ND'], ascending=False)
             df_all.reset_index(drop=True, inplace=True)
+            df_dict['TOTAL'] = df_all
 
-            # upper
-
-            ind = int(excess_dict[gene][csqn][2] * df_all.shape[0])
-            try:
-                assert(ind >= 1)
-                df_upper = df_all.loc[:ind - 1].copy()
-            except:
-                df_upper = pd.DataFrame(columns=df_all.columns)
-
-            # mean
-
-            ind = int(excess_dict[gene][csqn][1] * df_all.shape[0])
-            try:
-                assert(ind >= 1)
-                df_mean = df_all.loc[:ind - 1].copy()
-            except:
-                df_mean = pd.DataFrame(columns=df_all.columns)
-
-            # lower
-
-            ind = int(excess_dict[gene][csqn][0] * df_all.shape[0])
-            try:
-                assert(ind >= 1)
-                df_lower = df_all.loc[:ind - 1].copy()
-            except:
-                df_lower = pd.DataFrame(columns=df_all.columns)
+            for i, bound in enumerate(['LOWER', 'MEAN', 'UPPER']):
+                ind = int(excess_dict[gene][csqn][i] * df_all.shape[0])
+                try:
+                    assert(ind >= 1)
+                    dg = df_all.loc[:ind - 1].copy()
+                except:
+                    dg = pd.DataFrame(columns=df_all.columns)
+                df_dict[bound] = dg
 
             res_dict['gene'] = res_dict['gene'] + [gene]
             res_dict['impact'] = res_dict['impact'] + [csqn]
             res_dict['chr'] = res_dict['chr'] + [gene_chr_dict[gene]]
 
-            # summarize in a dictionary
-
-            df_dict = {'TOTAL': df_all, 'UPPER': df_upper, 'MEAN': df_mean, 'LOWER': df_lower}
-
             for suffix in ['TOTAL', 'UPPER', 'MEAN', 'LOWER']:
 
                 if df_dict[suffix].shape[0] == 0:
@@ -308,7 +284,8 @@ def snv_nd(sample, somatic_mutations_file, omega_file, chosen_impacts=['missense
 @click.option('--sample')
 @click.option('--somatic-mutations-file')
 @click.option('--omega-file')
-def run_all(sample, somatic_mutations_file, omega_file):
+@click.option('--recoded-genes', default='', type=str)
+def run_all(sample, somatic_mutations_file, omega_file, recoded_genes):
     """
     intent:
     script that combines the 3 covered epithelium functions,
@@ -317,10 +294,12 @@ def run_all(sample, somatic_mutations_file, omega_file):
     nextflow module
 
     """
+    # recodification of csqn type is only applied to SNVs
+    recoded_genes = recoded_genes.split(',')
 
-    df_snv_am = snv_am(sample, somatic_mutations_file, omega_file)
+    df_snv_am = snv_am(sample, somatic_mutations_file, omega_file, recoded_genes= recoded_genes)
     df_indel_am = indel_am(sample, somatic_mutations_file)
-    df_snv_nd = snv_nd(sample, somatic_mutations_file, omega_file)
+    df_snv_nd = snv_nd(sample, somatic_mutations_file, omega_file, recoded_genes= recoded_genes)
 
     all_dataframes = [df_snv_am, df_indel_am, df_snv_nd]
     df_merged = reduce(lambda  left, right: pd.merge(left, right, on=['chr', 'gene', 'impact', 'SAMPLE'], how='outer'), all_dataframes)

diff --git a/bin/mutgenomes_summary_tables.py b/bin/mutgenomes_summary_tables.py
@@ -50,7 +50,7 @@ def gather(metadata_file):
 
     for label in ['GENOMES_SNV_AM', 'GENOMES_SNV_ND']:
         for k in ['LOWER', 'MEAN', 'UPPER', 'TOTAL']:
-            
+
             df_all_annotated[f'CELLS_DOUBLE_HIT_{'_'.join(label.split('_')[1:])}_{k}'] = df_all_annotated[f'{label}_{k}'].values
 
             df_all_annotated[f'CELLS_SINGLE_HIT_{'_'.join(label.split('_')[1:])}_{k}'] = df_all_annotated.apply(
@@ -80,13 +80,13 @@ def gather(metadata_file):
 
     for label in ['GENOMES_SNV_AM', 'GENOMES_SNV_ND']:
         for k in ['LOWER', 'MEAN', 'UPPER', 'TOTAL']:
-        
+
             df_gene_annotated[f'CELLS_DOUBLE_HIT_{'_'.join(label.split('_')[1:])}_{k}'] = df_gene_annotated[f'{label}_{k}'].values
 
             df_gene_annotated[f'CELLS_SINGLE_HIT_{'_'.join(label.split('_')[1:])}_{k}'] = df_gene_annotated.apply(
                 lambda r: r[f'{label}_{k}'] if ((r['chr'] == 'chrX') and (r['SEX'] == 'M')) else 2 * r[f'{label}_{k}'],
                 axis=1)
-    
+
     # indels
 
     label = 'GENOMES_INDEL_AM'

diff --git a/conf/modules.config b/conf/modules.config
@@ -486,17 +486,18 @@ process {
     if (params.mutated_cells_vaf) {
         if (params.all_duplex_counts){
             withName: 'SUBSETMUTEPIVAFAM' {
-                ext.filters     = { ['"canonical_Protein_affecting": "protein_affecting"'].join(',\t').trim() }
                 ext.output_fmt  = { ['"header": true',
                                         '"columns": ["CHROM", "canonical_SYMBOL", "canonical_Consequence_broader", "VAF_AM", "ALT_DEPTH_AM", "DEPTH_AM", "canonical_Protein_affecting", "TYPE", "VAF_ND", "ALT_DEPTH_ND", "DEPTH_ND", "VAF", "ALT_DEPTH", "DEPTH", "MUT_ID"]',
                                         '"colnames": ["CHROM", "GENE", "Consequence_broader", "VAF_AM", "ALT_DEPTH_AM", "DEPTH_AM", "Protein_affecting", "TYPE", "VAF_ND", "ALT_DEPTH_ND", "DEPTH_ND", "VAF", "ALT_DEPTH", "DEPTH", "MUT_ID"]'
                                     ].join(',\t').trim()
                                 }
-
                 publishDir       = [
                         enabled : false
                 ]
             }
+            withName: 'MUTATEDGENOMESFROMVAFAM' {
+                ext.recode_list     = "${params.mutepi_genes_to_recode}"// "TERTpromoter"
+            }
         }
 
     }

diff --git a/modules/local/mutated_genomes_from_vaf/main.nf b/modules/local/mutated_genomes_from_vaf/main.nf
@@ -17,11 +17,13 @@ process MUTATED_GENOMES_FROM_VAF {
 
     script:
     def prefix = task.ext.prefix ?: "${meta.id}"
+    def recode = task.ext.recode_list ? "--recoded-genes ${task.ext.recode_list}" : ""
     """
     mutgenomes_driver_priority.py \\
                 --sample ${prefix} \\
                 --somatic-mutations-file ${mutations} \\
-                --omega-file ${omegas} ;
+                --omega-file ${omegas} \\
+                ${recode} ;
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":

diff --git a/nextflow.config b/nextflow.config
@@ -53,6 +53,7 @@ params {
     signatures                  = false
     mutationrate                = false
     mutated_cells_vaf           = false
+    mutepi_genes_to_recode      = null
     expected_mutated_cells      = false
     dnds                        = false
 

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -319,9 +319,14 @@
                 },
                 "mutated_cells_vaf": {
                     "type": "boolean",
-                    "description": "Do you want to run mutated_cells_reads using the VAFs?",
+                    "description": "Do you want to compute the proportion of mutated cells using the VAFs?",
                     "fa_icon": "fas fa-book"
                 },
+                "mutepi_genes_to_recode": {
+                    "type": "string",
+                    "description": "Do you want to consider all mutations in a gene for the mutated epithelium from VAF?",
+                    "fa_icon": "fas fa-book"
+                },                
                 "expected_mutated_cells": {
                     "type": "boolean",
                     "description": "Do you want to run expected_mutated_cells?",