From 8b772625ea114f1568c0a12cd9c8590a08abd130 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ferran=20Mui=C3=B1os?= <70538316+koszulordie@users.noreply.github.com> Date: Tue, 18 Mar 2025 17:13:58 +0100 Subject: [PATCH 1/7] update mutated genomes from VAF module mutgenomes_driver_priority.py command-line has a new option --recoded-genes that accepts comma-separated gene names for which all consequence types will be recoded to "missense" --- modules/local/mutated_genomes_from_vaf/main.nf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/local/mutated_genomes_from_vaf/main.nf b/modules/local/mutated_genomes_from_vaf/main.nf index 4971942a..71056e44 100644 --- a/modules/local/mutated_genomes_from_vaf/main.nf +++ b/modules/local/mutated_genomes_from_vaf/main.nf @@ -21,7 +21,8 @@ process MUTATED_GENOMES_FROM_VAF { mutgenomes_driver_priority.py \\ --sample ${prefix} \\ --somatic-mutations-file ${mutations} \\ - --omega-file ${omegas} ; + --omega-file ${omegas} \\ + --recoded-genes TERT ; cat <<-END_VERSIONS > versions.yml "${task.process}": From e6e5da1a7a6a3ed8423c1c8508ff9806449edb92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ferran=20Mui=C3=B1os?= <70538316+koszulordie@users.noreply.github.com> Date: Tue, 18 Mar 2025 17:28:08 +0100 Subject: [PATCH 2/7] update mutgenomes_driver_priority.py to allow csqn type recoding --- bin/mutgenomes_driver_priority.py | 186 +++++++++++++----------------- 1 file changed, 80 insertions(+), 106 deletions(-) diff --git a/bin/mutgenomes_driver_priority.py b/bin/mutgenomes_driver_priority.py index 4dded253..d5cddae9 100755 --- a/bin/mutgenomes_driver_priority.py +++ b/bin/mutgenomes_driver_priority.py @@ -12,6 +12,7 @@ from utils import inclusion_exclusion from utils_impacts import broadimpact_grouping_dict + def compute_prior(alt_read_count, depth): mean_p = (1 + alt_read_count.sum()) / depth.sum() @@ -72,21 +73,34 @@ def compute_covered_genomes(alt_read_count, depth): return res -def snv_am(sample, somatic_mutations_file, omega_file, chosen_impacts=['missense', 'truncating']): +def recode_csqn_type(df, gene, new_csqn_type): + + df['canonical_Consequence_broader'] = df.apply(lambda s: \ + new_csqn_type if s['canonical_SYMBOL'] == gene \ + else s['canonical_Consequence_broader'], + axis=1) + return df + + +def snv_am(sample, somatic_mutations_file, omega_file, recoded_genes=[]): # parse mutations somatic_mutations = pd.read_csv(somatic_mutations_file, sep='\t', low_memory=False) - consequences_of_interest = [ broadimpact_grouping_dict[csqn] for csqn in chosen_impacts] - consequences_of_interest = set([item for sublist in consequences_of_interest for item in sublist]) - + + # recode csqn type + for gene in recoded_genes: + somatic_mutations = recode_csqn_type(somatic_mutations, gene, 'missense') + print(gene, somatic_mutations[somatic_mutations['canonical_SYMBOL'] == gene]['canonical_Consequence_broader'].unique()) + mutations = somatic_mutations[(somatic_mutations['TYPE'] == 'SNV') - & (somatic_mutations['Consequence_broader'].isin(consequences_of_interest)) + & (somatic_mutations['canonical_Consequence_broader'].isin(['missense', 'nonsense'])) ].reset_index(drop = True) + mutations['VAF_AM'] = mutations.apply(lambda r: r['ALT_DEPTH_AM'] / r['DEPTH_AM'], axis=1) - gene_chr_dict = {x: y for x, y in somatic_mutations[['GENE', 'CHROM']].drop_duplicates().values} + gene_chr_dict = {x: y for x, y in somatic_mutations[['canonical_SYMBOL', 'CHROM']].drop_duplicates().values} # parse dN/dS excess - excess_dict = compute_excess(omega_file, chosen_impacts_omega = chosen_impacts) + excess_dict = compute_excess(omega_file) genes_with_omega = list(excess_dict.keys()) # select mutations @@ -97,53 +111,32 @@ def snv_am(sample, somatic_mutations_file, omega_file, chosen_impacts=['missense for suffix in ['TOTAL', 'UPPER', 'MEAN', 'LOWER']: res_dict[f'GENOMES_SNV_AM_{suffix}'] = [] - genes_with_mutations_n_omega = sorted(set(genes_with_omega).intersection(set(mutations['GENE'].unique()))) + genes_with_mutations_n_omega = sorted(set(genes_with_omega).intersection(set(mutations['canonical_SYMBOL'].unique()))) for gene in genes_with_mutations_n_omega: - for csqn in chosen_impacts: + for csqn in ['missense', 'nonsense']: - # total + df_dict = {} - df_all = mutations[(mutations['GENE'] == gene) & (mutations['Consequence_broader'].isin(broadimpact_grouping_dict[csqn]))] + df_all = mutations[(mutations['canonical_SYMBOL'] == gene) & (mutations['canonical_Consequence_broader'] == csqn)] df_all = df_all.sort_values(by=['VAF_AM'], ascending=False) df_all.reset_index(drop=True, inplace=True) + df_dict['TOTAL'] = df_all - # upper - - ind = int(excess_dict[gene][csqn][2] * df_all.shape[0]) - try: - assert(ind >= 1) - df_upper = df_all.loc[:ind - 1].copy() - except: - df_upper = pd.DataFrame(columns=df_all.columns) - - # mean - - ind = int(excess_dict[gene][csqn][1] * df_all.shape[0]) - try: - assert(ind >= 1) - df_mean = df_all.loc[:ind - 1].copy() - except: - df_mean = pd.DataFrame(columns=df_all.columns) - - # lower - - ind = int(excess_dict[gene][csqn][0] * df_all.shape[0]) - try: - assert(ind >= 1) - df_lower = df_all.loc[:ind - 1].copy() - except: - df_lower = pd.DataFrame(columns=df_all.columns) + for i, bound in enumerate(['LOWER', 'MEAN', 'UPPER']): + ind = int(excess_dict[gene][csqn][i] * df_all.shape[0]) + try: + assert(ind >= 1) + dg = df_all.loc[:ind - 1].copy() + except: + dg = pd.DataFrame(columns=df_all.columns) + df_dict[bound] = dg res_dict['gene'] = res_dict['gene'] + [gene] res_dict['impact'] = res_dict['impact'] + [csqn] res_dict['chr'] = res_dict['chr'] + [gene_chr_dict[gene]] - # summarize in a dictionary - - df_dict = {'TOTAL': df_all, 'UPPER': df_upper, 'MEAN': df_mean, 'LOWER': df_lower} - for suffix in ['TOTAL', 'UPPER', 'MEAN', 'LOWER']: if df_dict[suffix].shape[0] == 0: @@ -156,17 +149,17 @@ def snv_am(sample, somatic_mutations_file, omega_file, chosen_impacts=['missense res_dict[f'GENOMES_SNV_AM_{suffix}'] = res_dict[f'GENOMES_SNV_AM_{suffix}'] + [res['GENOMES']] df = pd.DataFrame(res_dict) - all_columnss = list(df.columns) df['SAMPLE'] = sample - return df[['SAMPLE'] + all_columnss] - + return df + def indel_am(sample, somatic_mutations_file): # parse mutations somatic_mutations = pd.read_csv(somatic_mutations_file, sep='\t', low_memory=False) mutations = somatic_mutations[(somatic_mutations['TYPE'].isin(['INSERTION', 'DELETION']))].reset_index(drop = True) - gene_chr_dict = {x: y for x, y in somatic_mutations[['GENE', 'CHROM']].drop_duplicates().values} + mutations['VAF_AM'] = mutations.apply(lambda r: r['ALT_DEPTH_AM'] / r['DEPTH_AM'], axis=1) + gene_chr_dict = {x: y for x, y in somatic_mutations[['canonical_SYMBOL', 'CHROM']].drop_duplicates().values} # select mutations @@ -175,8 +168,8 @@ def indel_am(sample, somatic_mutations_file): res_dict['chr'] = [] res_dict['impact'] = [] res_dict['GENOMES_INDEL_AM_TOTAL'] = [] - - for gene in mutations['GENE'].unique(): + + for gene in mutations['canonical_SYMBOL'].unique(): res_dict['gene'] = res_dict['gene'] + [gene] res_dict['chr'] = res_dict['chr'] + [gene_chr_dict[gene]] @@ -184,7 +177,7 @@ def indel_am(sample, somatic_mutations_file): # total - df_all = mutations[(mutations['GENE'] == gene)] + df_all = mutations[(mutations['canonical_SYMBOL'] == gene)] df_all = df_all.sort_values(by=['VAF_AM'], ascending=False) df_all.reset_index(drop=True, inplace=True) @@ -193,40 +186,39 @@ def indel_am(sample, somatic_mutations_file): df_dict = {'TOTAL': df_all} if df_dict['TOTAL'].shape[0] == 0: - res_dict[f'GENOMES_INDEL_AM_TOTAL'] = res_dict[f'GENOMES_INDEL_AM_TOTAL'] + [0.] - else: - depth = df_dict['TOTAL']['DEPTH_AM'].astype(np.float32).values alt_read_count = df_dict['TOTAL']['ALT_DEPTH_AM'].astype(np.float32).values res = compute_covered_genomes(alt_read_count, depth) res_dict[f'GENOMES_INDEL_AM_TOTAL'] = res_dict[f'GENOMES_INDEL_AM_TOTAL'] + [res['GENOMES']] df = pd.DataFrame(res_dict) - all_columnss = list(df.columns) df['SAMPLE'] = sample - return df[['SAMPLE'] + all_columnss] + return df + - -def snv_nd(sample, somatic_mutations_file, omega_file, chosen_impacts=['missense', 'truncating']): +def snv_nd(sample, somatic_mutations_file, omega_file, recoded_genes=[]): # parse mutations somatic_mutations = pd.read_csv(somatic_mutations_file, sep='\t', low_memory=False) - consequences_of_interest = [ broadimpact_grouping_dict[csqn] for csqn in chosen_impacts] - consequences_of_interest = set([item for sublist in consequences_of_interest for item in sublist]) + + # recode csqn type + for gene in recoded_genes: + somatic_mutations = recode_csqn_type(somatic_mutations, gene, 'missense') mutations = somatic_mutations[ (somatic_mutations['TYPE'] == 'SNV') - & (somatic_mutations['Consequence_broader'].isin(consequences_of_interest)) + & (somatic_mutations['canonical_Consequence_broader'].isin(['missense', 'nonsense'])) ].reset_index(drop = True) + mutations['VAF_ND'] = mutations.apply(lambda r: r['ALT_DEPTH_ND'] / r['DEPTH_ND'], axis=1) - gene_chr_dict = { x : y for x, y in somatic_mutations[['GENE', 'CHROM']].drop_duplicates().values } + gene_chr_dict = { x : y for x, y in somatic_mutations[['canonical_SYMBOL', 'CHROM']].drop_duplicates().values } mutations = mutations[mutations['ALT_DEPTH_ND'] > 0] # parse dN/dS excess - excess_dict = compute_excess(omega_file, chosen_impacts_omega = chosen_impacts) + excess_dict = compute_excess(omega_file) genes_with_omega = list(excess_dict.keys()) # select mutations @@ -238,53 +230,32 @@ def snv_nd(sample, somatic_mutations_file, omega_file, chosen_impacts=['missense for suffix in ['TOTAL', 'UPPER', 'MEAN', 'LOWER']: res_dict[f'GENOMES_SNV_ND_{suffix}'] = [] - genes_with_mutations_n_omega = sorted(set(genes_with_omega).intersection(set(mutations['GENE'].unique()))) - + genes_with_mutations_n_omega = sorted(set(genes_with_omega).intersection(set(mutations['canonical_SYMBOL'].unique()))) + for gene in genes_with_mutations_n_omega: - for csqn in chosen_impacts: + for csqn in ['missense', 'nonsense']: - # total + df_dict = {} - df_all = mutations[(mutations['GENE'] == gene) & (mutations['Consequence_broader'].isin(broadimpact_grouping_dict[csqn]))] + df_all = mutations[(mutations['canonical_SYMBOL'] == gene) & (mutations['canonical_Consequence_broader'] == csqn)] df_all = df_all.sort_values(by=['VAF_ND'], ascending=False) df_all.reset_index(drop=True, inplace=True) + df_dict['TOTAL'] = df_all - # upper - - ind = int(excess_dict[gene][csqn][2] * df_all.shape[0]) - try: - assert(ind >= 1) - df_upper = df_all.loc[:ind - 1].copy() - except: - df_upper = pd.DataFrame(columns=df_all.columns) - - # mean - - ind = int(excess_dict[gene][csqn][1] * df_all.shape[0]) - try: - assert(ind >= 1) - df_mean = df_all.loc[:ind - 1].copy() - except: - df_mean = pd.DataFrame(columns=df_all.columns) - - # lower - - ind = int(excess_dict[gene][csqn][0] * df_all.shape[0]) - try: - assert(ind >= 1) - df_lower = df_all.loc[:ind - 1].copy() - except: - df_lower = pd.DataFrame(columns=df_all.columns) + for i, bound in enumerate(['LOWER', 'MEAN', 'UPPER']): + ind = int(excess_dict[gene][csqn][i] * df_all.shape[0]) + try: + assert(ind >= 1) + dg = df_all.loc[:ind - 1].copy() + except: + dg = pd.DataFrame(columns=df_all.columns) + df_dict[bound] = dg res_dict['gene'] = res_dict['gene'] + [gene] res_dict['impact'] = res_dict['impact'] + [csqn] res_dict['chr'] = res_dict['chr'] + [gene_chr_dict[gene]] - # summarize in a dictionary - - df_dict = {'TOTAL': df_all, 'UPPER': df_upper, 'MEAN': df_mean, 'LOWER': df_lower} - for suffix in ['TOTAL', 'UPPER', 'MEAN', 'LOWER']: if df_dict[suffix].shape[0] == 0: @@ -299,28 +270,31 @@ def snv_nd(sample, somatic_mutations_file, omega_file, chosen_impacts=['missense res_dict[f'GENOMES_SNV_ND_{suffix}'] = res_dict[f'GENOMES_SNV_ND_{suffix}'] + [res['GENOMES']] df = pd.DataFrame(res_dict) - all_columnss = list(df.columns) df['SAMPLE'] = sample - return df[['SAMPLE'] + all_columnss] - + return df + @click.command() @click.option('--sample') @click.option('--somatic-mutations-file') @click.option('--omega-file') -def run_all(sample, somatic_mutations_file, omega_file): +@click.option('--recoded-genes', default='', type=str) +def run_all(sample, somatic_mutations_file, omega_file, recoded_genes): """ - intent: - script that combines the 3 covered epithelium functions, - merges the output in a standardized way, so that we just - get one output in the mutated_genomes_from_vaf/main.nf + intent: + script that combines the 3 covered epithelium functions, + merges the output in a standardized way, so that we just + get one output in the mutated_genomes_from_vaf/main.nf nextflow module - + """ - df_snv_am = snv_am(sample, somatic_mutations_file, omega_file) + # recodification of csqn type is only applied to SNVs + recoded_genes = recoded_genes.split(',') + + df_snv_am = snv_am(sample, somatic_mutations_file, omega_file, recoded_genes=recoded_genes) df_indel_am = indel_am(sample, somatic_mutations_file) - df_snv_nd = snv_nd(sample, somatic_mutations_file, omega_file) + df_snv_nd = snv_nd(sample, somatic_mutations_file, omega_file, recoded_genes=recoded_genes) all_dataframes = [df_snv_am, df_indel_am, df_snv_nd] df_merged = reduce(lambda left, right: pd.merge(left, right, on=['chr', 'gene', 'impact', 'SAMPLE'], how='outer'), all_dataframes) From e9297b3ca9ddce522dee7d13036c9056ff555051 Mon Sep 17 00:00:00 2001 From: FerriolCalvet Date: Wed, 19 Mar 2025 20:38:05 +0100 Subject: [PATCH 3/7] add nextflow handling of recoding genes - not tested --- modules/local/mutated_genomes_from_vaf/main.nf | 3 ++- nextflow.config | 1 + nextflow_schema.json | 7 ++++++- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/modules/local/mutated_genomes_from_vaf/main.nf b/modules/local/mutated_genomes_from_vaf/main.nf index 71056e44..703a1b15 100644 --- a/modules/local/mutated_genomes_from_vaf/main.nf +++ b/modules/local/mutated_genomes_from_vaf/main.nf @@ -17,12 +17,13 @@ process MUTATED_GENOMES_FROM_VAF { script: def prefix = task.ext.prefix ?: "${meta.id}" + def recode = task.ext.recode_list ? "--recoded-genes ${task.ext.recode_list}" : "" """ mutgenomes_driver_priority.py \\ --sample ${prefix} \\ --somatic-mutations-file ${mutations} \\ --omega-file ${omegas} \\ - --recoded-genes TERT ; + ${recode} ; cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/nextflow.config b/nextflow.config index 397867d0..0ed82967 100644 --- a/nextflow.config +++ b/nextflow.config @@ -53,6 +53,7 @@ params { signatures = false mutationrate = false mutated_cells_vaf = false + mutepi_genes_to_recode = null expected_mutated_cells = false dnds = false diff --git a/nextflow_schema.json b/nextflow_schema.json index 689ea911..59d2451c 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -319,9 +319,14 @@ }, "mutated_cells_vaf": { "type": "boolean", - "description": "Do you want to run mutated_cells_reads using the VAFs?", + "description": "Do you want to compute the proportion of mutated cells using the VAFs?", "fa_icon": "fas fa-book" }, + "mutepi_genes_to_recode": { + "type": "string", + "description": "Do you want to consider all mutations in a gene for the mutated epithelium from VAF?", + "fa_icon": "fas fa-book" + }, "expected_mutated_cells": { "type": "boolean", "description": "Do you want to run expected_mutated_cells?", From e6eec4972b7f6a5c6d3cfb0b29691a228bcbea24 Mon Sep 17 00:00:00 2001 From: FerriolCalvet Date: Wed, 19 Mar 2025 20:43:44 +0100 Subject: [PATCH 4/7] fix missing param definition in modules.config --- conf/modules.config | 3 +++ 1 file changed, 3 insertions(+) diff --git a/conf/modules.config b/conf/modules.config index f44daad1..df29d849 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -496,6 +496,9 @@ process { publishDir = [ enabled : false ] + } + withName: 'MUTATEDGENOMESFROMVAFAM' { + ext.recode = "${params.mutepi_genes_to_recode}"// "TERTpromoter" } } From 1467e497b73bdeb5980ffcd2972d37d8177a1884 Mon Sep 17 00:00:00 2001 From: FerriolCalvet Date: Thu, 20 Mar 2025 10:17:02 +0100 Subject: [PATCH 5/7] update driver priority code --- bin/mutgenomes_driver_priority.py | 93 ++++++++++++++++--------------- 1 file changed, 49 insertions(+), 44 deletions(-) diff --git a/bin/mutgenomes_driver_priority.py b/bin/mutgenomes_driver_priority.py index d5cddae9..4b726d8e 100755 --- a/bin/mutgenomes_driver_priority.py +++ b/bin/mutgenomes_driver_priority.py @@ -75,32 +75,31 @@ def compute_covered_genomes(alt_read_count, depth): def recode_csqn_type(df, gene, new_csqn_type): - df['canonical_Consequence_broader'] = df.apply(lambda s: \ - new_csqn_type if s['canonical_SYMBOL'] == gene \ - else s['canonical_Consequence_broader'], + df['Consequence_broader'] = df.apply(lambda s: \ + new_csqn_type if s['GENE'] == gene \ + else s['Consequence_broader'], axis=1) return df - - -def snv_am(sample, somatic_mutations_file, omega_file, recoded_genes=[]): +def snv_am(sample, somatic_mutations_file, omega_file, chosen_impacts=['missense', 'truncating'], recoded_genes=[]): # parse mutations somatic_mutations = pd.read_csv(somatic_mutations_file, sep='\t', low_memory=False) - + consequences_of_interest = [ broadimpact_grouping_dict[csqn] for csqn in chosen_impacts] + consequences_of_interest = set([item for sublist in consequences_of_interest for item in sublist]) + # recode csqn type for gene in recoded_genes: somatic_mutations = recode_csqn_type(somatic_mutations, gene, 'missense') - print(gene, somatic_mutations[somatic_mutations['canonical_SYMBOL'] == gene]['canonical_Consequence_broader'].unique()) + print(gene, somatic_mutations[somatic_mutations['GENE'] == gene]['Consequence_broader'].unique()) mutations = somatic_mutations[(somatic_mutations['TYPE'] == 'SNV') - & (somatic_mutations['canonical_Consequence_broader'].isin(['missense', 'nonsense'])) + & (somatic_mutations['Consequence_broader'].isin(consequences_of_interest)) ].reset_index(drop = True) - mutations['VAF_AM'] = mutations.apply(lambda r: r['ALT_DEPTH_AM'] / r['DEPTH_AM'], axis=1) - gene_chr_dict = {x: y for x, y in somatic_mutations[['canonical_SYMBOL', 'CHROM']].drop_duplicates().values} + gene_chr_dict = {x: y for x, y in somatic_mutations[['GENE', 'CHROM']].drop_duplicates().values} # parse dN/dS excess - excess_dict = compute_excess(omega_file) + excess_dict = compute_excess(omega_file, chosen_impacts_omega = chosen_impacts) genes_with_omega = list(excess_dict.keys()) # select mutations @@ -111,11 +110,11 @@ def snv_am(sample, somatic_mutations_file, omega_file, recoded_genes=[]): for suffix in ['TOTAL', 'UPPER', 'MEAN', 'LOWER']: res_dict[f'GENOMES_SNV_AM_{suffix}'] = [] - genes_with_mutations_n_omega = sorted(set(genes_with_omega).intersection(set(mutations['canonical_SYMBOL'].unique()))) + genes_with_mutations_n_omega = sorted(set(genes_with_omega).intersection(set(mutations['GENE'].unique()))) for gene in genes_with_mutations_n_omega: - for csqn in ['missense', 'nonsense']: + for csqn in chosen_impacts: df_dict = {} @@ -149,17 +148,17 @@ def snv_am(sample, somatic_mutations_file, omega_file, recoded_genes=[]): res_dict[f'GENOMES_SNV_AM_{suffix}'] = res_dict[f'GENOMES_SNV_AM_{suffix}'] + [res['GENOMES']] df = pd.DataFrame(res_dict) + all_columnss = list(df.columns) df['SAMPLE'] = sample - return df - + return df[['SAMPLE'] + all_columnss] + def indel_am(sample, somatic_mutations_file): # parse mutations somatic_mutations = pd.read_csv(somatic_mutations_file, sep='\t', low_memory=False) mutations = somatic_mutations[(somatic_mutations['TYPE'].isin(['INSERTION', 'DELETION']))].reset_index(drop = True) - mutations['VAF_AM'] = mutations.apply(lambda r: r['ALT_DEPTH_AM'] / r['DEPTH_AM'], axis=1) - gene_chr_dict = {x: y for x, y in somatic_mutations[['canonical_SYMBOL', 'CHROM']].drop_duplicates().values} + gene_chr_dict = {x: y for x, y in somatic_mutations[['GENE', 'CHROM']].drop_duplicates().values} # select mutations @@ -168,8 +167,8 @@ def indel_am(sample, somatic_mutations_file): res_dict['chr'] = [] res_dict['impact'] = [] res_dict['GENOMES_INDEL_AM_TOTAL'] = [] - - for gene in mutations['canonical_SYMBOL'].unique(): + + for gene in mutations['GENE'].unique(): res_dict['gene'] = res_dict['gene'] + [gene] res_dict['chr'] = res_dict['chr'] + [gene_chr_dict[gene]] @@ -177,7 +176,7 @@ def indel_am(sample, somatic_mutations_file): # total - df_all = mutations[(mutations['canonical_SYMBOL'] == gene)] + df_all = mutations[(mutations['GENE'] == gene)] df_all = df_all.sort_values(by=['VAF_AM'], ascending=False) df_all.reset_index(drop=True, inplace=True) @@ -186,39 +185,45 @@ def indel_am(sample, somatic_mutations_file): df_dict = {'TOTAL': df_all} if df_dict['TOTAL'].shape[0] == 0: + res_dict[f'GENOMES_INDEL_AM_TOTAL'] = res_dict[f'GENOMES_INDEL_AM_TOTAL'] + [0.] + else: + depth = df_dict['TOTAL']['DEPTH_AM'].astype(np.float32).values alt_read_count = df_dict['TOTAL']['ALT_DEPTH_AM'].astype(np.float32).values res = compute_covered_genomes(alt_read_count, depth) res_dict[f'GENOMES_INDEL_AM_TOTAL'] = res_dict[f'GENOMES_INDEL_AM_TOTAL'] + [res['GENOMES']] df = pd.DataFrame(res_dict) + all_columnss = list(df.columns) df['SAMPLE'] = sample - return df - + return df[['SAMPLE'] + all_columnss] + -def snv_nd(sample, somatic_mutations_file, omega_file, recoded_genes=[]): +def snv_nd(sample, somatic_mutations_file, omega_file, chosen_impacts=['missense', 'truncating'], recoded_genes=[]): # parse mutations somatic_mutations = pd.read_csv(somatic_mutations_file, sep='\t', low_memory=False) + consequences_of_interest = [ broadimpact_grouping_dict[csqn] for csqn in chosen_impacts] + consequences_of_interest = set([item for sublist in consequences_of_interest for item in sublist]) # recode csqn type for gene in recoded_genes: somatic_mutations = recode_csqn_type(somatic_mutations, gene, 'missense') - + print(gene, somatic_mutations[somatic_mutations['GENE'] == gene]['Consequence_broader'].unique()) + mutations = somatic_mutations[ (somatic_mutations['TYPE'] == 'SNV') - & (somatic_mutations['canonical_Consequence_broader'].isin(['missense', 'nonsense'])) + & (somatic_mutations['Consequence_broader'].isin(consequences_of_interest)) ].reset_index(drop = True) - mutations['VAF_ND'] = mutations.apply(lambda r: r['ALT_DEPTH_ND'] / r['DEPTH_ND'], axis=1) - - gene_chr_dict = { x : y for x, y in somatic_mutations[['canonical_SYMBOL', 'CHROM']].drop_duplicates().values } + + gene_chr_dict = { x : y for x, y in somatic_mutations[['GENE', 'CHROM']].drop_duplicates().values } mutations = mutations[mutations['ALT_DEPTH_ND'] > 0] # parse dN/dS excess - excess_dict = compute_excess(omega_file) + excess_dict = compute_excess(omega_file, chosen_impacts_omega = chosen_impacts) genes_with_omega = list(excess_dict.keys()) # select mutations @@ -230,15 +235,15 @@ def snv_nd(sample, somatic_mutations_file, omega_file, recoded_genes=[]): for suffix in ['TOTAL', 'UPPER', 'MEAN', 'LOWER']: res_dict[f'GENOMES_SNV_ND_{suffix}'] = [] - genes_with_mutations_n_omega = sorted(set(genes_with_omega).intersection(set(mutations['canonical_SYMBOL'].unique()))) - + genes_with_mutations_n_omega = sorted(set(genes_with_omega).intersection(set(mutations['GENE'].unique()))) + for gene in genes_with_mutations_n_omega: - for csqn in ['missense', 'nonsense']: + for csqn in chosen_impacts: df_dict = {} - df_all = mutations[(mutations['canonical_SYMBOL'] == gene) & (mutations['canonical_Consequence_broader'] == csqn)] + df_all = mutations[(mutations['GENE'] == gene) & (mutations['Consequence_broader'].isin(broadimpact_grouping_dict[csqn]))] df_all = df_all.sort_values(by=['VAF_ND'], ascending=False) df_all.reset_index(drop=True, inplace=True) df_dict['TOTAL'] = df_all @@ -270,9 +275,10 @@ def snv_nd(sample, somatic_mutations_file, omega_file, recoded_genes=[]): res_dict[f'GENOMES_SNV_ND_{suffix}'] = res_dict[f'GENOMES_SNV_ND_{suffix}'] + [res['GENOMES']] df = pd.DataFrame(res_dict) + all_columnss = list(df.columns) df['SAMPLE'] = sample - return df - + return df[['SAMPLE'] + all_columnss] + @click.command() @click.option('--sample') @@ -281,20 +287,19 @@ def snv_nd(sample, somatic_mutations_file, omega_file, recoded_genes=[]): @click.option('--recoded-genes', default='', type=str) def run_all(sample, somatic_mutations_file, omega_file, recoded_genes): """ - intent: - script that combines the 3 covered epithelium functions, - merges the output in a standardized way, so that we just - get one output in the mutated_genomes_from_vaf/main.nf + intent: + script that combines the 3 covered epithelium functions, + merges the output in a standardized way, so that we just + get one output in the mutated_genomes_from_vaf/main.nf nextflow module - - """ + """ # recodification of csqn type is only applied to SNVs recoded_genes = recoded_genes.split(',') - df_snv_am = snv_am(sample, somatic_mutations_file, omega_file, recoded_genes=recoded_genes) + df_snv_am = snv_am(sample, somatic_mutations_file, omega_file, recoded_genes= recoded_genes) df_indel_am = indel_am(sample, somatic_mutations_file) - df_snv_nd = snv_nd(sample, somatic_mutations_file, omega_file, recoded_genes=recoded_genes) + df_snv_nd = snv_nd(sample, somatic_mutations_file, omega_file, recoded_genes= recoded_genes) all_dataframes = [df_snv_am, df_indel_am, df_snv_nd] df_merged = reduce(lambda left, right: pd.merge(left, right, on=['chr', 'gene', 'impact', 'SAMPLE'], how='outer'), all_dataframes) From b64c7506de5484a14c9b3cc57776b6a1fd502628 Mon Sep 17 00:00:00 2001 From: FerriolCalvet Date: Thu, 20 Mar 2025 10:29:30 +0100 Subject: [PATCH 6/7] fix missing update -not tested --- bin/mutgenomes_driver_priority.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/mutgenomes_driver_priority.py b/bin/mutgenomes_driver_priority.py index 4b726d8e..11e04b99 100755 --- a/bin/mutgenomes_driver_priority.py +++ b/bin/mutgenomes_driver_priority.py @@ -118,7 +118,7 @@ def snv_am(sample, somatic_mutations_file, omega_file, chosen_impacts=['missense df_dict = {} - df_all = mutations[(mutations['canonical_SYMBOL'] == gene) & (mutations['canonical_Consequence_broader'] == csqn)] + df_all = mutations[(mutations['GENE'] == gene) & (mutations['Consequence_broader'] == csqn)] df_all = df_all.sort_values(by=['VAF_AM'], ascending=False) df_all.reset_index(drop=True, inplace=True) df_dict['TOTAL'] = df_all From ddaa08778e1ef4cdf573f0bae1d08e6f5570f80a Mon Sep 17 00:00:00 2001 From: FerriolCalvet Date: Thu, 20 Mar 2025 16:27:25 +0100 Subject: [PATCH 7/7] update filtering in modules.config - update handling of genes to recode param --- bin/mutgenomes_summary_tables.py | 6 +++--- conf/modules.config | 6 ++---- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/bin/mutgenomes_summary_tables.py b/bin/mutgenomes_summary_tables.py index f9b8566b..db070392 100755 --- a/bin/mutgenomes_summary_tables.py +++ b/bin/mutgenomes_summary_tables.py @@ -50,7 +50,7 @@ def gather(metadata_file): for label in ['GENOMES_SNV_AM', 'GENOMES_SNV_ND']: for k in ['LOWER', 'MEAN', 'UPPER', 'TOTAL']: - + df_all_annotated[f'CELLS_DOUBLE_HIT_{'_'.join(label.split('_')[1:])}_{k}'] = df_all_annotated[f'{label}_{k}'].values df_all_annotated[f'CELLS_SINGLE_HIT_{'_'.join(label.split('_')[1:])}_{k}'] = df_all_annotated.apply( @@ -80,13 +80,13 @@ def gather(metadata_file): for label in ['GENOMES_SNV_AM', 'GENOMES_SNV_ND']: for k in ['LOWER', 'MEAN', 'UPPER', 'TOTAL']: - + df_gene_annotated[f'CELLS_DOUBLE_HIT_{'_'.join(label.split('_')[1:])}_{k}'] = df_gene_annotated[f'{label}_{k}'].values df_gene_annotated[f'CELLS_SINGLE_HIT_{'_'.join(label.split('_')[1:])}_{k}'] = df_gene_annotated.apply( lambda r: r[f'{label}_{k}'] if ((r['chr'] == 'chrX') and (r['SEX'] == 'M')) else 2 * r[f'{label}_{k}'], axis=1) - + # indels label = 'GENOMES_INDEL_AM' diff --git a/conf/modules.config b/conf/modules.config index df29d849..bbd598c0 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -486,19 +486,17 @@ process { if (params.mutated_cells_vaf) { if (params.all_duplex_counts){ withName: 'SUBSETMUTEPIVAFAM' { - ext.filters = { ['"canonical_Protein_affecting": "protein_affecting"'].join(',\t').trim() } ext.output_fmt = { ['"header": true', '"columns": ["CHROM", "canonical_SYMBOL", "canonical_Consequence_broader", "VAF_AM", "ALT_DEPTH_AM", "DEPTH_AM", "canonical_Protein_affecting", "TYPE", "VAF_ND", "ALT_DEPTH_ND", "DEPTH_ND", "VAF", "ALT_DEPTH", "DEPTH", "MUT_ID"]', '"colnames": ["CHROM", "GENE", "Consequence_broader", "VAF_AM", "ALT_DEPTH_AM", "DEPTH_AM", "Protein_affecting", "TYPE", "VAF_ND", "ALT_DEPTH_ND", "DEPTH_ND", "VAF", "ALT_DEPTH", "DEPTH", "MUT_ID"]' ].join(',\t').trim() } - publishDir = [ enabled : false ] - } + } withName: 'MUTATEDGENOMESFROMVAFAM' { - ext.recode = "${params.mutepi_genes_to_recode}"// "TERTpromoter" + ext.recode_list = "${params.mutepi_genes_to_recode}"// "TERTpromoter" } }