Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 48 additions & 69 deletions bin/mutgenomes_driver_priority.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from utils import inclusion_exclusion
from utils_impacts import broadimpact_grouping_dict


def compute_prior(alt_read_count, depth):

mean_p = (1 + alt_read_count.sum()) / depth.sum()
Expand Down Expand Up @@ -72,13 +73,25 @@ def compute_covered_genomes(alt_read_count, depth):
return res


def snv_am(sample, somatic_mutations_file, omega_file, chosen_impacts=['missense', 'truncating']):
def recode_csqn_type(df, gene, new_csqn_type):

df['Consequence_broader'] = df.apply(lambda s: \
new_csqn_type if s['GENE'] == gene \
else s['Consequence_broader'],
axis=1)
return df
def snv_am(sample, somatic_mutations_file, omega_file, chosen_impacts=['missense', 'truncating'], recoded_genes=[]):

# parse mutations
somatic_mutations = pd.read_csv(somatic_mutations_file, sep='\t', low_memory=False)
consequences_of_interest = [ broadimpact_grouping_dict[csqn] for csqn in chosen_impacts]
consequences_of_interest = set([item for sublist in consequences_of_interest for item in sublist])

# recode csqn type
for gene in recoded_genes:
somatic_mutations = recode_csqn_type(somatic_mutations, gene, 'missense')
print(gene, somatic_mutations[somatic_mutations['GENE'] == gene]['Consequence_broader'].unique())

mutations = somatic_mutations[(somatic_mutations['TYPE'] == 'SNV')
& (somatic_mutations['Consequence_broader'].isin(consequences_of_interest))
].reset_index(drop = True)
Expand All @@ -103,47 +116,26 @@ def snv_am(sample, somatic_mutations_file, omega_file, chosen_impacts=['missense

for csqn in chosen_impacts:

# total
df_dict = {}

df_all = mutations[(mutations['GENE'] == gene) & (mutations['Consequence_broader'].isin(broadimpact_grouping_dict[csqn]))]
df_all = mutations[(mutations['GENE'] == gene) & (mutations['Consequence_broader'] == csqn)]
df_all = df_all.sort_values(by=['VAF_AM'], ascending=False)
df_all.reset_index(drop=True, inplace=True)
df_dict['TOTAL'] = df_all

# upper

ind = int(excess_dict[gene][csqn][2] * df_all.shape[0])
try:
assert(ind >= 1)
df_upper = df_all.loc[:ind - 1].copy()
except:
df_upper = pd.DataFrame(columns=df_all.columns)

# mean

ind = int(excess_dict[gene][csqn][1] * df_all.shape[0])
try:
assert(ind >= 1)
df_mean = df_all.loc[:ind - 1].copy()
except:
df_mean = pd.DataFrame(columns=df_all.columns)

# lower

ind = int(excess_dict[gene][csqn][0] * df_all.shape[0])
try:
assert(ind >= 1)
df_lower = df_all.loc[:ind - 1].copy()
except:
df_lower = pd.DataFrame(columns=df_all.columns)
for i, bound in enumerate(['LOWER', 'MEAN', 'UPPER']):
ind = int(excess_dict[gene][csqn][i] * df_all.shape[0])
try:
assert(ind >= 1)
dg = df_all.loc[:ind - 1].copy()
except:
dg = pd.DataFrame(columns=df_all.columns)
df_dict[bound] = dg

res_dict['gene'] = res_dict['gene'] + [gene]
res_dict['impact'] = res_dict['impact'] + [csqn]
res_dict['chr'] = res_dict['chr'] + [gene_chr_dict[gene]]

# summarize in a dictionary

df_dict = {'TOTAL': df_all, 'UPPER': df_upper, 'MEAN': df_mean, 'LOWER': df_lower}

for suffix in ['TOTAL', 'UPPER', 'MEAN', 'LOWER']:

if df_dict[suffix].shape[0] == 0:
Expand Down Expand Up @@ -209,18 +201,23 @@ def indel_am(sample, somatic_mutations_file):
return df[['SAMPLE'] + all_columnss]


def snv_nd(sample, somatic_mutations_file, omega_file, chosen_impacts=['missense', 'truncating']):
def snv_nd(sample, somatic_mutations_file, omega_file, chosen_impacts=['missense', 'truncating'], recoded_genes=[]):

# parse mutations
somatic_mutations = pd.read_csv(somatic_mutations_file, sep='\t', low_memory=False)
consequences_of_interest = [ broadimpact_grouping_dict[csqn] for csqn in chosen_impacts]
consequences_of_interest = set([item for sublist in consequences_of_interest for item in sublist])

# recode csqn type
for gene in recoded_genes:
somatic_mutations = recode_csqn_type(somatic_mutations, gene, 'missense')
print(gene, somatic_mutations[somatic_mutations['GENE'] == gene]['Consequence_broader'].unique())

mutations = somatic_mutations[
(somatic_mutations['TYPE'] == 'SNV')
& (somatic_mutations['Consequence_broader'].isin(consequences_of_interest))
].reset_index(drop = True)

gene_chr_dict = { x : y for x, y in somatic_mutations[['GENE', 'CHROM']].drop_duplicates().values }

mutations = mutations[mutations['ALT_DEPTH_ND'] > 0]
Expand All @@ -244,47 +241,26 @@ def snv_nd(sample, somatic_mutations_file, omega_file, chosen_impacts=['missense

for csqn in chosen_impacts:

# total
df_dict = {}

df_all = mutations[(mutations['GENE'] == gene) & (mutations['Consequence_broader'].isin(broadimpact_grouping_dict[csqn]))]
df_all = df_all.sort_values(by=['VAF_ND'], ascending=False)
df_all.reset_index(drop=True, inplace=True)
df_dict['TOTAL'] = df_all

# upper

ind = int(excess_dict[gene][csqn][2] * df_all.shape[0])
try:
assert(ind >= 1)
df_upper = df_all.loc[:ind - 1].copy()
except:
df_upper = pd.DataFrame(columns=df_all.columns)

# mean

ind = int(excess_dict[gene][csqn][1] * df_all.shape[0])
try:
assert(ind >= 1)
df_mean = df_all.loc[:ind - 1].copy()
except:
df_mean = pd.DataFrame(columns=df_all.columns)

# lower

ind = int(excess_dict[gene][csqn][0] * df_all.shape[0])
try:
assert(ind >= 1)
df_lower = df_all.loc[:ind - 1].copy()
except:
df_lower = pd.DataFrame(columns=df_all.columns)
for i, bound in enumerate(['LOWER', 'MEAN', 'UPPER']):
ind = int(excess_dict[gene][csqn][i] * df_all.shape[0])
try:
assert(ind >= 1)
dg = df_all.loc[:ind - 1].copy()
except:
dg = pd.DataFrame(columns=df_all.columns)
df_dict[bound] = dg

res_dict['gene'] = res_dict['gene'] + [gene]
res_dict['impact'] = res_dict['impact'] + [csqn]
res_dict['chr'] = res_dict['chr'] + [gene_chr_dict[gene]]

# summarize in a dictionary

df_dict = {'TOTAL': df_all, 'UPPER': df_upper, 'MEAN': df_mean, 'LOWER': df_lower}

for suffix in ['TOTAL', 'UPPER', 'MEAN', 'LOWER']:

if df_dict[suffix].shape[0] == 0:
Expand All @@ -308,7 +284,8 @@ def snv_nd(sample, somatic_mutations_file, omega_file, chosen_impacts=['missense
@click.option('--sample')
@click.option('--somatic-mutations-file')
@click.option('--omega-file')
def run_all(sample, somatic_mutations_file, omega_file):
@click.option('--recoded-genes', default='', type=str)
def run_all(sample, somatic_mutations_file, omega_file, recoded_genes):
"""
intent:
script that combines the 3 covered epithelium functions,
Expand All @@ -317,10 +294,12 @@ def run_all(sample, somatic_mutations_file, omega_file):
nextflow module

"""
# recodification of csqn type is only applied to SNVs
recoded_genes = recoded_genes.split(',')

df_snv_am = snv_am(sample, somatic_mutations_file, omega_file)
df_snv_am = snv_am(sample, somatic_mutations_file, omega_file, recoded_genes= recoded_genes)
df_indel_am = indel_am(sample, somatic_mutations_file)
df_snv_nd = snv_nd(sample, somatic_mutations_file, omega_file)
df_snv_nd = snv_nd(sample, somatic_mutations_file, omega_file, recoded_genes= recoded_genes)

all_dataframes = [df_snv_am, df_indel_am, df_snv_nd]
df_merged = reduce(lambda left, right: pd.merge(left, right, on=['chr', 'gene', 'impact', 'SAMPLE'], how='outer'), all_dataframes)
Expand Down
6 changes: 3 additions & 3 deletions bin/mutgenomes_summary_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def gather(metadata_file):

for label in ['GENOMES_SNV_AM', 'GENOMES_SNV_ND']:
for k in ['LOWER', 'MEAN', 'UPPER', 'TOTAL']:

df_all_annotated[f'CELLS_DOUBLE_HIT_{'_'.join(label.split('_')[1:])}_{k}'] = df_all_annotated[f'{label}_{k}'].values

df_all_annotated[f'CELLS_SINGLE_HIT_{'_'.join(label.split('_')[1:])}_{k}'] = df_all_annotated.apply(
Expand Down Expand Up @@ -80,13 +80,13 @@ def gather(metadata_file):

for label in ['GENOMES_SNV_AM', 'GENOMES_SNV_ND']:
for k in ['LOWER', 'MEAN', 'UPPER', 'TOTAL']:

df_gene_annotated[f'CELLS_DOUBLE_HIT_{'_'.join(label.split('_')[1:])}_{k}'] = df_gene_annotated[f'{label}_{k}'].values

df_gene_annotated[f'CELLS_SINGLE_HIT_{'_'.join(label.split('_')[1:])}_{k}'] = df_gene_annotated.apply(
lambda r: r[f'{label}_{k}'] if ((r['chr'] == 'chrX') and (r['SEX'] == 'M')) else 2 * r[f'{label}_{k}'],
axis=1)

# indels

label = 'GENOMES_INDEL_AM'
Expand Down
5 changes: 3 additions & 2 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -486,17 +486,18 @@ process {
if (params.mutated_cells_vaf) {
if (params.all_duplex_counts){
withName: 'SUBSETMUTEPIVAFAM' {
ext.filters = { ['"canonical_Protein_affecting": "protein_affecting"'].join(',\t').trim() }
ext.output_fmt = { ['"header": true',
'"columns": ["CHROM", "canonical_SYMBOL", "canonical_Consequence_broader", "VAF_AM", "ALT_DEPTH_AM", "DEPTH_AM", "canonical_Protein_affecting", "TYPE", "VAF_ND", "ALT_DEPTH_ND", "DEPTH_ND", "VAF", "ALT_DEPTH", "DEPTH", "MUT_ID"]',
'"colnames": ["CHROM", "GENE", "Consequence_broader", "VAF_AM", "ALT_DEPTH_AM", "DEPTH_AM", "Protein_affecting", "TYPE", "VAF_ND", "ALT_DEPTH_ND", "DEPTH_ND", "VAF", "ALT_DEPTH", "DEPTH", "MUT_ID"]'
].join(',\t').trim()
}

publishDir = [
enabled : false
]
}
withName: 'MUTATEDGENOMESFROMVAFAM' {
ext.recode_list = "${params.mutepi_genes_to_recode}"// "TERTpromoter"
}
}

}
Expand Down
4 changes: 3 additions & 1 deletion modules/local/mutated_genomes_from_vaf/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,13 @@ process MUTATED_GENOMES_FROM_VAF {

script:
def prefix = task.ext.prefix ?: "${meta.id}"
def recode = task.ext.recode_list ? "--recoded-genes ${task.ext.recode_list}" : ""
"""
mutgenomes_driver_priority.py \\
--sample ${prefix} \\
--somatic-mutations-file ${mutations} \\
--omega-file ${omegas} ;
--omega-file ${omegas} \\
${recode} ;

cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand Down
1 change: 1 addition & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ params {
signatures = false
mutationrate = false
mutated_cells_vaf = false
mutepi_genes_to_recode = null
expected_mutated_cells = false
dnds = false

Expand Down
7 changes: 6 additions & 1 deletion nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -319,9 +319,14 @@
},
"mutated_cells_vaf": {
"type": "boolean",
"description": "Do you want to run mutated_cells_reads using the VAFs?",
"description": "Do you want to compute the proportion of mutated cells using the VAFs?",
"fa_icon": "fas fa-book"
},
"mutepi_genes_to_recode": {
"type": "string",
"description": "Do you want to consider all mutations in a gene for the mutated epithelium from VAF?",
"fa_icon": "fas fa-book"
},
"expected_mutated_cells": {
"type": "boolean",
"description": "Do you want to run expected_mutated_cells?",
Expand Down