Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 9 additions & 40 deletions bin/filter_cohort.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,59 +7,26 @@

maf_df_file = sys.argv[1]
samp_name = sys.argv[2]
repetitive_variant_treshold = int(sys.argv[3])
repetitive_variant_threshold = int(sys.argv[3])
somatic_vaf_boundary = float(sys.argv[4])
n_rich_cohort_proportion = float(sys.argv[5])

maf_df = pd.read_csv(maf_df_file, compression='gzip', header = 0, sep='\t', na_values = custom_na_values)

sequenced_genes = list(pd.unique(maf_df["SYMBOL"]))



# def correct_vaf(maf):

# """
# Computes VAF_CORRECTED for the subset of variants satisfying 0 < VAF < 0.2
# Returns the input MAF with two new columns:
# VAF_CORRECTED with new corrected VAF else it copies the VAF
# IS_VAF_CORRECTED with a boolean that indicates whether the VAF has been corrected
# """

# # TODO revise the 0.2 VAF threshold to see if it can be kept across datasets
# df = maf[(0 < maf['VAF']) & (maf['VAF'] < 0.2)][['SAMPLE_ID', 'MUT_ID', 'VAF', 'DEPTH']]
# df = df.sort_values('DEPTH')
# N = df.shape[0]
# df['VAF_ROLLING_MEAN'] = df['VAF'].rolling(N // 25).mean()
# df['VAF_ROLLING_STD'] = df['VAF'].rolling(N // 25).std()
# stable_mean = df['VAF_ROLLING_MEAN'].values[-1]
# stable_std = df['VAF_ROLLING_STD'].values[-1]
# df['VAF_CORRECTED'] = df.apply(lambda r: (r['VAF'] - r['VAF_ROLLING_MEAN']) * (stable_std / r['VAF_ROLLING_STD']) + stable_mean, axis=1)
# df = maf.merge(df[['VAF_CORRECTED', 'MUT_ID', 'SAMPLE_ID']],
# on=['MUT_ID', 'SAMPLE_ID'],
# how='outer')
# df['IS_VAF_CORRECTED'] = ~df['VAF_CORRECTED'].isnull()
# df.loc[~df['IS_VAF_CORRECTED'], 'VAF_CORRECTED'] = df[~df['IS_VAF_CORRECTED']]['VAF'].values
# return df



# #######
# ### Add a corrected VAF column
# #######
# maf_df = correct_vaf(maf_df)
# print("VAF corrected")




#######
### Filter repetitive variants
#######

# TODO revise these numbers, the repetitive_variant_treshold is the boundary at which we start considering a mutation as "repetitive"
# TODO revise these numbers, the repetitive_variant_threshold is the boundary at which we start considering a mutation as "repetitive"
max_samples = len(pd.unique(maf_df["SAMPLE_ID"]))

n_samples = list(range(repetitive_variant_treshold, max_samples + 1))
n_samples = list(range(repetitive_variant_threshold, max_samples + 1))
if len(n_samples) == 0:
print("Not enough samples to identify potential repetitive variants!")

Expand All @@ -73,7 +40,7 @@
maf_df_f_somatic["count"] = 1
maf_df_f_somatic_pivot = maf_df_f_somatic.groupby("MUT_ID")["count"].sum().reset_index()

repetitive_variants = maf_df_f_somatic_pivot[maf_df_f_somatic_pivot["count"] >= repetitive_variant_treshold]["MUT_ID"]
repetitive_variants = maf_df_f_somatic_pivot[maf_df_f_somatic_pivot["count"] >= repetitive_variant_threshold]["MUT_ID"]

maf_df["repetitive_variant"] = maf_df["MUT_ID"].isin(repetitive_variants)

Expand All @@ -95,6 +62,8 @@
print("Not enough samples to identify cohort_n_rich mutations!")

else:
number_of_samples = max(2, (max_samples * n_rich_cohort_proportion) // 1)
print(f"flagging mutations that are n_rich in at least: {number_of_samples} samples as cohort_n_rich")

# work with already filtered df + somatic only to explore potential artifacts
# take only variant and sample info from the df
Expand All @@ -105,7 +74,7 @@
].agg({'SAMPLE_ID' : len, 'VAF_Ns' : min})
n_rich_vars_df = n_rich_vars_df.rename({'SAMPLE_ID' : 'frequency', 'VAF_Ns' : 'VAF_Ns_threshold'}, axis = 'columns')

n_rich_vars = list(n_rich_vars_df[n_rich_vars_df['frequency'] >= 2].index)
n_rich_vars = list(n_rich_vars_df[n_rich_vars_df['frequency'] >= number_of_samples].index)

maf_df["cohort_n_rich"] = maf_df["MUT_ID"].isin(n_rich_vars)

Expand All @@ -132,7 +101,7 @@
maf_df['frequency'] = maf_df['frequency'].fillna(0)
maf_df['VAF_Ns_threshold'] = maf_df['VAF_Ns_threshold'].fillna(1.1)

maf_df["cohort_n_rich_threshold"] = maf_df["VAF_Ns"] > maf_df['VAF_Ns_threshold']
maf_df["cohort_n_rich_threshold"] = maf_df["VAF_Ns"] >= maf_df['VAF_Ns_threshold']

maf_df["FILTER"] = maf_df[["FILTER","cohort_n_rich_threshold"]].apply(lambda x: add_filter(x["FILTER"], x["cohort_n_rich_threshold"], "cohort_n_rich_threshold"),
axis = 1
Expand Down
4 changes: 2 additions & 2 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,7 @@ process {
['"VAF_AM" : "le ', "${params.germline_threshold}", '"'].join('').trim(),
['"vd_VAF" : "le ', "${params.germline_threshold}", '"'].join('').trim(),
['"DEPTH" : "ge ', "${params.mutation_depth_threshold}", '"'].join('').trim(),
'"FILTER" : ["notcontains NM20", "notcontains n_rich", "notcontains cohort_n_rich_threshold", "notcontains no_pileup_support", "notcontains low_mappability" ]',
'"FILTER" : ["notcontains NM20", "notcontains p8", "notcontains n_rich", "notcontains cohort_n_rich_threshold", "notcontains cohort_n_rich", "notcontains no_pileup_support", "notcontains low_mappability", "notcontains not_covered" ]',
'"VAF_distorted_expanded_sq" : false',
].join(',\t').trim()
}
Expand All @@ -336,7 +336,7 @@ process {
ext.filters = { [['"VAF" : "le ', "${params.germline_threshold}", '"'].join('').trim(),
['"vd_VAF" : "le ', "${params.germline_threshold}", '"'].join('').trim(),
['"DEPTH" : "ge ', "${params.mutation_depth_threshold}", '"'].join('').trim(),
'"FILTER" : ["notcontains NM20", "notcontains n_rich", "notcontains cohort_n_rich_threshold", "notcontains no_pileup_support", "notcontains low_mappability" ]',
'"FILTER" : ["notcontains NM20", "notcontains p8", "notcontains n_rich", "notcontains cohort_n_rich_threshold", "notcontains cohort_n_rich", "notcontains no_pileup_support", "notcontains low_mappability", "notcontains not_covered" ]',
'"VAF_distorted_expanded_sq" : false',
].join(',\t').trim()
}
Expand Down
6 changes: 2 additions & 4 deletions modules/local/filtermaf/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,12 @@ process FILTER_BATCH {
task.ext.when == null || task.ext.when

script:
def args = task.ext.args ?: ""
def prefix = task.ext.prefix ?: "${meta.id}"
def repetitive_variant = task.ext.repetitive_variant ?: "${params.repetitive_variant_thres}"
def germline_threshold = task.ext.germline_threshold ?: "${params.germline_threshold}"
def proportion_samples_nrich = task.ext.prop_samples_nrich ?: "${params.prop_samples_nrich}"
"""
filter_cohort.py ${maf} ${prefix} ${repetitive_variant} ${germline_threshold}
filter_cohort.py ${maf} ${prefix} ${repetitive_variant} ${germline_threshold} ${proportion_samples_nrich}

cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand All @@ -34,8 +34,6 @@ process FILTER_BATCH {
"""

stub:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
"""
touch all_samples.cohort.filtered.tsv.gz

Expand Down
4 changes: 4 additions & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ params {
custom_groups_file = null
custom_groups_separator = 'tab'

customize_annotation = false
custom_annotation_tsv = ''

use_custom_bedfile = false
custom_bedfile = null
use_custom_minimum_depth = 0
Expand Down Expand Up @@ -76,6 +79,7 @@ params {
mutation_depth_threshold = 40

repetitive_variant_thres = 5
prop_samples_nrich = 0.1

cosmic_ref_signatures = "COSMIC_v3.4_SBS_GRCh38.txt"
wgs_trinuc_counts = "assets/trinucleotide_counts/trinuc_counts.homo_sapiens.tsv"
Expand Down
38 changes: 38 additions & 0 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,30 @@
}
}
},
"custom_annotation_configs": {
"title": "Custom annotation configurations",
"type": "object",
"fa_icon": "fas fa-dna",
"description": "BED-like file to customize the annotation of variants falling in certain regions.",
"properties": {
"customize_annotation": {
"type": "boolean",
"description": "Do you want to use a custom BED file?",
"fa_icon": "fas fa-book",
"help_text": "Do you want to use a custom BED file?"
},
"custom_annotation_tsv": {
"type": "string",
"format": "file-path",
"exists": true,
"mimetype": "text/plain",
"default": null,
"description": "TSV file with the following columns: chromosome start end gene_name impactful_mutations neutral_impact new_impact",
"help_text": "TSV file with the following columns: chromosome start end gene_name impactful_mutations neutral_impact new_impact",
"fa_icon": "far fa-file-code"
}
}
},
"hotspot_bedfile_configs": {
"title": "Definition of hotspot regions configuration",
"type": "object",
Expand Down Expand Up @@ -425,6 +449,13 @@
"default": 5,
"fa_icon": "fas fa-book",
"help_text": ""
},
"prop_samples_nrich": {
"type": "number",
"description": "Any variant that is n_rich in this proportion of samples is flagged as cohort_n_rich variant.",
"default": 0.1,
"fa_icon": "fas fa-book",
"help_text": ""
}
}
},
Expand Down Expand Up @@ -545,6 +576,13 @@
"help_text": "",
"default": "--no_stats --cache --offline --symbol --protein --canonical --af_gnomadg --af_gnomade"
},
"vep_params_panel": {
"type": "string",
"description": "Additional parameters for running Ensembl VEP for the panel annotation.",
"fa_icon": "fas fa-book",
"help_text": "",
"default": "--no_stats --cache --offline --symbol --protein --canonical"
},
"download_cache": {
"type": "boolean",
"description": "Do you want to download the cache? Activate it only if necessary.",
Expand Down