diff --git a/bin/add_hotspots.py b/bin/add_hotspots.py index 2554d2ce..3c1bad83 100755 --- a/bin/add_hotspots.py +++ b/bin/add_hotspots.py @@ -5,31 +5,36 @@ import json import pandas as pd import numpy as np -from utils import to_int_if_possible -from read_utils import custom_na_values + panel_file = sys.argv[1] bedfile = sys.argv[2] # BED file (optional, can be None) expand = int(sys.argv[3]) # Expansion factor for defining regions -use_bed = bool(int(sys.argv[4])) # Whether to use the BED file for exon definitions +autoexons = bool(int(sys.argv[4])) # Whether to use the BED file for exon definitions # Read input data panel_data = pd.read_table(panel_file) -if use_bed: +exons_bed = pd.DataFrame() +exons_bed_provided = pd.DataFrame() + +if bedfile != 'None': + print("Using the BED file provided") # Use BED file for exon definition - exons_bed = pd.read_table(bedfile, header=None, sep="\t") + exons_bed_provided = pd.read_table(bedfile, header=None, sep="\t") # Check if a name column is provided - if exons_bed.shape[1] > 3: - exons_bed = exons_bed.iloc[:,:4] - exons_bed.columns = ["CHROM", "START", "END", "NAME"] + if exons_bed_provided.shape[1] > 3: + exons_bed_provided = exons_bed_provided.iloc[:,:4] + exons_bed_provided.columns = ["CHROM", "START", "END", "NAME"] else: - exons_bed = exons_bed.iloc[:,:3] - exons_bed.columns = ["CHROM", "START", "END"] - exons_bed["NAME"] = None - print(exons_bed) - + exons_bed_provided = exons_bed_provided.iloc[:,:3] + exons_bed_provided.columns = ["CHROM", "START", "END"] + exons_bed_provided["NAME"] = None else: + print("NOT using any BED file") + +if autoexons: + print("Automated generation of exons from the panel") # Use breakpoints to define exons exons_bed = [] for chrom, gene_group in panel_data.groupby(["CHROM", "GENE"]): @@ -47,7 +52,15 @@ # Append to exons exons_bed.extend([{"CHROM": chrom[0], "START": s, "END": e, "NAME": None, "GENE": chrom[1]} for s, e in zip(starts, ends)]) exons_bed = pd.DataFrame(exons_bed) +else: + print("NO automated generation of exons from the panel") + + +# Concatenate the previous two BED dataframes +exons_bed = pd.concat((exons_bed, exons_bed_provided)) +if exons_bed.shape[0] == 0: + raise ValueError("No regions to extend") ## Process panel data to create new genes per exon new_data = pd.DataFrame() @@ -81,7 +94,7 @@ exon_number = exon_counters[gene] region_name = f"{gene}--exon_{exon_number}" - if use_bed: + if ("--exon_" not in region_name) and (expand > 0): # Expand within limits of the same gene only if BED file is used upd_start = max(ind_start - expand * 3, 0) while upd_start < len(chr_data) and chr_data.iloc[upd_start]["GENE"] != gene: diff --git a/bin/panels_dna2domain.py b/bin/panels_dna2domain.py new file mode 100755 index 00000000..3dccb699 --- /dev/null +++ b/bin/panels_dna2domain.py @@ -0,0 +1,92 @@ +#!/usr/local/bin/python + + +# Third-party imports +import click +import pandas as pd + + +# ---------------------- DNA and Protein Mapping ---------------------- # +def generate_domains2dna_mapping(consensus_panel_rich_file, domains_file, output_file): + """ + Main function to process DNA to protein mapping. + """ + + all_protein_positions = pd.read_table(consensus_panel_rich_file)[['CHROM', 'POS', 'STRAND', 'GENE', 'Feature', 'Protein_position']] + all_protein_positions = all_protein_positions[all_protein_positions["Protein_position"] != "-"].drop_duplicates() + all_protein_positions["Protein_position"] = all_protein_positions["Protein_position"].astype(float) + all_protein_positions_summary = all_protein_positions[~(all_protein_positions["Protein_position"].isna()) + ][['GENE', 'Protein_position', "CHROM", "POS", "STRAND"]].reset_index(drop = True) + + panel_all_genes = all_protein_positions["Feature"].unique() + + #pfam_domains = pd.read_table("/data/bbg/nobackup/scratch/oncodrive3d/annotations_mane_240506/pfam.tsv") + pfam_domains = pd.read_table(domains_file) + pfam_domains["NAME"] = pfam_domains["Gene"] + '--' + pfam_domains["Description"].str.replace("-", "_") + '-' + pfam_domains["Ens_Transcr_ID"] + + pfam_domains_summary = pfam_domains[['Gene', 'Ens_Transcr_ID', 'Begin', 'End','NAME']] + pfam_domains_summary = pfam_domains_summary[pfam_domains_summary["Ens_Transcr_ID"].isin(panel_all_genes)].reset_index(drop = True) + + + + all_protein_positions_small_summary = all_protein_positions_summary.sort_values(by = ["CHROM", "POS"], + ascending = True).drop_duplicates( + subset = ['GENE', 'Protein_position'], + keep = 'first') + all_protein_positions_small_summary.columns = ['GENE', 'PROT_POS', 'CHROM', 'START_fw', "STRAND"] + all_protein_positions_big_summary = all_protein_positions_summary.sort_values(by = ["CHROM", "POS"], + ascending = True).drop_duplicates( + subset = ['GENE', 'Protein_position'], + keep = 'last') + all_protein_positions_big_summary.columns = ['GENE', 'PROT_POS', 'CHROM', 'END_fw', "STRAND"] + + + + pfam_domains_genomic = pfam_domains_summary.merge(all_protein_positions_small_summary, + left_on = ['Gene', 'Begin'], + right_on = ["GENE", 'PROT_POS'], + how = 'left').merge(all_protein_positions_big_summary, + left_on = ['GENE', 'End', "CHROM", "STRAND"], + right_on = ["GENE", 'PROT_POS', "CHROM", "STRAND"], + how = 'left') + pfam_domains_genomic_summary = pfam_domains_genomic[['CHROM', 'START_fw', 'END_fw', 'NAME', "STRAND"]].copy() + pfam_domains_genomic_summary = pfam_domains_genomic_summary[ + (~(pfam_domains_genomic_summary["CHROM"].isna())) & + (~(pfam_domains_genomic_summary["START_fw"].isna())) & + (~(pfam_domains_genomic_summary["END_fw"].isna())) + ].reset_index(drop = True) + pfam_domains_genomic_summary["START_fw"] = pfam_domains_genomic_summary["START_fw"].astype(int) + pfam_domains_genomic_summary["END_fw"] = pfam_domains_genomic_summary["END_fw"].astype(int) + + + + pfam_domains_genomic_summary[["START","END"]] = pfam_domains_genomic_summary[["START_fw", "END_fw", "STRAND"]].apply( + lambda x : (x["END_fw"], x["START_fw"]) if x["STRAND"] == -1 else (x["START_fw"], x["END_fw"]), + axis = 1 + ).to_list() + pfam_domains_genomic_summary = pfam_domains_genomic_summary[['CHROM', 'START', 'END', 'NAME']] + pfam_domains_genomic_summary["START"] = pfam_domains_genomic_summary["START"].astype(int) + pfam_domains_genomic_summary["END"] = pfam_domains_genomic_summary["END"].astype(int) + + pfam_domains_genomic_summary.to_csv(output_file, header = False, index = False, sep = '\t') + + + +# ---------------------- CLI Interface ---------------------- # + +@click.command() +@click.option('--consensus-panel-rich', type=click.Path(exists=True), help='Input consensus panel file') +@click.option('--domains', type=click.Path(exists=True), help='Input consensus panel file') +@click.option('--output', type=click.Path(), help='Output file') + +def main(consensus_panel_rich, domains, output): + click.echo("Mapping the domains to the genomic sequence...") + generate_domains2dna_mapping(consensus_panel_rich, domains, output) + + + +if __name__ == '__main__': + main() + + +# ---------------------- End of the script ---------------------- # diff --git a/conf/bladder.config b/conf/bladder.config index 8bf751f1..dea5426d 100644 --- a/conf/bladder.config +++ b/conf/bladder.config @@ -17,7 +17,7 @@ params { pileup_all_duplex = false store_depths = false - omega_hotspots = false + omega_withingene = false omega_hotspots_bedfile = "/data/bbg/datasets/bladder_ts/Methods/LBL_v2.20220824.only_hotspots.bed4.bed" hotspot_expansion = 30 diff --git a/conf/general_files_BBG.config b/conf/general_files_BBG.config index cf41c6e1..9d2ab0cb 100644 --- a/conf/general_files_BBG.config +++ b/conf/general_files_BBG.config @@ -8,9 +8,6 @@ params { wgs_trinuc_counts = "assets/trinucleotide_counts/trinuc_counts.homo_sapiens.tsv" - low_complex_file = "/workspace/datasets/genomes/GRCh38/masking_bedfiles/hg38_lowcomplexity_repetitive_regions.mutcoords.bed" - low_mappability_file = "/workspace/datasets/genomes/GRCh38/masking_bedfiles/blacklist/GRCh38.encode.blacklist.bed" - cosmic_ref_signatures = "/workspace/datasets/transfer/ferriol_deepcsa/COSMIC_v3.4_SBS_GRCh38.txt" wgs_trinuc_counts = "/workspace/datasets/transfer/ferriol_deepcsa/trinucleotide_counts/trinuc_counts.homo_sapiens.tsv" cadd_scores = "/workspace/datasets/CADD/v1.7/hg38/whole_genome_SNVs.tsv.gz" diff --git a/conf/general_files_IRB.config b/conf/general_files_IRB.config index 24a3ca06..09d9cace 100644 --- a/conf/general_files_IRB.config +++ b/conf/general_files_IRB.config @@ -8,9 +8,6 @@ params { wgs_trinuc_counts = "assets/trinucleotide_counts/trinuc_counts.homo_sapiens.tsv" - low_complex_file = "/data/bbg/datasets/genomes/GRCh38/masking_bedfiles/hg38_lowcomplexity_repetitive_regions.mutcoords.bed" - low_mappability_file = "/data/bbg/datasets/genomes/GRCh38/masking_bedfiles/blacklist/GRCh38.encode.blacklist.bed" - cosmic_ref_signatures = "/data/bbg/datasets/transfer/ferriol_deepcsa/COSMIC_v3.4_SBS_GRCh38.txt" wgs_trinuc_counts = "/data/bbg/datasets/transfer/ferriol_deepcsa/trinucleotide_counts/trinuc_counts.homo_sapiens.tsv" cadd_scores = "/data/bbg/datasets/CADD/v1.7/hg38/whole_genome_SNVs.tsv.gz" diff --git a/conf/mice.config b/conf/mice.config index 31768018..72013348 100644 --- a/conf/mice.config +++ b/conf/mice.config @@ -31,7 +31,7 @@ params { datasets3d = "/data/bbg/nobackup/scratch/oncodrive3d/datasets_mouse" // annotations3d = "/workspace/nobackup/scratch/oncodrive3d/annotations_240506" - omega_hotspots = false + omega_withingene = false omega_hotspots_bedfile = "/data/bbg/datasets/transfer/ferriol_deepcsa/mouse_skin_panel.hotspots.bed4.bed" hotspot_expansion = 15 diff --git a/conf/modules.config b/conf/modules.config index 0c72a398..d1a78bcd 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -356,13 +356,13 @@ process { withName: "GROUPGENES" { ext.custom = params.custom_groups - ext.hotspots = params.omega_hotspots + ext.hotspots = params.omega_withingene ext.separator = params.custom_groups_separator } withName: 'EXPANDREGIONS' { ext.expansion = params.omega_hotspots_bedfile ? params.hotspot_expansion : 0 - ext.use_hotspot_bed = params.omega_hotspots_bedfile as Boolean + ext.using_bedfile = params.omega_autodomains | (params.omega_hotspots_bedfile as Boolean) } if (params.mutated_epithelium){ diff --git a/modules/local/expand_regions/main.nf b/modules/local/expand_regions/main.nf index 0a3416ca..284e33e3 100644 --- a/modules/local/expand_regions/main.nf +++ b/modules/local/expand_regions/main.nf @@ -3,16 +3,11 @@ process EXPAND_REGIONS { tag "$meta.id" label 'process_high' - // // conda "YOUR-TOOL-HERE" - // container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - // 'https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE': - // 'biocontainers/YOUR-TOOL-HERE' }" container 'docker.io/ferriolcalvet/bgreference' input: - tuple val(meta) , path(panel) + tuple val(meta), path(panel) path (bedfile) - // tuple val(meta2) , path(bedfile) output: tuple val(meta), path("*with_hotspots.tsv") , emit: panel_increased @@ -23,10 +18,15 @@ process EXPAND_REGIONS { task.ext.when == null || task.ext.when script: - def expansion = task.ext.expansion ?: 30 - def configuration = task.ext.use_hotspot_bed ? "${bedfile} ${expansion} 1" : 'None 0 0' + def expansion = task.ext.expansion ?: 0 + def bedfile_to_use = task.ext.using_bedfile ? "custom_regions_bedfiles.bed" : "None" + def autoexons = params.omega_autoexons ? "1" : "0" """ - add_hotspots.py ${panel} ${configuration} + cat ${bedfile} >> custom_regions_bedfiles.bed + add_hotspots.py ${panel} \\ + ${bedfile_to_use} \\ + ${expansion} \\ + ${autoexons} cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/process_annotation/domain/main.nf b/modules/local/process_annotation/domain/main.nf new file mode 100644 index 00000000..b31eeada --- /dev/null +++ b/modules/local/process_annotation/domain/main.nf @@ -0,0 +1,44 @@ +process DOMAIN_ANNOTATION { + + tag "${meta.id}" + + label 'cpu_low' + label 'time_low' + label 'process_high_memory' + + container "docker.io/ferriolcalvet/bgreference" + + input: + tuple val(meta) , path(panel_annotated) + path (domains_file) + + output: + path("*.domains.bed4.bed") , emit: domains_bed + path "versions.yml" , topic: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + panels_dna2domain.py \\ + --consensus-panel-rich ${panel_annotated} \\ + --domains ${domains_file} \\ + --output seq_panel.domains.bed4.bed; + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ + + stub: + """ + touch consensus.domains.bed4.bed; + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ +} diff --git a/modules/local/process_annotation/domain/meta.yml b/modules/local/process_annotation/domain/meta.yml new file mode 100644 index 00000000..e69de29b diff --git a/nextflow.config b/nextflow.config index 28a70dbe..28793615 100644 --- a/nextflow.config +++ b/nextflow.config @@ -31,9 +31,11 @@ params { hotspots_annotation = false hotspots_definition_file = '' - omega_hotspots = false + omega_withingene = false omega_hotspots_bedfile = null - hotspot_expansion = 30 + hotspot_expansion = 0 + omega_autodomains = false + omega_autoexons = false oncodrivefml = false oncodriveclustl = false @@ -45,7 +47,7 @@ params { omega = false omega_globalloc = false omega_mutabilities = false - site_comparison_grouping = 'aminoacid_change' + site_comparison_grouping = 'all' omega_vaf_distorsioned = false omega_plot = false @@ -115,12 +117,6 @@ params { vep_params = "--no_stats --cache --offline --symbol --protein --canonical --af_gnomadg --af_gnomade" vep_params_panel = "--no_stats --cache --offline --symbol --protein --canonical" - - // BED files for additional regional annotations - low_complex_file = "hg38_lowcomplexity_repetitive_regions.mutcoords.bed" - low_mappability_file = "GRCh38.encode.blacklist.bed" - // mutagenesis regions panel - // neutral/intergenic regions panel } // Global default params, used in configs diff --git a/nextflow_schema.json b/nextflow_schema.json index 71f02255..218db4cd 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -31,30 +31,6 @@ } } }, - "bed_files_options": { - "title": "BED file options", - "type": "object", - "description": "BED files used for filtering mutations or genomic regions.", - "default": "", - "properties": { - "low_mappability_file": { - "type": "string", - "format": "file-path", - "description": "BED file with the low mappability regions.", - "fa_icon": "fas fa-book", - "help_text": "BED file that contains the regions to be discarded in the analysis.", - "default": "GRCh38.encode.blacklist.bed" - }, - "low_complex_file": { - "type": "string", - "format": "file-path", - "description": "BED file with the low complex regions.", - "fa_icon": "fas fa-book", - "help_text": "BED file that contains the regions to be discarded in the analysis.", - "default": "hg38_lowcomplexity_repetitive_regions.mutcoords.bed" - } - } - }, "reference_genome_options": { "title": "Reference genome options", "type": "object", @@ -212,13 +188,13 @@ } } }, - "hotspot_bedfile_configs": { + "omega_bedfile_configs": { "title": "Definition of hotspot regions configuration", "type": "object", "fa_icon": "fas fa-dna", "description": "BED file dependent on the hotspots present in the sequencing panel of interest.", "properties": { - "omega_hotspots": { + "omega_withingene": { "type": "boolean", "description": "Do you want to analyze omega hotspots independently?", "fa_icon": "fas fa-book", @@ -239,6 +215,18 @@ "default": 30, "help_text": "", "fa_icon": "far fa-file-code" + }, + "omega_autodomains": { + "type": "boolean", + "description": "Do you want to automatically generate the domains for omega?", + "fa_icon": "fas fa-book", + "help_text": "Do you want to automatically generate the domains for omega?" + }, + "omega_autoexons": { + "type": "boolean", + "description": "Do you want to automatically generate the theoretical exons from the panel for omega?", + "fa_icon": "fas fa-book", + "help_text": "Do you want to automatically generate the theoretical exons from the panel for omega?" } } }, @@ -757,9 +745,6 @@ { "$ref": "#/$defs/input_output_options" }, - { - "$ref": "#/$defs/bed_files_options" - }, { "$ref": "#/$defs/reference_genome_options" }, @@ -773,7 +758,7 @@ "$ref": "#/$defs/custom_bedfile_configs" }, { - "$ref": "#/$defs/hotspot_bedfile_configs" + "$ref": "#/$defs/omega_bedfile_configs" }, { "$ref": "#/$defs/hotspot_annotation_options" diff --git a/subworkflows/local/createpanels/main.nf b/subworkflows/local/createpanels/main.nf index 6c1724f1..23699f47 100644 --- a/subworkflows/local/createpanels/main.nf +++ b/subworkflows/local/createpanels/main.nf @@ -6,6 +6,9 @@ include { POSTPROCESS_VEP_ANNOTATION as POSTPROCESSVEPPANEL } from ' include { CUSTOM_ANNOTATION_PROCESSING as CUSTOMPROCESSING } from '../../../modules/local/process_annotation/panelcustom/main' include { CUSTOM_ANNOTATION_PROCESSING as CUSTOMPROCESSINGRICH } from '../../../modules/local/process_annotation/panelcustom/main' +include { DOMAIN_ANNOTATION as DOMAINANNOTATION } from '../../../modules/local/process_annotation/domain/main' + + include { CREATECAPTUREDPANELS } from '../../../modules/local/createpanels/captured/main' include { CREATESAMPLEPANELS as CREATESAMPLEPANELSALL } from '../../../modules/local/createpanels/sample/main' @@ -86,6 +89,10 @@ workflow CREATE_PANELS { added_regions = Channel.empty() } + // Generate BED file with genomic coordinates of sequenced domains + domains_file = file("${params.annotations3d}/pfam.tsv") + DOMAINANNOTATION(rich_annotated, domains_file) + // Create captured-specific panels: all modalities CREATECAPTUREDPANELS(complete_annotated_panel) @@ -150,6 +157,7 @@ workflow CREATE_PANELS { panel_annotated_rich = rich_annotated added_custom_regions = added_regions + domains_panel_bed = DOMAINANNOTATION.out.domains_bed // all_sample_panel = restructureSamplePanel(CREATESAMPLEPANELSALL.out.sample_specific_panel.flatten()) // all_sample_bed = restructureSamplePanel(CREATESAMPLEPANELSALL.out.sample_specific_panel_bed.flatten()) diff --git a/subworkflows/local/omega/main.nf b/subworkflows/local/omega/main.nf index 3db2d70d..907c1cdf 100644 --- a/subworkflows/local/omega/main.nf +++ b/subworkflows/local/omega/main.nf @@ -33,13 +33,23 @@ workflow OMEGA_ANALYSIS{ bedfile panel custom_gene_groups - hotspots_file + domains_file mutationrates complete_panel main: + // Create a channel for the domains file if omega_autodomains is true + domains_ch = params.omega_autodomains ? domains_file : Channel.empty() + + // Create a channel for the hotspots bedfile if provided + hotspots_ch = params.omega_hotspots_bedfile ? Channel.fromPath(params.omega_hotspots_bedfile) : Channel.empty() + + // Combine both channels + hotspots_bed_file = domains_ch.mix(hotspots_ch).collect().ifEmpty { file(params.input) } + + // Intersect BED of all sites with BED of sample filtered sites SUBSETMUTATIONS(mutations, bedfile) @@ -58,8 +68,8 @@ workflow OMEGA_ANALYSIS{ .set{ all_samples_mut_profile } - if (params.omega_hotspots){ - EXPANDREGIONS(panel, hotspots_file) + if (params.omega_withingene){ + EXPANDREGIONS(panel, hotspots_bed_file) expanded_panel = EXPANDREGIONS.out.panel_increased.first() json_hotspots = EXPANDREGIONS.out.new_regions_json.first() } else { diff --git a/workflows/deepcsa.nf b/workflows/deepcsa.nf index f546268f..97134e67 100644 --- a/workflows/deepcsa.nf +++ b/workflows/deepcsa.nf @@ -146,12 +146,6 @@ workflow DEEPCSA{ ? Channel.fromPath( params.custom_bedfile, checkIfExists: true).first() : Channel.fromPath(params.input) - // if the user wants to define hotspots for omega, import the hotspots definition BED file - // otherwise I am using the input csv as a dummy value channel - hotspots_bed_file = params.omega_hotspots_bedfile - ? Channel.fromPath( params.omega_hotspots_bedfile, checkIfExists: true).first() - : Channel.fromPath(params.input) - // Initialize booleans based on user params def run_mutabilities = (params.oncodrivefml || params.oncodriveclustl || params.oncodrive3d) def run_mutrate = (params.mutationrate || params.omega) @@ -438,7 +432,7 @@ workflow DEEPCSA{ CREATEPANELS.out.exons_consensus_bed, CREATEPANELS.out.exons_consensus_panel, custom_groups_table, - hotspots_bed_file, + CREATEPANELS.out.domains_panel_bed, SYNMUTRATE.out.mutrate, CREATEPANELS.out.panel_annotated_rich ) @@ -452,7 +446,7 @@ workflow DEEPCSA{ CREATEPANELS.out.exons_consensus_bed, CREATEPANELS.out.exons_consensus_panel, custom_groups_table, - hotspots_bed_file, + CREATEPANELS.out.domains_panel_bed, SYNMUTREADSRATE.out.mutrate, CREATEPANELS.out.panel_annotated_rich ) @@ -466,7 +460,7 @@ workflow DEEPCSA{ CREATEPANELS.out.exons_consensus_bed, CREATEPANELS.out.exons_consensus_panel, custom_groups_table, - hotspots_bed_file, + CREATEPANELS.out.domains_panel_bed, SYNMUTRATE.out.mutrate, CREATEPANELS.out.panel_annotated_rich ) @@ -477,7 +471,7 @@ workflow DEEPCSA{ CREATEPANELS.out.exons_consensus_bed, CREATEPANELS.out.exons_consensus_panel, custom_groups_table, - hotspots_bed_file, + CREATEPANELS.out.domains_panel_bed, SYNMUTREADSRATE.out.mutrate, CREATEPANELS.out.panel_annotated_rich )