bbglab · FerriolCalvet · Feb 25, 2025 · Feb 22, 2025 · Feb 23, 2025 · Feb 23, 2025
diff --git a/bin/add_hotspots.py b/bin/add_hotspots.py
@@ -5,31 +5,36 @@
 import json
 import pandas as pd
 import numpy as np
-from utils import to_int_if_possible
-from read_utils import custom_na_values
+
 
 panel_file = sys.argv[1]
 bedfile = sys.argv[2]  # BED file (optional, can be None)
 expand = int(sys.argv[3])  # Expansion factor for defining regions
-use_bed = bool(int(sys.argv[4]))  # Whether to use the BED file for exon definitions
+autoexons = bool(int(sys.argv[4]))  # Whether to use the BED file for exon definitions
 
 # Read input data
 panel_data = pd.read_table(panel_file)
 
-if use_bed:
+exons_bed = pd.DataFrame()
+exons_bed_provided = pd.DataFrame()
+
+if bedfile != 'None':
+    print("Using the BED file provided")
     # Use BED file for exon definition
-    exons_bed = pd.read_table(bedfile, header=None, sep="\t")
+    exons_bed_provided = pd.read_table(bedfile, header=None, sep="\t")
     # Check if a name column is provided
-    if exons_bed.shape[1] > 3:
-        exons_bed = exons_bed.iloc[:,:4]
-        exons_bed.columns = ["CHROM", "START", "END", "NAME"]
+    if exons_bed_provided.shape[1] > 3:
+        exons_bed_provided = exons_bed_provided.iloc[:,:4]
+        exons_bed_provided.columns = ["CHROM", "START", "END", "NAME"]
     else:
-        exons_bed = exons_bed.iloc[:,:3]
-        exons_bed.columns = ["CHROM", "START", "END"]
-        exons_bed["NAME"] = None
-    print(exons_bed)
-
+        exons_bed_provided = exons_bed_provided.iloc[:,:3]
+        exons_bed_provided.columns = ["CHROM", "START", "END"]
+        exons_bed_provided["NAME"] = None
 else:
+    print("NOT using any BED file")
+
+if autoexons:
+    print("Automated generation of exons from the panel")
     # Use breakpoints to define exons
     exons_bed = []
     for chrom, gene_group in panel_data.groupby(["CHROM", "GENE"]):
@@ -47,7 +52,15 @@
         # Append to exons
         exons_bed.extend([{"CHROM": chrom[0], "START": s, "END": e, "NAME": None, "GENE": chrom[1]} for s, e in zip(starts, ends)])
     exons_bed = pd.DataFrame(exons_bed)
+else:
+    print("NO automated generation of exons from the panel")
+
+
+# Concatenate the previous two BED dataframes
+exons_bed = pd.concat((exons_bed, exons_bed_provided))
 
+if exons_bed.shape[0] == 0:
+    raise ValueError("No regions to extend")
 
 ## Process panel data to create new genes per exon
 new_data = pd.DataFrame()
@@ -81,7 +94,7 @@
             exon_number = exon_counters[gene]
             region_name = f"{gene}--exon_{exon_number}"
 
-        if use_bed:
+        if ("--exon_" not in region_name) and (expand > 0):
             # Expand within limits of the same gene only if BED file is used
             upd_start = max(ind_start - expand * 3, 0)
             while upd_start < len(chr_data) and chr_data.iloc[upd_start]["GENE"] != gene:

diff --git a/bin/panels_dna2domain.py b/bin/panels_dna2domain.py
@@ -0,0 +1,92 @@
+#!/usr/local/bin/python
+
+
+# Third-party imports
+import click
+import pandas as pd
+
+
+# ---------------------- DNA and Protein Mapping ---------------------- #
+def generate_domains2dna_mapping(consensus_panel_rich_file, domains_file, output_file):
+    """
+    Main function to process DNA to protein mapping.
+    """
+
+    all_protein_positions = pd.read_table(consensus_panel_rich_file)[['CHROM', 'POS', 'STRAND', 'GENE', 'Feature', 'Protein_position']]
+    all_protein_positions = all_protein_positions[all_protein_positions["Protein_position"] != "-"].drop_duplicates()
+    all_protein_positions["Protein_position"] = all_protein_positions["Protein_position"].astype(float)
+    all_protein_positions_summary = all_protein_positions[~(all_protein_positions["Protein_position"].isna())
+                                                            ][['GENE', 'Protein_position', "CHROM", "POS", "STRAND"]].reset_index(drop = True)
+
+    panel_all_genes = all_protein_positions["Feature"].unique()
+
+    #pfam_domains = pd.read_table("/data/bbg/nobackup/scratch/oncodrive3d/annotations_mane_240506/pfam.tsv")
+    pfam_domains = pd.read_table(domains_file)
+    pfam_domains["NAME"] = pfam_domains["Gene"] + '--' + pfam_domains["Description"].str.replace("-", "_") + '-' + pfam_domains["Ens_Transcr_ID"]
+
+    pfam_domains_summary = pfam_domains[['Gene', 'Ens_Transcr_ID', 'Begin', 'End','NAME']]
+    pfam_domains_summary = pfam_domains_summary[pfam_domains_summary["Ens_Transcr_ID"].isin(panel_all_genes)].reset_index(drop = True)
+
+
+
+    all_protein_positions_small_summary = all_protein_positions_summary.sort_values(by = ["CHROM", "POS"],
+                                                                                    ascending = True).drop_duplicates(
+                                                                                                            subset = ['GENE', 'Protein_position'],
+                                                                                                            keep = 'first')
+    all_protein_positions_small_summary.columns = ['GENE', 'PROT_POS', 'CHROM', 'START_fw', "STRAND"]
+    all_protein_positions_big_summary = all_protein_positions_summary.sort_values(by = ["CHROM", "POS"],
+                                                                                    ascending = True).drop_duplicates(
+                                                                                                            subset = ['GENE', 'Protein_position'],
+                                                                                                            keep = 'last')
+    all_protein_positions_big_summary.columns = ['GENE', 'PROT_POS', 'CHROM', 'END_fw', "STRAND"]
+
+
+
+    pfam_domains_genomic = pfam_domains_summary.merge(all_protein_positions_small_summary,
+                                                                left_on = ['Gene', 'Begin'],
+                                                                right_on = ["GENE", 'PROT_POS'],
+                                                                how = 'left').merge(all_protein_positions_big_summary,
+                                                                left_on = ['GENE', 'End', "CHROM", "STRAND"],
+                                                                right_on = ["GENE", 'PROT_POS', "CHROM", "STRAND"],
+                                                                how = 'left')
+    pfam_domains_genomic_summary = pfam_domains_genomic[['CHROM', 'START_fw', 'END_fw', 'NAME', "STRAND"]].copy()
+    pfam_domains_genomic_summary = pfam_domains_genomic_summary[
+                                            (~(pfam_domains_genomic_summary["CHROM"].isna())) &
+                                            (~(pfam_domains_genomic_summary["START_fw"].isna())) &
+                                            (~(pfam_domains_genomic_summary["END_fw"].isna()))
+                                        ].reset_index(drop = True)
+    pfam_domains_genomic_summary["START_fw"] = pfam_domains_genomic_summary["START_fw"].astype(int)
+    pfam_domains_genomic_summary["END_fw"] = pfam_domains_genomic_summary["END_fw"].astype(int)
+
+
+
+    pfam_domains_genomic_summary[["START","END"]] =  pfam_domains_genomic_summary[["START_fw", "END_fw", "STRAND"]].apply(
+                                                                    lambda x : (x["END_fw"], x["START_fw"]) if x["STRAND"] == -1 else (x["START_fw"], x["END_fw"]),
+        axis = 1
+    ).to_list()
+    pfam_domains_genomic_summary = pfam_domains_genomic_summary[['CHROM', 'START', 'END', 'NAME']]
+    pfam_domains_genomic_summary["START"] = pfam_domains_genomic_summary["START"].astype(int)
+    pfam_domains_genomic_summary["END"] = pfam_domains_genomic_summary["END"].astype(int)
+
+    pfam_domains_genomic_summary.to_csv(output_file, header = False, index = False, sep = '\t')
+
+
+
+# ---------------------- CLI Interface ---------------------- #
+
+@click.command()
+@click.option('--consensus-panel-rich', type=click.Path(exists=True), help='Input consensus panel file')
+@click.option('--domains', type=click.Path(exists=True), help='Input consensus panel file')
+@click.option('--output', type=click.Path(), help='Output file')
+
+def main(consensus_panel_rich, domains, output):
+    click.echo("Mapping the domains to the genomic sequence...")
+    generate_domains2dna_mapping(consensus_panel_rich, domains, output)
+
+
+
+if __name__ == '__main__':
+    main()
+
+
+# ---------------------- End of the script ---------------------- #
diff --git a/conf/bladder.config b/conf/bladder.config
@@ -17,7 +17,7 @@ params {
     pileup_all_duplex          = false
     store_depths               = false
 
-    omega_hotspots              = false
+    omega_withingene              = false
     omega_hotspots_bedfile      = "/data/bbg/datasets/bladder_ts/Methods/LBL_v2.20220824.only_hotspots.bed4.bed"
     hotspot_expansion           = 30
 

diff --git a/conf/general_files_BBG.config b/conf/general_files_BBG.config
@@ -8,9 +8,6 @@ params {
 
     wgs_trinuc_counts       = "assets/trinucleotide_counts/trinuc_counts.homo_sapiens.tsv"
 
-    low_complex_file        = "/workspace/datasets/genomes/GRCh38/masking_bedfiles/hg38_lowcomplexity_repetitive_regions.mutcoords.bed"
-    low_mappability_file    = "/workspace/datasets/genomes/GRCh38/masking_bedfiles/blacklist/GRCh38.encode.blacklist.bed"
-
     cosmic_ref_signatures   = "/workspace/datasets/transfer/ferriol_deepcsa/COSMIC_v3.4_SBS_GRCh38.txt"
     wgs_trinuc_counts       = "/workspace/datasets/transfer/ferriol_deepcsa/trinucleotide_counts/trinuc_counts.homo_sapiens.tsv"
     cadd_scores             = "/workspace/datasets/CADD/v1.7/hg38/whole_genome_SNVs.tsv.gz"

diff --git a/conf/general_files_IRB.config b/conf/general_files_IRB.config
@@ -8,9 +8,6 @@ params {
 
     wgs_trinuc_counts       = "assets/trinucleotide_counts/trinuc_counts.homo_sapiens.tsv"
 
-    low_complex_file        = "/data/bbg/datasets/genomes/GRCh38/masking_bedfiles/hg38_lowcomplexity_repetitive_regions.mutcoords.bed"
-    low_mappability_file    = "/data/bbg/datasets/genomes/GRCh38/masking_bedfiles/blacklist/GRCh38.encode.blacklist.bed"
-
     cosmic_ref_signatures   = "/data/bbg/datasets/transfer/ferriol_deepcsa/COSMIC_v3.4_SBS_GRCh38.txt"
     wgs_trinuc_counts       = "/data/bbg/datasets/transfer/ferriol_deepcsa/trinucleotide_counts/trinuc_counts.homo_sapiens.tsv"
     cadd_scores             = "/data/bbg/datasets/CADD/v1.7/hg38/whole_genome_SNVs.tsv.gz"

diff --git a/conf/mice.config b/conf/mice.config
@@ -31,7 +31,7 @@ params {
     datasets3d                 = "/data/bbg/nobackup/scratch/oncodrive3d/datasets_mouse"
     // annotations3d              = "/workspace/nobackup/scratch/oncodrive3d/annotations_240506"
 
-    omega_hotspots              = false
+    omega_withingene              = false
     omega_hotspots_bedfile      = "/data/bbg/datasets/transfer/ferriol_deepcsa/mouse_skin_panel.hotspots.bed4.bed"
     hotspot_expansion           = 15
 

diff --git a/conf/modules.config b/conf/modules.config
@@ -356,13 +356,13 @@ process {
 
     withName: "GROUPGENES" {
         ext.custom          = params.custom_groups
-        ext.hotspots        = params.omega_hotspots
+        ext.hotspots        = params.omega_withingene
         ext.separator       = params.custom_groups_separator
     }
 
     withName: 'EXPANDREGIONS' {
         ext.expansion = params.omega_hotspots_bedfile ? params.hotspot_expansion : 0
-        ext.use_hotspot_bed = params.omega_hotspots_bedfile as Boolean
+        ext.using_bedfile = params.omega_autodomains | (params.omega_hotspots_bedfile as Boolean)
     }
 
     if (params.mutated_epithelium){

diff --git a/modules/local/expand_regions/main.nf b/modules/local/expand_regions/main.nf
@@ -3,16 +3,11 @@ process EXPAND_REGIONS {
     tag "$meta.id"
     label 'process_high'
 
-    // // conda "YOUR-TOOL-HERE"
-    // container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-    //     'https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE':
-    //     'biocontainers/YOUR-TOOL-HERE' }"
     container 'docker.io/ferriolcalvet/bgreference'
 
     input:
-    tuple val(meta)     , path(panel)
+    tuple val(meta), path(panel)
     path (bedfile)
-    // tuple val(meta2)    , path(bedfile)
 
     output:
     tuple val(meta), path("*with_hotspots.tsv") , emit: panel_increased
@@ -23,10 +18,15 @@ process EXPAND_REGIONS {
     task.ext.when == null || task.ext.when
 
     script:
-    def expansion = task.ext.expansion ?: 30
-    def configuration = task.ext.use_hotspot_bed ? "${bedfile} ${expansion} 1" : 'None 0 0'
+    def expansion = task.ext.expansion ?: 0
+    def bedfile_to_use = task.ext.using_bedfile ? "custom_regions_bedfiles.bed" : "None"
+    def autoexons = params.omega_autoexons ? "1" : "0"
     """
-    add_hotspots.py ${panel} ${configuration}
+    cat ${bedfile} >> custom_regions_bedfiles.bed
+    add_hotspots.py ${panel} \\
+                    ${bedfile_to_use} \\
+                    ${expansion} \\
+                    ${autoexons}
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":

diff --git a/modules/local/process_annotation/domain/main.nf b/modules/local/process_annotation/domain/main.nf
@@ -0,0 +1,44 @@
+process DOMAIN_ANNOTATION {
+
+    tag "${meta.id}"
+
+    label 'cpu_low'
+    label 'time_low'
+    label 'process_high_memory'
+
+    container "docker.io/ferriolcalvet/bgreference"
+
+    input:
+    tuple val(meta) , path(panel_annotated)
+    path (domains_file)
+
+    output:
+    path("*.domains.bed4.bed")  , emit: domains_bed
+    path  "versions.yml"        , topic: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    """
+    panels_dna2domain.py \\
+                    --consensus-panel-rich ${panel_annotated} \\
+                    --domains ${domains_file} \\
+                    --output seq_panel.domains.bed4.bed;
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python --version | sed 's/Python //g')
+    END_VERSIONS
+    """
+
+    stub:
+    """
+    touch consensus.domains.bed4.bed;
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python --version | sed 's/Python //g')
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/process_annotation/domain/meta.yml b/modules/local/process_annotation/domain/meta.yml
diff --git a/nextflow.config b/nextflow.config
@@ -31,9 +31,11 @@ params {
     hotspots_annotation         = false
     hotspots_definition_file    = ''
 
-    omega_hotspots              = false
+    omega_withingene              = false
     omega_hotspots_bedfile      = null
-    hotspot_expansion           = 30
+    hotspot_expansion           = 0
+    omega_autodomains           = false
+    omega_autoexons             = false
 
     oncodrivefml               = false
     oncodriveclustl            = false
@@ -45,7 +47,7 @@ params {
     omega                      = false
     omega_globalloc            = false
     omega_mutabilities         = false
-    site_comparison_grouping   = 'aminoacid_change'
+    site_comparison_grouping   = 'all'
     omega_vaf_distorsioned     = false
     omega_plot                 = false
 
@@ -115,12 +117,6 @@ params {
     vep_params                 = "--no_stats --cache --offline --symbol --protein --canonical --af_gnomadg --af_gnomade"
     vep_params_panel           = "--no_stats --cache --offline --symbol --protein --canonical"
 
-
-    // BED files for additional regional annotations
-    low_complex_file     = "hg38_lowcomplexity_repetitive_regions.mutcoords.bed"
-    low_mappability_file = "GRCh38.encode.blacklist.bed"
-    // mutagenesis regions panel
-    // neutral/intergenic regions panel
 }
 
 // Global default params, used in configs