Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 27 additions & 14 deletions bin/add_hotspots.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,31 +5,36 @@
import json
import pandas as pd
import numpy as np
from utils import to_int_if_possible
from read_utils import custom_na_values


panel_file = sys.argv[1]
bedfile = sys.argv[2] # BED file (optional, can be None)
expand = int(sys.argv[3]) # Expansion factor for defining regions
use_bed = bool(int(sys.argv[4])) # Whether to use the BED file for exon definitions
autoexons = bool(int(sys.argv[4])) # Whether to use the BED file for exon definitions

# Read input data
panel_data = pd.read_table(panel_file)

if use_bed:
exons_bed = pd.DataFrame()
exons_bed_provided = pd.DataFrame()

if bedfile != 'None':
print("Using the BED file provided")
# Use BED file for exon definition
exons_bed = pd.read_table(bedfile, header=None, sep="\t")
exons_bed_provided = pd.read_table(bedfile, header=None, sep="\t")
# Check if a name column is provided
if exons_bed.shape[1] > 3:
exons_bed = exons_bed.iloc[:,:4]
exons_bed.columns = ["CHROM", "START", "END", "NAME"]
if exons_bed_provided.shape[1] > 3:
exons_bed_provided = exons_bed_provided.iloc[:,:4]
exons_bed_provided.columns = ["CHROM", "START", "END", "NAME"]
else:
exons_bed = exons_bed.iloc[:,:3]
exons_bed.columns = ["CHROM", "START", "END"]
exons_bed["NAME"] = None
print(exons_bed)

exons_bed_provided = exons_bed_provided.iloc[:,:3]
exons_bed_provided.columns = ["CHROM", "START", "END"]
exons_bed_provided["NAME"] = None
else:
print("NOT using any BED file")

if autoexons:
print("Automated generation of exons from the panel")
# Use breakpoints to define exons
exons_bed = []
for chrom, gene_group in panel_data.groupby(["CHROM", "GENE"]):
Expand All @@ -47,7 +52,15 @@
# Append to exons
exons_bed.extend([{"CHROM": chrom[0], "START": s, "END": e, "NAME": None, "GENE": chrom[1]} for s, e in zip(starts, ends)])
exons_bed = pd.DataFrame(exons_bed)
else:
print("NO automated generation of exons from the panel")


# Concatenate the previous two BED dataframes
exons_bed = pd.concat((exons_bed, exons_bed_provided))

if exons_bed.shape[0] == 0:
raise ValueError("No regions to extend")

## Process panel data to create new genes per exon
new_data = pd.DataFrame()
Expand Down Expand Up @@ -81,7 +94,7 @@
exon_number = exon_counters[gene]
region_name = f"{gene}--exon_{exon_number}"

if use_bed:
if ("--exon_" not in region_name) and (expand > 0):
# Expand within limits of the same gene only if BED file is used
upd_start = max(ind_start - expand * 3, 0)
while upd_start < len(chr_data) and chr_data.iloc[upd_start]["GENE"] != gene:
Expand Down
92 changes: 92 additions & 0 deletions bin/panels_dna2domain.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
#!/usr/local/bin/python


# Third-party imports
import click
import pandas as pd


# ---------------------- DNA and Protein Mapping ---------------------- #
def generate_domains2dna_mapping(consensus_panel_rich_file, domains_file, output_file):
"""
Main function to process DNA to protein mapping.
"""

all_protein_positions = pd.read_table(consensus_panel_rich_file)[['CHROM', 'POS', 'STRAND', 'GENE', 'Feature', 'Protein_position']]
all_protein_positions = all_protein_positions[all_protein_positions["Protein_position"] != "-"].drop_duplicates()
all_protein_positions["Protein_position"] = all_protein_positions["Protein_position"].astype(float)
all_protein_positions_summary = all_protein_positions[~(all_protein_positions["Protein_position"].isna())
][['GENE', 'Protein_position', "CHROM", "POS", "STRAND"]].reset_index(drop = True)

panel_all_genes = all_protein_positions["Feature"].unique()

#pfam_domains = pd.read_table("/data/bbg/nobackup/scratch/oncodrive3d/annotations_mane_240506/pfam.tsv")
pfam_domains = pd.read_table(domains_file)
pfam_domains["NAME"] = pfam_domains["Gene"] + '--' + pfam_domains["Description"].str.replace("-", "_") + '-' + pfam_domains["Ens_Transcr_ID"]

pfam_domains_summary = pfam_domains[['Gene', 'Ens_Transcr_ID', 'Begin', 'End','NAME']]
pfam_domains_summary = pfam_domains_summary[pfam_domains_summary["Ens_Transcr_ID"].isin(panel_all_genes)].reset_index(drop = True)



all_protein_positions_small_summary = all_protein_positions_summary.sort_values(by = ["CHROM", "POS"],
ascending = True).drop_duplicates(
subset = ['GENE', 'Protein_position'],
keep = 'first')
all_protein_positions_small_summary.columns = ['GENE', 'PROT_POS', 'CHROM', 'START_fw', "STRAND"]
all_protein_positions_big_summary = all_protein_positions_summary.sort_values(by = ["CHROM", "POS"],
ascending = True).drop_duplicates(
subset = ['GENE', 'Protein_position'],
keep = 'last')
all_protein_positions_big_summary.columns = ['GENE', 'PROT_POS', 'CHROM', 'END_fw', "STRAND"]



pfam_domains_genomic = pfam_domains_summary.merge(all_protein_positions_small_summary,
left_on = ['Gene', 'Begin'],
right_on = ["GENE", 'PROT_POS'],
how = 'left').merge(all_protein_positions_big_summary,
left_on = ['GENE', 'End', "CHROM", "STRAND"],
right_on = ["GENE", 'PROT_POS', "CHROM", "STRAND"],
how = 'left')
pfam_domains_genomic_summary = pfam_domains_genomic[['CHROM', 'START_fw', 'END_fw', 'NAME', "STRAND"]].copy()
pfam_domains_genomic_summary = pfam_domains_genomic_summary[
(~(pfam_domains_genomic_summary["CHROM"].isna())) &
(~(pfam_domains_genomic_summary["START_fw"].isna())) &
(~(pfam_domains_genomic_summary["END_fw"].isna()))
].reset_index(drop = True)
pfam_domains_genomic_summary["START_fw"] = pfam_domains_genomic_summary["START_fw"].astype(int)
pfam_domains_genomic_summary["END_fw"] = pfam_domains_genomic_summary["END_fw"].astype(int)



pfam_domains_genomic_summary[["START","END"]] = pfam_domains_genomic_summary[["START_fw", "END_fw", "STRAND"]].apply(
lambda x : (x["END_fw"], x["START_fw"]) if x["STRAND"] == -1 else (x["START_fw"], x["END_fw"]),
axis = 1
).to_list()
pfam_domains_genomic_summary = pfam_domains_genomic_summary[['CHROM', 'START', 'END', 'NAME']]
pfam_domains_genomic_summary["START"] = pfam_domains_genomic_summary["START"].astype(int)
pfam_domains_genomic_summary["END"] = pfam_domains_genomic_summary["END"].astype(int)

pfam_domains_genomic_summary.to_csv(output_file, header = False, index = False, sep = '\t')



# ---------------------- CLI Interface ---------------------- #

@click.command()
@click.option('--consensus-panel-rich', type=click.Path(exists=True), help='Input consensus panel file')
@click.option('--domains', type=click.Path(exists=True), help='Input consensus panel file')
@click.option('--output', type=click.Path(), help='Output file')

def main(consensus_panel_rich, domains, output):
click.echo("Mapping the domains to the genomic sequence...")
generate_domains2dna_mapping(consensus_panel_rich, domains, output)



if __name__ == '__main__':
main()


# ---------------------- End of the script ---------------------- #
2 changes: 1 addition & 1 deletion conf/bladder.config
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ params {
pileup_all_duplex = false
store_depths = false

omega_hotspots = false
omega_withingene = false
omega_hotspots_bedfile = "/data/bbg/datasets/bladder_ts/Methods/LBL_v2.20220824.only_hotspots.bed4.bed"
hotspot_expansion = 30

Expand Down
3 changes: 0 additions & 3 deletions conf/general_files_BBG.config
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,6 @@ params {

wgs_trinuc_counts = "assets/trinucleotide_counts/trinuc_counts.homo_sapiens.tsv"

low_complex_file = "/workspace/datasets/genomes/GRCh38/masking_bedfiles/hg38_lowcomplexity_repetitive_regions.mutcoords.bed"
low_mappability_file = "/workspace/datasets/genomes/GRCh38/masking_bedfiles/blacklist/GRCh38.encode.blacklist.bed"

cosmic_ref_signatures = "/workspace/datasets/transfer/ferriol_deepcsa/COSMIC_v3.4_SBS_GRCh38.txt"
wgs_trinuc_counts = "/workspace/datasets/transfer/ferriol_deepcsa/trinucleotide_counts/trinuc_counts.homo_sapiens.tsv"
cadd_scores = "/workspace/datasets/CADD/v1.7/hg38/whole_genome_SNVs.tsv.gz"
Expand Down
3 changes: 0 additions & 3 deletions conf/general_files_IRB.config
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,6 @@ params {

wgs_trinuc_counts = "assets/trinucleotide_counts/trinuc_counts.homo_sapiens.tsv"

low_complex_file = "/data/bbg/datasets/genomes/GRCh38/masking_bedfiles/hg38_lowcomplexity_repetitive_regions.mutcoords.bed"
low_mappability_file = "/data/bbg/datasets/genomes/GRCh38/masking_bedfiles/blacklist/GRCh38.encode.blacklist.bed"

cosmic_ref_signatures = "/data/bbg/datasets/transfer/ferriol_deepcsa/COSMIC_v3.4_SBS_GRCh38.txt"
wgs_trinuc_counts = "/data/bbg/datasets/transfer/ferriol_deepcsa/trinucleotide_counts/trinuc_counts.homo_sapiens.tsv"
cadd_scores = "/data/bbg/datasets/CADD/v1.7/hg38/whole_genome_SNVs.tsv.gz"
Expand Down
2 changes: 1 addition & 1 deletion conf/mice.config
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ params {
datasets3d = "/data/bbg/nobackup/scratch/oncodrive3d/datasets_mouse"
// annotations3d = "/workspace/nobackup/scratch/oncodrive3d/annotations_240506"

omega_hotspots = false
omega_withingene = false
omega_hotspots_bedfile = "/data/bbg/datasets/transfer/ferriol_deepcsa/mouse_skin_panel.hotspots.bed4.bed"
hotspot_expansion = 15

Expand Down
4 changes: 2 additions & 2 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -356,13 +356,13 @@ process {

withName: "GROUPGENES" {
ext.custom = params.custom_groups
ext.hotspots = params.omega_hotspots
ext.hotspots = params.omega_withingene
ext.separator = params.custom_groups_separator
}

withName: 'EXPANDREGIONS' {
ext.expansion = params.omega_hotspots_bedfile ? params.hotspot_expansion : 0
ext.use_hotspot_bed = params.omega_hotspots_bedfile as Boolean
ext.using_bedfile = params.omega_autodomains | (params.omega_hotspots_bedfile as Boolean)
}

if (params.mutated_epithelium){
Expand Down
18 changes: 9 additions & 9 deletions modules/local/expand_regions/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,11 @@ process EXPAND_REGIONS {
tag "$meta.id"
label 'process_high'

// // conda "YOUR-TOOL-HERE"
// container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
// 'https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE':
// 'biocontainers/YOUR-TOOL-HERE' }"
container 'docker.io/ferriolcalvet/bgreference'

input:
tuple val(meta) , path(panel)
tuple val(meta), path(panel)
path (bedfile)
// tuple val(meta2) , path(bedfile)

output:
tuple val(meta), path("*with_hotspots.tsv") , emit: panel_increased
Expand All @@ -23,10 +18,15 @@ process EXPAND_REGIONS {
task.ext.when == null || task.ext.when

script:
def expansion = task.ext.expansion ?: 30
def configuration = task.ext.use_hotspot_bed ? "${bedfile} ${expansion} 1" : 'None 0 0'
def expansion = task.ext.expansion ?: 0
def bedfile_to_use = task.ext.using_bedfile ? "custom_regions_bedfiles.bed" : "None"
def autoexons = params.omega_autoexons ? "1" : "0"
"""
add_hotspots.py ${panel} ${configuration}
cat ${bedfile} >> custom_regions_bedfiles.bed
add_hotspots.py ${panel} \\
${bedfile_to_use} \\
${expansion} \\
${autoexons}

cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand Down
44 changes: 44 additions & 0 deletions modules/local/process_annotation/domain/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
process DOMAIN_ANNOTATION {

tag "${meta.id}"

label 'cpu_low'
label 'time_low'
label 'process_high_memory'

container "docker.io/ferriolcalvet/bgreference"

input:
tuple val(meta) , path(panel_annotated)
path (domains_file)

output:
path("*.domains.bed4.bed") , emit: domains_bed
path "versions.yml" , topic: versions

when:
task.ext.when == null || task.ext.when

script:
"""
panels_dna2domain.py \\
--consensus-panel-rich ${panel_annotated} \\
--domains ${domains_file} \\
--output seq_panel.domains.bed4.bed;

cat <<-END_VERSIONS > versions.yml
"${task.process}":
python: \$(python --version | sed 's/Python //g')
END_VERSIONS
"""

stub:
"""
touch consensus.domains.bed4.bed;

cat <<-END_VERSIONS > versions.yml
"${task.process}":
python: \$(python --version | sed 's/Python //g')
END_VERSIONS
"""
}
Empty file.
14 changes: 5 additions & 9 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,11 @@ params {
hotspots_annotation = false
hotspots_definition_file = ''

omega_hotspots = false
omega_withingene = false
omega_hotspots_bedfile = null
hotspot_expansion = 30
hotspot_expansion = 0
omega_autodomains = false
omega_autoexons = false

oncodrivefml = false
oncodriveclustl = false
Expand All @@ -45,7 +47,7 @@ params {
omega = false
omega_globalloc = false
omega_mutabilities = false
site_comparison_grouping = 'aminoacid_change'
site_comparison_grouping = 'all'
omega_vaf_distorsioned = false
omega_plot = false

Expand Down Expand Up @@ -115,12 +117,6 @@ params {
vep_params = "--no_stats --cache --offline --symbol --protein --canonical --af_gnomadg --af_gnomade"
vep_params_panel = "--no_stats --cache --offline --symbol --protein --canonical"


// BED files for additional regional annotations
low_complex_file = "hg38_lowcomplexity_repetitive_regions.mutcoords.bed"
low_mappability_file = "GRCh38.encode.blacklist.bed"
// mutagenesis regions panel
// neutral/intergenic regions panel
}

// Global default params, used in configs
Expand Down
Loading