From 75a33a258af33dd6815eead334ad7b192474e398 Mon Sep 17 00:00:00 2001
From: Chris Wyatt <9978862+chriswyatt1@users.noreply.github.com>
Date: Thu, 26 Jun 2025 16:55:54 +0100
Subject: [PATCH] Add new longest code from agat

---
Review notes (ignored by git am, not part of the commit message):
 * Line structure was restored from a whitespace-collapsed copy of this patch.
   Indentation and the position of blank context lines are reconstructed from
   the hunk line counts; verify against the tree and, if needed, apply with
   `git apply --ignore-whitespace`.
 * modules/local/longest.nf: versions.yml now also records the AGAT version
   (the tool this process actually runs); the script comment now says
   "longest isoform", matching agat_sp_keep_longest_isoform.pl; the file now
   ends with a newline. The blob id on its index line predates these edits.

 conf/base.config         |  3 +++
 conf/modules.config      |  7 +++++++
 main.nf                  | 13 ++++++++++++-
 modules/local/gffread.nf | 24 ++++++------------------
 modules/local/longest.nf | 35 +++++++++++++++++++++++++++++++++++
 5 files changed, 63 insertions(+), 19 deletions(-)
 create mode 100644 modules/local/longest.nf

diff --git a/conf/base.config b/conf/base.config
index 44a985b..75f67ae 100755
--- a/conf/base.config
+++ b/conf/base.config
@@ -39,6 +39,9 @@ process {
     withLabel:process_high_memory {
         memory = { check_max( 64.GB * task.attempt, 'memory' ) }
     }
+    withLabel:process_med_memory {
+        memory = { check_max( 16.GB * task.attempt, 'memory' ) }
+    }
     withLabel:error_ignore {
         errorStrategy = 'ignore'
     }
diff --git a/conf/modules.config b/conf/modules.config
index 7eba59c..19a89ce 100755
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -11,6 +11,13 @@
 */
 
 process {
+    withName: 'LONGEST' {
+        publishDir = [
+            path: { "${params.outdir}/longest" },
+            mode: params.publish_dir_mode
+        ]
+    }
+
     withName: 'GFFREAD' {
         publishDir = [
             path: { "${params.outdir}/gffread" },
diff --git a/main.nf b/main.nf
index a1f629f..d1324e7 100644
--- a/main.nf
+++ b/main.nf
@@ -23,6 +23,7 @@ include { ORTHOFINDER as ORTHOFINDER_CAFE } from './modules/nf-core/orthofinder/
 include { GO_ASSIGN } from './modules/local/go_assign.nf'
 include { GO_EXPANSION } from './modules/local/go_expansion.nf'
 include { NCBIGENOMEDOWNLOAD } from './modules/nf-core/ncbigenomedownload/main.nf'
+include { LONGEST } from './modules/local/longest.nf'
 include { GFFREAD } from './modules/local/gffread.nf'
 include { CAFE } from './modules/local/cafe.nf'
 include { CHROMO_GO } from './modules/local/chromo_go.nf'
@@ -63,7 +64,17 @@ workflow {
     NCBIGENOMEDOWNLOAD ( input_type.ncbi.map { it[0] }, input_type.ncbi.map { it[1] }, [], params.groups)
     ch_versions = ch_versions.mix(NCBIGENOMEDOWNLOAD.out.versions.first())
 
-    GFFREAD ( NCBIGENOMEDOWNLOAD.out.fna.mix( input_type.local.map { [it[0],file(it[1])] } ), NCBIGENOMEDOWNLOAD.out.gff.mix(input_type.local.map { [it[0],file(it[2])] } ) )
+    // Combine NCBI downloaded files with local files for LONGEST process
+    fasta_inputs = NCBIGENOMEDOWNLOAD.out.fna.mix( input_type.local.map { [it[0],file(it[1])] } )
+    gff_inputs = NCBIGENOMEDOWNLOAD.out.gff.mix(input_type.local.map { [it[0],file(it[2])] } )
+
+    // Combine fasta and gff inputs for LONGEST process
+    fasta_inputs.join(gff_inputs).set { fasta_gff_inputs }
+
+    LONGEST ( fasta_gff_inputs )
+    ch_versions = ch_versions.mix(LONGEST.out.versions.first())
+
+    GFFREAD ( LONGEST.out.longest_proteins )
     ch_versions = ch_versions.mix(GFFREAD.out.versions.first())
 
     if (params.stats){
diff --git a/modules/local/gffread.nf b/modules/local/gffread.nf
index 39543eb..32e5805 100644
--- a/modules/local/gffread.nf
+++ b/modules/local/gffread.nf
@@ -4,19 +4,18 @@ process GFFREAD {
     container = 'ecoflowucl/gffread_python:python-3.11.9_Linux_x86_64_perl-5.36.0'
 
     input:
-    tuple val(sample_id), path(fasta)
-    tuple val(sample_id), path(gff)
+    tuple val(sample_id), path(fasta), path(gff)
 
     output:
     path( "${sample_id}.prot.fa" ), emit: proteins
-    tuple val(sample_id), path("${sample_id}.prot.fa.largestIsoform.fa" ), emit: proteins_busco
-    path( "${sample_id}.prot.fa.largestIsoform.fa" ), emit: longest
+    tuple val(sample_id), path("${sample_id}.prot.fa" ), emit: proteins_busco
+    path( "${sample_id}.prot.fa" ), emit: longest
     path( "${sample_id}.splicedcds.fa" )
     path( "${sample_id}.splicedexons.fa" )
     path( "${sample_id}.gff_for_jvci.gff3" ), emit: gffs
     tuple val(sample_id), path("${sample_id}.gff_for_jvci.gff3"), emit: gffs_agat
     path( "${sample_id}_gene_alltran_list.txt" ), emit: gene_to_isoforms
-    path( "${sample_id}.splicedcds.fa.nucl.longest.fa" )
+    path( "${sample_id}.splicedcds.fa" )
     tuple val( "${sample_id}" ), path( "${fasta}" ), emit: fasta_quast
     path "versions.yml", emit: versions
 
@@ -36,6 +35,7 @@ process GFFREAD {
     fi
 
     #Convert Augustus gff files if found, then do gffread to print out the nucleotide files for each gene.
+    # Note: The GFF input now comes from LONGEST process (agat longest isoform), so it should already be processed
 
     head -n 1 gff_temp > tbd
 
@@ -59,20 +59,8 @@ process GFFREAD {
 
     fi
 
+    # Create gene to isoform mapping (still needed for downstream processes)
     ${projectDir}/bin/gff_to_genetranshash.2.pl
-    ${projectDir}/bin/prot_fasta_to_longest.pl ${sample_id}.prot.fa ${sample_id}_longestisoform.txt
-    ${projectDir}/bin/fasta_topIsoform.pl ${sample_id}.splicedcds.fa ${sample_id}_longestisoform.txt
-
-
-    #This part checks if longest isoform worked, if not we will continue with all proteins into Orthofinder. Warning sent to screen.
-    #Largest isoforms has content if true
-    #Largest isoforms does not have content if false. Just use full protein file (could be a genome without isoforms)
-
-    if [[ -s ${sample_id}.prot.fa.largestIsoform.fa ]];then
-        echo all_good
-    else
-        cp ${sample_id}.prot.fa ${sample_id}.prot.fa.largestIsoform.fa
-    fi
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
diff --git a/modules/local/longest.nf b/modules/local/longest.nf
new file mode 100644
index 0000000..7263db3
--- /dev/null
+++ b/modules/local/longest.nf
@@ -0,0 +1,35 @@
+process LONGEST {
+
+    label 'process_medium'
+    label 'process_med_memory'
+    tag "$sample_id"
+    container = 'quay.io/biocontainers/agat:1.4.1--pl5321hdfd78af_0'
+
+    input:
+    tuple val (sample_id), path(fasta), path(gff)
+
+    output:
+    tuple val (sample_id), path( fasta ), path( "${sample_id}.longest.gff3" ), emit: longest_proteins
+    tuple val (sample_id), path( "${sample_id}.stat.original.txt" ), emit: agat_summary_original
+    tuple val (sample_id), path( "${sample_id}.stat.long.txt" ), emit: agat_summary_longest
+    path "versions.yml", emit: versions
+
+    script:
+    """
+    # Run agat to keep only the longest isoform of each gene
+    agat_sp_keep_longest_isoform.pl -gff ${gff} -o ${sample_id}.longest.gff3
+
+    # Run a few summarisation scripts to report the actual genes being considered.
+    agat_sp_functional_statistics.pl --gff ${gff} -o ${sample_id}.stat.original.txt
+    agat_sp_functional_statistics.pl --gff ${sample_id}.longest.gff3 -o ${sample_id}.stat.long.txt
+
+    md5sum "${sample_id}.longest.gff3" > "${sample_id}.longest.gff3.md5"
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        agat: \$(agat_sp_keep_longest_isoform.pl --help | sed -n 's/.*(AGAT) - Version: \\(.*\\) .*/\\1/p')
+        Perl version: \$(perl --version | grep "version" | sed 's/.*(//g' | sed 's/[)].*//')
+    END_VERSIONS
+    """
+
+}