From 75a33a258af33dd6815eead334ad7b192474e398 Mon Sep 17 00:00:00 2001
From: Chris Wyatt <9978862+chriswyatt1@users.noreply.github.com>
Date: Thu, 26 Jun 2025 16:55:54 +0100
Subject: [PATCH] Add LONGEST module to select the longest isoform per gene with AGAT

---
 conf/base.config         |  3 +++
 conf/modules.config      |  7 +++++++
 main.nf                  | 13 ++++++++++++-
 modules/local/gffread.nf | 24 ++++++------------------
 modules/local/longest.nf | 34 ++++++++++++++++++++++++++++++++++
 5 files changed, 62 insertions(+), 19 deletions(-)
 create mode 100644 modules/local/longest.nf

diff --git a/conf/base.config b/conf/base.config
index 44a985b..75f67ae 100755
--- a/conf/base.config
+++ b/conf/base.config
@@ -39,6 +39,9 @@ process {
     withLabel:process_high_memory {
         memory = { check_max( 64.GB * task.attempt, 'memory' ) }
     }
+    withLabel:process_med_memory {
+        memory = { check_max( 16.GB * task.attempt, 'memory' ) }
+    }
     withLabel:error_ignore {
         errorStrategy = 'ignore'
     }
diff --git a/conf/modules.config b/conf/modules.config
index 7eba59c..19a89ce 100755
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -11,6 +11,13 @@
 */
 
 process {
+    withName: 'LONGEST' {
+        publishDir = [
+            path: { "${params.outdir}/longest" },
+            mode: params.publish_dir_mode
+        ]
+    }
+
     withName: 'GFFREAD' {
         publishDir = [
             path: { "${params.outdir}/gffread" },
diff --git a/main.nf b/main.nf
index a1f629f..d1324e7 100644
--- a/main.nf
+++ b/main.nf
@@ -23,6 +23,7 @@ include { ORTHOFINDER as ORTHOFINDER_CAFE } from './modules/nf-core/orthofinder/
 include { GO_ASSIGN } from './modules/local/go_assign.nf'
 include { GO_EXPANSION  } from './modules/local/go_expansion.nf'
 include { NCBIGENOMEDOWNLOAD } from './modules/nf-core/ncbigenomedownload/main.nf'
+include { LONGEST } from './modules/local/longest.nf'
 include { GFFREAD } from './modules/local/gffread.nf'
 include { CAFE } from './modules/local/cafe.nf'
 include { CHROMO_GO } from './modules/local/chromo_go.nf'
@@ -63,7 +64,17 @@ workflow {
    NCBIGENOMEDOWNLOAD ( input_type.ncbi.map { it[0] }, input_type.ncbi.map { it[1] }, [], params.groups)
    ch_versions = ch_versions.mix(NCBIGENOMEDOWNLOAD.out.versions.first())  
 
-   GFFREAD ( NCBIGENOMEDOWNLOAD.out.fna.mix( input_type.local.map { [it[0],file(it[1])] } ), NCBIGENOMEDOWNLOAD.out.gff.mix(input_type.local.map { [it[0],file(it[2])] } ) ) 
+   // Combine NCBI downloaded files with local files for LONGEST process
+   fasta_inputs = NCBIGENOMEDOWNLOAD.out.fna.mix( input_type.local.map { [it[0],file(it[1])] } )
+   gff_inputs = NCBIGENOMEDOWNLOAD.out.gff.mix(input_type.local.map { [it[0],file(it[2])] } )
+   
+   // Combine fasta and gff inputs for LONGEST process
+   fasta_inputs.join(gff_inputs).set { fasta_gff_inputs }
+
+   LONGEST ( fasta_gff_inputs )
+   ch_versions = ch_versions.mix(LONGEST.out.versions.first())
+
+   GFFREAD ( LONGEST.out.longest_proteins ) 
    ch_versions = ch_versions.mix(GFFREAD.out.versions.first())
 
    if (params.stats){
diff --git a/modules/local/gffread.nf b/modules/local/gffread.nf
index 39543eb..32e5805 100644
--- a/modules/local/gffread.nf
+++ b/modules/local/gffread.nf
@@ -4,19 +4,18 @@ process GFFREAD {
     container = 'ecoflowucl/gffread_python:python-3.11.9_Linux_x86_64_perl-5.36.0'
 
     input:
-    tuple val(sample_id), path(fasta)
-    tuple val(sample_id), path(gff)
+    tuple val(sample_id), path(fasta), path(gff)
 
     output:
     path( "${sample_id}.prot.fa" ), emit: proteins
-    tuple val(sample_id), path("${sample_id}.prot.fa.largestIsoform.fa" ), emit: proteins_busco
-    path( "${sample_id}.prot.fa.largestIsoform.fa" ), emit: longest
+    tuple val(sample_id), path("${sample_id}.prot.fa" ), emit: proteins_busco
+    path( "${sample_id}.prot.fa" ), emit: longest
     path( "${sample_id}.splicedcds.fa" )
     path( "${sample_id}.splicedexons.fa" )
     path( "${sample_id}.gff_for_jvci.gff3" ), emit: gffs
     tuple val(sample_id), path("${sample_id}.gff_for_jvci.gff3"), emit: gffs_agat
     path( "${sample_id}_gene_alltran_list.txt" ), emit: gene_to_isoforms
-    path( "${sample_id}.splicedcds.fa.nucl.longest.fa" )
+    path( "${sample_id}.splicedcds.fa" )
     tuple val( "${sample_id}" ), path( "${fasta}" ), emit: fasta_quast
     path "versions.yml", emit: versions
 
@@ -36,6 +35,7 @@ process GFFREAD {
     fi
 
     #Convert Augustus gff files if found, then do gffread to print out the nucleotide files for each gene.
+    # Note: the GFF input now comes from the LONGEST process (AGAT agat_sp_keep_longest_isoform.pl), so it should already contain only the longest isoform of each gene.
 
     head -n 1 gff_temp > tbd
 
@@ -59,20 +59,8 @@ process GFFREAD {
 
     fi
 
+    # Create gene to isoform mapping (still needed for downstream processes)
     ${projectDir}/bin/gff_to_genetranshash.2.pl
-    ${projectDir}/bin/prot_fasta_to_longest.pl ${sample_id}.prot.fa ${sample_id}_longestisoform.txt
-    ${projectDir}/bin/fasta_topIsoform.pl ${sample_id}.splicedcds.fa ${sample_id}_longestisoform.txt
-
-
-    #This part checks if longest isoform worked, if not we will continue with all proteins into Orthofinder. Warning sent to screen.
-    #Largest isoforms has content if true
-    #Largest isoforms does not have content if false. Just use full protein file (could be a genome without isoforms)
-
-    if [[ -s ${sample_id}.prot.fa.largestIsoform.fa ]];then
-      echo all_good
-    else
-      cp ${sample_id}.prot.fa ${sample_id}.prot.fa.largestIsoform.fa
-    fi
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
diff --git a/modules/local/longest.nf b/modules/local/longest.nf
new file mode 100644
index 0000000..7263db3
--- /dev/null
+++ b/modules/local/longest.nf
@@ -0,0 +1,34 @@
+process LONGEST {
+    // Runs AGAT on one sample: writes a longest-isoform GFF3 plus statistics for the input and filtered GFFs.
+    label 'process_medium'
+    label 'process_med_memory'
+    tag "$sample_id"
+    container = 'quay.io/biocontainers/agat:1.4.1--pl5321hdfd78af_0'
+
+    input:
+    tuple val (sample_id), path(fasta), path(gff)
+
+    output:
+    tuple val (sample_id), path( fasta ), path( "${sample_id}.longest.gff3" ), emit: longest_proteins
+    tuple val (sample_id), path( "${sample_id}.stat.original.txt" ), emit: agat_summary_original
+    tuple val (sample_id), path( "${sample_id}.stat.long.txt" ), emit: agat_summary_longest
+    path "versions.yml", emit: versions
+
+    script:
+    """
+    # Run agat to keep only the longest isoform of each gene (the fasta is passed through unchanged).
+    agat_sp_keep_longest_isoform.pl -gff ${gff} -o ${sample_id}.longest.gff3
+
+    # Summarise the annotation before and after filtering to report the actual genes being considered.
+    agat_sp_functional_statistics.pl --gff ${gff} -o ${sample_id}.stat.original.txt
+    agat_sp_functional_statistics.pl --gff ${sample_id}.longest.gff3 -o ${sample_id}.stat.long.txt
+
+    md5sum "${sample_id}.longest.gff3" > "${sample_id}.longest.gff3.md5"
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        agat: \$(agat_sp_keep_longest_isoform.pl --help | sed -n 's/.*(AGAT) - Version: \\(.*\\) .*/\\1/p')
+        Perl version: \$(perl --version | grep "version" | sed 's/.*(//g' | sed 's/[)].*//')
+    END_VERSIONS
+    """
+}
\ No newline at end of file