From 8ee5f90a8a1d786e940b9fa05731beff3889f219 Mon Sep 17 00:00:00 2001
From: Hunter Cameron <hunter.cameron418@gmail.com>
Date: Wed, 3 Jun 2015 11:47:31 -0400
Subject: [PATCH 1/3] Feature: print a FASTA of marker genes for each bin

Added the ability to print a FASTA file (amino acid) that includes the sequences
of the genes that were identified as marker genes in each bin. Feature is
implemented in the qa subprogram as a new output format (number 10).

Each sequence is printed with an informative header that takes the form:

>$bin_id $contig_name geneId=$gene_num;start=$start;end=$end;strand=$strand marker=$marker;mstart=$mstart;mend=$mend

Where $ denotes a variable.

	Variables:
	- bin_id: FASTA bin the sequence comes from
	- contig_name: contig the sequence comes from
	- gene_num: gene number as assigned by Prodigal
	- start: gene start position on contig (nucleotide)
	- end: gene end position on contig (nucleotide)
	- strand: strand the gene is on, 1 or -1
	- marker: name of the marker that was identified as a match to the gene
	- mstart: start position on gene of alignment with marker (amino acid)
	- mend: end position on gene of alignment with marker (amino acid)
---
 checkm/main.py          |  2 +-
 checkm/resultsParser.py | 66 +++++++++++++++++++++++++++++++++++++----
 checkm/util/seqUtils.py |  7 +++--
 3 files changed, 67 insertions(+), 8 deletions(-)

diff --git a/checkm/main.py b/checkm/main.py
index dbf3611..b4851ee 100644
--- a/checkm/main.py
+++ b/checkm/main.py
@@ -393,7 +393,7 @@ def qa(self, options):
                           )
 
         self.logger.info('')
-        RP.printSummary(options.out_format, aai, binIdToBinMarkerSets, options.bIndividualMarkers, options.coverage_file, options.bTabTable, options.file)
+        RP.printSummary(options.out_format, aai, binIdToBinMarkerSets, options.bIndividualMarkers, options.coverage_file, options.bTabTable, options.file, anaFolder=options.analyze_folder)
         RP.cacheResults(options.analyze_folder, binIdToBinMarkerSets, options.bIndividualMarkers)
 
         if options.file != '':
diff --git a/checkm/resultsParser.py b/checkm/resultsParser.py
index 07fd7c0..17ab280 100644
--- a/checkm/resultsParser.py
+++ b/checkm/resultsParser.py
@@ -35,7 +35,7 @@
 from checkm.hmmer import HMMERParser
 
 from checkm.util.pfam import PFAM
-
+from checkm.util.seqUtils import readFasta, writeFasta
 
 class ResultsParser():
     """Parse output of Prodigal+HMMER run and derived statistics."""
@@ -234,10 +234,12 @@ def __getHeader(self, outputFormat, binMarkerSets, coverageBinProfiles=None):
             header = ['Bin Id', 'Gene Id', '{Marker Id, Start position, End position}']
         elif outputFormat == 9:
             header = ['Scaffold Id', 'Bin Id', 'Length', '# contigs', 'GC', '# ORFs', 'Coding density', 'Marker Ids']
+        elif outputFormat == 10:
+            header = None
 
         return header
 
-    def printSummary(self, outputFormat, aai, binIdToBinMarkerSets, bIndividualMarkers, coverageFile, bTabTable, outFile):
+    def printSummary(self, outputFormat, aai, binIdToBinMarkerSets, bIndividualMarkers, coverageFile, bTabTable, outFile, anaFolder):
         # redirect output
         oldStdOut = reassignStdOut(outFile)
 
@@ -265,7 +267,7 @@ def printSummary(self, outputFormat, aai, binIdToBinMarkerSets, bIndividualMarke
 
         seqsReported = 0
         for binId in sorted(self.results.keys()):
-            seqsReported += self.results[binId].printSummary(outputFormat, aai, binIdToBinMarkerSets[binId], bIndividualMarkers, coverageBinProfiles, pTable)
+            seqsReported += self.results[binId].printSummary(outputFormat, aai, binIdToBinMarkerSets[binId], bIndividualMarkers, coverageBinProfiles, pTable, anaFolder)
 
         if outputFormat in [6, 7] and seqsReported == 0:
             print('[No marker genes satisfied the reporting criteria.]')
@@ -622,7 +624,8 @@ def getSummary(self, binMarkerSets, bIndividualMarkers, outputFormat=1):
 
         return summary
 
-    def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, coverageBinProfiles=None, table=None):
+    def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, coverageBinProfiles=None, table=None, anaFolder=None):
+
         """Print out information about bin."""
         if outputFormat == 1:
             selectedMarkerSet = binMarkerSets.selectedMarkerSet()
@@ -789,13 +792,66 @@ def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, cov
                     rowStr += '\t' + hit.query_accession + ',' + str(hit.ali_from) + ',' + str(hit.ali_to)
                 print(rowStr)
 
+        # Hunter Cameron, May 29, 2015 - print a fasta of marker genes
+        elif outputFormat == 10:
+            # tabular of bin_id, marker, contig_id
+           
+            # check for the analyze folder for later use
+            if anaFolder is None:
+                raise ValueError("AnaFolder must not be None for outputFormat 10")
+
+            ### build a dict to link target_names with marker gene alignment information
+            markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes()
+            hitInfo = {}
+            for marker, hit_list in self.markerHits.items():
+                if marker not in markerGenes:
+                    continue
+
+                for hit in hit_list:
+                    name = hit.target_name
+                    hitInfo[name] = {
+                            "marker": marker,
+                            "ali_from": hit.ali_from,
+                            "ali_to": hit.ali_to
+                            }
+
+            
+            ### Open genes.faa and print the ones that were found with some descriptive info in the header
+            path_to_genes = "/".join([anaFolder, "bins", self.binId, "genes.faa"])
+            for header, seq in readFasta(path_to_genes, trimHeader=False).iteritems():
+                elems = header.split(" # ")
+                gene_name = elems[0]
+                if gene_name in hitInfo:
+
+                    # remove the gene number from Prodigal to get the original contig name
+                    contig_name, gene_num = gene_name.rsplit("_", 1)
+
+                    # parse some info about the gene from the header line
+                    gene_start = elems[1]
+                    gene_end = elems[2]
+                    gene_strand = elems[3]
+
+                    gene_info = "geneId={};start={};end={};strand={};protlen={}".format(
+                            gene_num, gene_start, gene_end, gene_strand, str(len(seq)))
+                    
+                    marker_info = "marker={};mstart={};mend={}".format(
+                            hitInfo[gene_name]["marker"], 
+                            hitInfo[gene_name]["ali_from"], 
+                            hitInfo[gene_name]["ali_to"])
+
+                    # new header will be the bin name, contig name, gene info, and marker info separated by spaces
+                    new_header = ">" + " ".join([self.binId, contig_name, gene_info, marker_info])
+
+                    print(new_header, seq, sep="\n")
+            
+
         else:
             self.logger.error("Unknown output format: %d", outputFormat)
 
         return 0
 
         '''
-        elif outputFormat == 9:
+        elif outputFormat == 10:
             markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes()
 
             markersInScaffold = {}
diff --git a/checkm/util/seqUtils.py b/checkm/util/seqUtils.py
index e083afa..e2efa46 100644
--- a/checkm/util/seqUtils.py
+++ b/checkm/util/seqUtils.py
@@ -24,7 +24,7 @@
 import logging
 
 
-def readFasta(fastaFile):
+def readFasta(fastaFile, trimHeader=True):
     '''Read sequences from FASTA file.'''
     try:
         if fastaFile.endswith('.gz'):
@@ -39,7 +39,10 @@ def readFasta(fastaFile):
                 continue
 
             if line[0] == '>':
-                seqId = line[1:].split(None, 1)[0]
+                if trimHeader:
+                    seqId = line[1:].split(None, 1)[0]
+                else:
+                    seqId = line[1:].rstrip()
                 seqs[seqId] = []
             else:
                 seqs[seqId].append(line[0:-1])

From 0b29f9de2403c9adc813485033619ee07c0c9c4e Mon Sep 17 00:00:00 2001
From: Hunter Cameron <hunter.cameron418@gmail.com>
Date: Wed, 3 Jun 2015 12:27:44 -0400
Subject: [PATCH 2/3] Added argparse option for out 10

---
 bin/checkm | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/bin/checkm b/bin/checkm
index 1a98f0d..730f021 100755
--- a/bin/checkm
+++ b/bin/checkm
@@ -247,9 +247,10 @@ Example: checkm qa ./output/lineage.ms ./output
   5. list of bin id, marker gene id, gene id
   6. list of marker genes present multiple times in a bin
   7. list of marker genes present multiple times on the same scaffold
-  8. list indicating position of each marker gene within a bin''',
+  8. list indicating position of each marker gene within a bin
+  10. FASTA of marker genes identified in each bin''',
   # 9. scaffold statistics: scaffold id, bin id, length, GC, ..., marker gene(s)''',
-                                default=1, choices=[1, 2, 3, 4, 5, 6, 7, 8])
+                                default=1, choices=[1, 2, 3, 4, 5, 6, 7, 8, 10])
     qa_parser.add_argument('--exclude_markers', default=None, help="file specifying marker to exclude from marker sets")
     qa_parser.add_argument('--individual_markers', dest='bIndividualMarkers', action="store_true", default=False, help="treat marker as independent (i.e., ignore co-located set structure)")
     qa_parser.add_argument('--skip_adj_correction', dest='bSkipAdjCorrection', action="store_true", default=False, help="do not exclude adjacent marker genes when estimating contamination")

From 377205313a53db8c0f420d10cdef99a0ac9234c7 Mon Sep 17 00:00:00 2001
From: Hunter Cameron <hunter.cameron418@gmail.com>
Date: Thu, 4 Jun 2015 11:58:51 -0400
Subject: [PATCH 3/3] added table output to QA format 10

---
 checkm/resultsParser.py | 77 +++++++++++++++++++++++++++++++----------
 1 file changed, 59 insertions(+), 18 deletions(-)

diff --git a/checkm/resultsParser.py b/checkm/resultsParser.py
index 17ab280..df90284 100644
--- a/checkm/resultsParser.py
+++ b/checkm/resultsParser.py
@@ -196,7 +196,7 @@ def parseHmmerResults(self, fileName, resultsManager, bSkipAdjCorrection):
         except IOError as detail:
             sys.stderr.write(str(detail) + "\n")
 
-    def __getHeader(self, outputFormat, binMarkerSets, coverageBinProfiles=None):
+    def __getHeader(self, outputFormat, binMarkerSets, coverageBinProfiles=None, table=None):
         """Get header for requested output table."""
 
         # keep count of single, double, triple genes etc...
@@ -235,7 +235,10 @@ def __getHeader(self, outputFormat, binMarkerSets, coverageBinProfiles=None):
         elif outputFormat == 9:
             header = ['Scaffold Id', 'Bin Id', 'Length', '# contigs', 'GC', '# ORFs', 'Coding density', 'Marker Ids']
         elif outputFormat == 10:
-            header = None
+            if table is not None:
+                header = ['Bin Id', 'Contig', 'Gene Number', 'Gene Start', 'Gene End', 'Gene Strand', 'Prot Length', 'Marker Id', 'Align Start', 'Align End', 'Sequence']
+            else:
+                header = " "
 
         return header
 
@@ -248,9 +251,9 @@ def printSummary(self, outputFormat, aai, binIdToBinMarkerSets, bIndividualMarke
             coverage = Coverage(1)
             coverageBinProfiles = coverage.binProfiles(coverageFile)
 
-        prettyTableFormats = [1, 2, 3]
+        prettyTableFormats = [1, 2, 3, 10]
 
-        header = self.__getHeader(outputFormat, binIdToBinMarkerSets[binIdToBinMarkerSets.keys()[0]], coverageBinProfiles)
+        header = self.__getHeader(outputFormat, binIdToBinMarkerSets[binIdToBinMarkerSets.keys()[0]], coverageBinProfiles, bTabTable)
         if bTabTable or outputFormat not in prettyTableFormats:
             bTabTable = True
             pTable = None
@@ -276,7 +279,9 @@ def printSummary(self, outputFormat, aai, binIdToBinMarkerSets, bIndividualMarke
             if outputFormat in [1, 2]:
                 print(pTable.get_string(sortby='Completeness', reversesort=True))
             else:
-                print(pTable.get_string())
+                # only print if there are rows
+                if pTable.get_string(print_empty=False):
+                    print(pTable.get_string(print_empty=False))
 
         # restore stdout
         restoreStdOut(outFile, oldStdOut)
@@ -811,28 +816,49 @@ def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, cov
                     name = hit.target_name
                     hitInfo[name] = {
                             "marker": marker,
-                            "ali_from": hit.ali_from,
-                            "ali_to": hit.ali_to
+                            "ali_from": str(hit.ali_from),
+                            "ali_to": str(hit.ali_to)
                             }
 
             
             ### Open genes.faa and print the ones that were found with some descriptive info in the header
             path_to_genes = "/".join([anaFolder, "bins", self.binId, "genes.faa"])
-            for header, seq in readFasta(path_to_genes, trimHeader=False).iteritems():
+
+
+            # get only the seqs we need and their information as a dict
+            seqs = readFasta(path_to_genes, trimHeader=False)
+
+            filt_seqs = []
+            # remove seqs without markers
+            for header in seqs.keys():
+                gene_name = header.split(" # ")[0]
+                if gene_name in hitInfo:
+                    filt_seqs.append(header)
+
+
+            def sort_header(header):
+                """ sorts headers by contig and gene number """
+                name = header.split(" # ")[0]
+                ctg_name, gene_num = name.rsplit("_", 1)
+                return ctg_name, int(gene_num)
+
+
+            for header in sorted(filt_seqs, key=sort_header):
                 elems = header.split(" # ")
                 gene_name = elems[0]
-                if gene_name in hitInfo:
 
-                    # remove the gene number from Prodigal to get the original contig name
-                    contig_name, gene_num = gene_name.rsplit("_", 1)
+                # remove the gene number from Prodigal to get the original contig name
+                contig_name, gene_num = gene_name.rsplit("_", 1)
 
-                    # parse some info about the gene from the header line
-                    gene_start = elems[1]
-                    gene_end = elems[2]
-                    gene_strand = elems[3]
+                # parse some info about the gene from the header line
+                gene_start = elems[1]
+                gene_end = elems[2]
+                gene_strand = elems[3]
 
+                # if table output not specified, print FASTA
+                if table != None:
                     gene_info = "geneId={};start={};end={};strand={};protlen={}".format(
-                            gene_num, gene_start, gene_end, gene_strand, str(len(seq)))
+                            gene_num, gene_start, gene_end, gene_strand, str(len(seqs[header])))
                     
                     marker_info = "marker={};mstart={};mend={}".format(
                             hitInfo[gene_name]["marker"], 
@@ -842,9 +868,24 @@ def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, cov
                     # new header will be the bin name, contig name, gene info, and marker info separated by spaces
                     new_header = ">" + " ".join([self.binId, contig_name, gene_info, marker_info])
 
-                    print(new_header, seq, sep="\n")
-            
+                    print(new_header, seqs[header], sep="\n")
+                # otherwise, print a table
+                else:
+                    print("\t".join([
+                            self.binId, 
+                            contig_name,
+                            gene_num,
+                            gene_start,
+                            gene_end,
+                            gene_strand,
+                            str(len(seqs[header])),
+                            hitInfo[gene_name]["marker"],
+                            hitInfo[gene_name]["ali_from"],
+                            hitInfo[gene_name]["ali_to"],
+                            seqs[header]
+                            ]))
 
+            
         else:
             self.logger.error("Unknown output format: %d", outputFormat)