Ecogenomics · donovan-h-parks · Jun 8, 2015 · Jun 3, 2015 · Jun 3, 2015 · Jun 4, 2015
diff --git a/bin/checkm b/bin/checkm
@@ -247,9 +247,10 @@ Example: checkm qa ./output/lineage.ms ./output
   5. list of bin id, marker gene id, gene id
   6. list of marker genes present multiple times in a bin
   7. list of marker genes present multiple times on the same scaffold
-  8. list indicating position of each marker gene within a bin''',
+  8. list indicating position of each marker gene within a bin
+  10. FASTA of marker genes identified in each bin''',
   # 9. scaffold statistics: scaffold id, bin id, length, GC, ..., marker gene(s)''',
-                                default=1, choices=[1, 2, 3, 4, 5, 6, 7, 8])
+                                default=1, choices=[1, 2, 3, 4, 5, 6, 7, 8, 10])
     qa_parser.add_argument('--exclude_markers', default=None, help="file specifying marker to exclude from marker sets")
     qa_parser.add_argument('--individual_markers', dest='bIndividualMarkers', action="store_true", default=False, help="treat marker as independent (i.e., ignore co-located set structure)")
     qa_parser.add_argument('--skip_adj_correction', dest='bSkipAdjCorrection', action="store_true", default=False, help="do not exclude adjacent marker genes when estimating contamination")

diff --git a/checkm/main.py b/checkm/main.py
@@ -393,7 +393,7 @@ def qa(self, options):
                           )
 
         self.logger.info('')
-        RP.printSummary(options.out_format, aai, binIdToBinMarkerSets, options.bIndividualMarkers, options.coverage_file, options.bTabTable, options.file)
+        RP.printSummary(options.out_format, aai, binIdToBinMarkerSets, options.bIndividualMarkers, options.coverage_file, options.bTabTable, options.file, anaFolder=options.analyze_folder)
         RP.cacheResults(options.analyze_folder, binIdToBinMarkerSets, options.bIndividualMarkers)
 
         if options.file != '':

diff --git a/checkm/resultsParser.py b/checkm/resultsParser.py
@@ -35,7 +35,7 @@
 from checkm.hmmer import HMMERParser
 
 from checkm.util.pfam import PFAM
-
+from checkm.util.seqUtils import readFasta, writeFasta
 
 class ResultsParser():
     """Parse output of Prodigal+HMMER run and derived statistics."""
@@ -196,7 +196,7 @@ def parseHmmerResults(self, fileName, resultsManager, bSkipAdjCorrection):
         except IOError as detail:
             sys.stderr.write(str(detail) + "\n")
 
-    def __getHeader(self, outputFormat, binMarkerSets, coverageBinProfiles=None):
+    def __getHeader(self, outputFormat, binMarkerSets, coverageBinProfiles=None, table=None):
         """Get header for requested output table."""
 
         # keep count of single, double, triple genes etc...
@@ -234,10 +234,15 @@ def __getHeader(self, outputFormat, binMarkerSets, coverageBinProfiles=None):
             header = ['Bin Id', 'Gene Id', '{Marker Id, Start position, End position}']
         elif outputFormat == 9:
             header = ['Scaffold Id', 'Bin Id', 'Length', '# contigs', 'GC', '# ORFs', 'Coding density', 'Marker Ids']
+        elif outputFormat == 10:
+            if table is not None:
+                header = ['Bin Id', 'Contig', 'Gene Number', 'Gene Start', 'Gene End', 'Gene Strand', 'Prot Length', 'Marker Id', 'Align Start', 'Align End', 'Sequence']
+            else:
+                header = " "
 
         return header
 
-    def printSummary(self, outputFormat, aai, binIdToBinMarkerSets, bIndividualMarkers, coverageFile, bTabTable, outFile):
+    def printSummary(self, outputFormat, aai, binIdToBinMarkerSets, bIndividualMarkers, coverageFile, bTabTable, outFile, anaFolder):
         # redirect output
         oldStdOut = reassignStdOut(outFile)
 
@@ -246,9 +251,9 @@ def printSummary(self, outputFormat, aai, binIdToBinMarkerSets, bIndividualMarke
             coverage = Coverage(1)
             coverageBinProfiles = coverage.binProfiles(coverageFile)
 
-        prettyTableFormats = [1, 2, 3]
+        prettyTableFormats = [1, 2, 3, 10]
 
-        header = self.__getHeader(outputFormat, binIdToBinMarkerSets[binIdToBinMarkerSets.keys()[0]], coverageBinProfiles)
+        header = self.__getHeader(outputFormat, binIdToBinMarkerSets[binIdToBinMarkerSets.keys()[0]], coverageBinProfiles, bTabTable)
         if bTabTable or outputFormat not in prettyTableFormats:
             bTabTable = True
             pTable = None
@@ -265,7 +270,7 @@ def printSummary(self, outputFormat, aai, binIdToBinMarkerSets, bIndividualMarke
 
         seqsReported = 0
         for binId in sorted(self.results.keys()):
-            seqsReported += self.results[binId].printSummary(outputFormat, aai, binIdToBinMarkerSets[binId], bIndividualMarkers, coverageBinProfiles, pTable)
+            seqsReported += self.results[binId].printSummary(outputFormat, aai, binIdToBinMarkerSets[binId], bIndividualMarkers, coverageBinProfiles, pTable, anaFolder)
 
         if outputFormat in [6, 7] and seqsReported == 0:
             print('[No marker genes satisfied the reporting criteria.]')
@@ -274,7 +279,9 @@ def printSummary(self, outputFormat, aai, binIdToBinMarkerSets, bIndividualMarke
             if outputFormat in [1, 2]:
                 print(pTable.get_string(sortby='Completeness', reversesort=True))
             else:
-                print(pTable.get_string())
+                # only print if there are rows
+                if pTable.get_string(print_empty=False):
+                    print(pTable.get_string(print_empty=False))
 
         # restore stdout
         restoreStdOut(outFile, oldStdOut)
@@ -622,7 +629,8 @@ def getSummary(self, binMarkerSets, bIndividualMarkers, outputFormat=1):
 
         return summary
 
-    def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, coverageBinProfiles=None, table=None):
+    def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, coverageBinProfiles=None, table=None, anaFolder=None):
+
         """Print out information about bin."""
         if outputFormat == 1:
             selectedMarkerSet = binMarkerSets.selectedMarkerSet()
@@ -789,13 +797,102 @@ def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, cov
                     rowStr += '\t' + hit.query_accession + ',' + str(hit.ali_from) + ',' + str(hit.ali_to)
                 print(rowStr)
 
+        # Hunter Cameron, May 29, 2015 - print a fasta of marker genes
+        elif outputFormat == 10:
+            # tabular of bin_id, marker, contig_id
+
+            # check for the analyze folder for later use
+            if anaFolder is None:
+                raise ValueError("AnaFolder must not be None for outputFormat 10")
+
+            ### build a dict to link target_names with marker gene alignment information
+            markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes()
+            hitInfo = {}
+            for marker, hit_list in self.markerHits.items():
+                if marker not in markerGenes:
+                    continue
+
+                for hit in hit_list:
+                    name = hit.target_name
+                    hitInfo[name] = {
+                            "marker": marker,
+                            "ali_from": str(hit.ali_from),
+                            "ali_to": str(hit.ali_to)
+                            }
+
+
+            ### Open genes.faa and print the ones that were found with some descriptive info in the header
+            path_to_genes = "/".join([anaFolder, "bins", self.binId, "genes.faa"])
+
+
+            # get only the seqs we need and their information as a dict
+            seqs = readFasta(path_to_genes, trimHeader=False)
+
+            filt_seqs = []
+            # remove seqs without markers
+            for header in seqs.keys():
+                gene_name = header.split(" # ")[0]
+                if gene_name in hitInfo:
+                    filt_seqs.append(header)
+
+
+            def sort_header(header):
+                """ sorts headers by contig and gene number """
+                name = header.split(" # ")[0]
+                ctg_name, gene_num = name.rsplit("_", 1)
+                return ctg_name, int(gene_num)
+
+
+            for header in sorted(filt_seqs, key=sort_header):
+                elems = header.split(" # ")
+                gene_name = elems[0]
+
+                # remove the gene number from Prodigal to get the original contig name
+                contig_name, gene_num = gene_name.rsplit("_", 1)
+
+                # parse some info about the gene from the header line
+                gene_start = elems[1]
+                gene_end = elems[2]
+                gene_strand = elems[3]
+
+                # if table output not specified, print FASTA
+                if table != None:
+                    gene_info = "geneId={};start={};end={};strand={};protlen={}".format(
+                            gene_num, gene_start, gene_end, gene_strand, str(len(seqs[header])))
+
+                    marker_info = "marker={};mstart={};mend={}".format(
+                            hitInfo[gene_name]["marker"], 
+                            hitInfo[gene_name]["ali_from"], 
+                            hitInfo[gene_name]["ali_to"])
+
+                    # new header will be the bin name, contig name, gene info, and marker info separated by spaces
+                    new_header = ">" + " ".join([self.binId, contig_name, gene_info, marker_info])
+
+                    print(new_header, seqs[header], sep="\n")
+                # otherwise, print a table
+                else:
+                    print("\t".join([
+                            self.binId, 
+                            contig_name,
+                            gene_num,
+                            gene_start,
+                            gene_end,
+                            gene_strand,
+                            str(len(seqs[header])),
+                            hitInfo[gene_name]["marker"],
+                            hitInfo[gene_name]["ali_from"],
+                            hitInfo[gene_name]["ali_to"],
+                            seqs[header]
+                            ]))
+
+
         else:
             self.logger.error("Unknown output format: %d", outputFormat)
 
         return 0
 
         '''
-        elif outputFormat == 9:
+        elif outputFormat == 10:
             markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes()
 
             markersInScaffold = {}

diff --git a/checkm/util/seqUtils.py b/checkm/util/seqUtils.py
@@ -24,7 +24,7 @@
 import logging
 
 
-def readFasta(fastaFile):
+def readFasta(fastaFile, trimHeader=True):
     '''Read sequences from FASTA file.'''
     try:
         if fastaFile.endswith('.gz'):
@@ -39,7 +39,10 @@ def readFasta(fastaFile):
                 continue
 
             if line[0] == '>':
-                seqId = line[1:].split(None, 1)[0]
+                if trimHeader:
+                    seqId = line[1:].split(None, 1)[0]
+                else:
+                    seqId = line[1:].rstrip()
                 seqs[seqId] = []
             else:
                 seqs[seqId].append(line[0:-1])