Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions bin/checkm
Original file line number Diff line number Diff line change
Expand Up @@ -247,9 +247,10 @@ Example: checkm qa ./output/lineage.ms ./output
5. list of bin id, marker gene id, gene id
6. list of marker genes present multiple times in a bin
7. list of marker genes present multiple times on the same scaffold
8. list indicating position of each marker gene within a bin''',
8. list indicating position of each marker gene within a bin
10. FASTA of marker genes identified in each bin''',
# 9. scaffold statistics: scaffold id, bin id, length, GC, ..., marker gene(s)''',
default=1, choices=[1, 2, 3, 4, 5, 6, 7, 8])
default=1, choices=[1, 2, 3, 4, 5, 6, 7, 8, 10])
qa_parser.add_argument('--exclude_markers', default=None, help="file specifying marker to exclude from marker sets")
qa_parser.add_argument('--individual_markers', dest='bIndividualMarkers', action="store_true", default=False, help="treat marker as independent (i.e., ignore co-located set structure)")
qa_parser.add_argument('--skip_adj_correction', dest='bSkipAdjCorrection', action="store_true", default=False, help="do not exclude adjacent marker genes when estimating contamination")
Expand Down
2 changes: 1 addition & 1 deletion checkm/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -393,7 +393,7 @@ def qa(self, options):
)

self.logger.info('')
RP.printSummary(options.out_format, aai, binIdToBinMarkerSets, options.bIndividualMarkers, options.coverage_file, options.bTabTable, options.file)
RP.printSummary(options.out_format, aai, binIdToBinMarkerSets, options.bIndividualMarkers, options.coverage_file, options.bTabTable, options.file, anaFolder=options.analyze_folder)
RP.cacheResults(options.analyze_folder, binIdToBinMarkerSets, options.bIndividualMarkers)

if options.file != '':
Expand Down
115 changes: 106 additions & 9 deletions checkm/resultsParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
from checkm.hmmer import HMMERParser

from checkm.util.pfam import PFAM

from checkm.util.seqUtils import readFasta, writeFasta

class ResultsParser():
"""Parse output of Prodigal+HMMER run and derived statistics."""
Expand Down Expand Up @@ -196,7 +196,7 @@ def parseHmmerResults(self, fileName, resultsManager, bSkipAdjCorrection):
except IOError as detail:
sys.stderr.write(str(detail) + "\n")

def __getHeader(self, outputFormat, binMarkerSets, coverageBinProfiles=None):
def __getHeader(self, outputFormat, binMarkerSets, coverageBinProfiles=None, table=None):
"""Get header for requested output table."""

# keep count of single, double, triple genes etc...
Expand Down Expand Up @@ -234,10 +234,15 @@ def __getHeader(self, outputFormat, binMarkerSets, coverageBinProfiles=None):
header = ['Bin Id', 'Gene Id', '{Marker Id, Start position, End position}']
elif outputFormat == 9:
header = ['Scaffold Id', 'Bin Id', 'Length', '# contigs', 'GC', '# ORFs', 'Coding density', 'Marker Ids']
elif outputFormat == 10:
if table is not None:
header = ['Bin Id', 'Contig', 'Gene Number', 'Gene Start', 'Gene End', 'Gene Strand', 'Prot Length', 'Marker Id', 'Align Start', 'Align End', 'Sequence']
else:
header = " "

return header

def printSummary(self, outputFormat, aai, binIdToBinMarkerSets, bIndividualMarkers, coverageFile, bTabTable, outFile):
def printSummary(self, outputFormat, aai, binIdToBinMarkerSets, bIndividualMarkers, coverageFile, bTabTable, outFile, anaFolder):
# redirect output
oldStdOut = reassignStdOut(outFile)

Expand All @@ -246,9 +251,9 @@ def printSummary(self, outputFormat, aai, binIdToBinMarkerSets, bIndividualMarke
coverage = Coverage(1)
coverageBinProfiles = coverage.binProfiles(coverageFile)

prettyTableFormats = [1, 2, 3]
prettyTableFormats = [1, 2, 3, 10]

header = self.__getHeader(outputFormat, binIdToBinMarkerSets[binIdToBinMarkerSets.keys()[0]], coverageBinProfiles)
header = self.__getHeader(outputFormat, binIdToBinMarkerSets[binIdToBinMarkerSets.keys()[0]], coverageBinProfiles, bTabTable)
if bTabTable or outputFormat not in prettyTableFormats:
bTabTable = True
pTable = None
Expand All @@ -265,7 +270,7 @@ def printSummary(self, outputFormat, aai, binIdToBinMarkerSets, bIndividualMarke

seqsReported = 0
for binId in sorted(self.results.keys()):
seqsReported += self.results[binId].printSummary(outputFormat, aai, binIdToBinMarkerSets[binId], bIndividualMarkers, coverageBinProfiles, pTable)
seqsReported += self.results[binId].printSummary(outputFormat, aai, binIdToBinMarkerSets[binId], bIndividualMarkers, coverageBinProfiles, pTable, anaFolder)

if outputFormat in [6, 7] and seqsReported == 0:
print('[No marker genes satisfied the reporting criteria.]')
Expand All @@ -274,7 +279,9 @@ def printSummary(self, outputFormat, aai, binIdToBinMarkerSets, bIndividualMarke
if outputFormat in [1, 2]:
print(pTable.get_string(sortby='Completeness', reversesort=True))
else:
print(pTable.get_string())
# only print if there are rows
if pTable.get_string(print_empty=False):
print(pTable.get_string(print_empty=False))

# restore stdout
restoreStdOut(outFile, oldStdOut)
Expand Down Expand Up @@ -622,7 +629,8 @@ def getSummary(self, binMarkerSets, bIndividualMarkers, outputFormat=1):

return summary

def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, coverageBinProfiles=None, table=None):
def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, coverageBinProfiles=None, table=None, anaFolder=None):

"""Print out information about bin."""
if outputFormat == 1:
selectedMarkerSet = binMarkerSets.selectedMarkerSet()
Expand Down Expand Up @@ -789,13 +797,102 @@ def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, cov
rowStr += '\t' + hit.query_accession + ',' + str(hit.ali_from) + ',' + str(hit.ali_to)
print(rowStr)

# Hunter Cameron, May 29, 2015 - print a fasta of marker genes
elif outputFormat == 10:
# tabular of bin_id, marker, contig_id

# check for the analyze folder for later use
if anaFolder is None:
raise ValueError("AnaFolder must not be None for outputFormat 10")

### build a dict to link target_names with marker gene alignment information
markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes()
hitInfo = {}
for marker, hit_list in self.markerHits.items():
if marker not in markerGenes:
continue

for hit in hit_list:
name = hit.target_name
hitInfo[name] = {
"marker": marker,
"ali_from": str(hit.ali_from),
"ali_to": str(hit.ali_to)
}


### Open genes.faa and print the ones that were found with some descriptive info in the header
path_to_genes = "/".join([anaFolder, "bins", self.binId, "genes.faa"])


# get only the seqs we need and their information as a dict
seqs = readFasta(path_to_genes, trimHeader=False)

filt_seqs = []
# remove seqs without markers
for header in seqs.keys():
gene_name = header.split(" # ")[0]
if gene_name in hitInfo:
filt_seqs.append(header)


def sort_header(header):
""" sorts headers by contig and gene number """
name = header.split(" # ")[0]
ctg_name, gene_num = name.rsplit("_", 1)
return ctg_name, int(gene_num)


for header in sorted(filt_seqs, key=sort_header):
elems = header.split(" # ")
gene_name = elems[0]

# remove the gene number from Prodigal to get the original contig name
contig_name, gene_num = gene_name.rsplit("_", 1)

# parse some info about the gene from the header line
gene_start = elems[1]
gene_end = elems[2]
gene_strand = elems[3]

# if table output not specified, print FASTA
if table != None:
gene_info = "geneId={};start={};end={};strand={};protlen={}".format(
gene_num, gene_start, gene_end, gene_strand, str(len(seqs[header])))

marker_info = "marker={};mstart={};mend={}".format(
hitInfo[gene_name]["marker"],
hitInfo[gene_name]["ali_from"],
hitInfo[gene_name]["ali_to"])

# new header will be the bin name, contig name, gene info, and marker info separated by spaces
new_header = ">" + " ".join([self.binId, contig_name, gene_info, marker_info])

print(new_header, seqs[header], sep="\n")
# otherwise, print a table
else:
print("\t".join([
self.binId,
contig_name,
gene_num,
gene_start,
gene_end,
gene_strand,
str(len(seqs[header])),
hitInfo[gene_name]["marker"],
hitInfo[gene_name]["ali_from"],
hitInfo[gene_name]["ali_to"],
seqs[header]
]))


else:
self.logger.error("Unknown output format: %d", outputFormat)

return 0

'''
elif outputFormat == 9:
elif outputFormat == 10:
markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes()

markersInScaffold = {}
Expand Down
7 changes: 5 additions & 2 deletions checkm/util/seqUtils.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
import logging


def readFasta(fastaFile):
def readFasta(fastaFile, trimHeader=True):
'''Read sequences from FASTA file.'''
try:
if fastaFile.endswith('.gz'):
Expand All @@ -39,7 +39,10 @@ def readFasta(fastaFile):
continue

if line[0] == '>':
seqId = line[1:].split(None, 1)[0]
if trimHeader:
seqId = line[1:].split(None, 1)[0]
else:
seqId = line[1:].rstrip()
seqs[seqId] = []
else:
seqs[seqId].append(line[0:-1])
Expand Down