From 8ee5f90a8a1d786e940b9fa05731beff3889f219 Mon Sep 17 00:00:00 2001 From: Hunter Cameron Date: Wed, 3 Jun 2015 11:47:31 -0400 Subject: [PATCH 1/3] Feature: print a FASTA of marker genes for each bin Added the ability to print a FASTA file (amino acid) that includes the sequences of the genes that were identified as marker genes in each bin. Feature is implemented in the qa subprogram as a new output format (number 10). Each sequence is printed with an informative header that takes the form: >$bin_id $contig_name geneId=$gene_num;start=$start;end=$end;strand=$strand marker=$marker;mstart=$mstart;mend=$mend Where $ denotes a variable. Variables: - bin_id: FASTA bin the sequence comes from - contig_name: contig the sequence comes from - gene_num: gene number as assigned by Prodigal - start: gene start position on contig (nucleotide) - end: gene end position on contig (nucleotide) - strand: strand the gene is on, 1 or -1 - marker: name of the marker that was identified as a match to the gene - mstart: start position on gene of alignment with marker (amino acid) - mend: end position on gene of alignment with marker (amino acid) --- checkm/main.py | 2 +- checkm/resultsParser.py | 66 +++++++++++++++++++++++++++++++++++++---- checkm/util/seqUtils.py | 7 +++-- 3 files changed, 67 insertions(+), 8 deletions(-) diff --git a/checkm/main.py b/checkm/main.py index dbf3611..b4851ee 100644 --- a/checkm/main.py +++ b/checkm/main.py @@ -393,7 +393,7 @@ def qa(self, options): ) self.logger.info('') - RP.printSummary(options.out_format, aai, binIdToBinMarkerSets, options.bIndividualMarkers, options.coverage_file, options.bTabTable, options.file) + RP.printSummary(options.out_format, aai, binIdToBinMarkerSets, options.bIndividualMarkers, options.coverage_file, options.bTabTable, options.file, anaFolder=options.analyze_folder) RP.cacheResults(options.analyze_folder, binIdToBinMarkerSets, options.bIndividualMarkers) if options.file != '': diff --git a/checkm/resultsParser.py b/checkm/resultsParser.py index 07fd7c0..17ab280 100644 --- a/checkm/resultsParser.py +++ b/checkm/resultsParser.py @@ -35,7 +35,7 @@ from checkm.hmmer import HMMERParser from checkm.util.pfam import PFAM - +from checkm.util.seqUtils import readFasta, writeFasta class ResultsParser(): """Parse output of Prodigal+HMMER run and derived statistics.""" @@ -234,10 +234,12 @@ def __getHeader(self, outputFormat, binMarkerSets, coverageBinProfiles=None): header = ['Bin Id', 'Gene Id', '{Marker Id, Start position, End position}'] elif outputFormat == 9: header = ['Scaffold Id', 'Bin Id', 'Length', '# contigs', 'GC', '# ORFs', 'Coding density', 'Marker Ids'] + elif outputFormat == 10: + header = None return header - def printSummary(self, outputFormat, aai, binIdToBinMarkerSets, bIndividualMarkers, coverageFile, bTabTable, outFile): + def printSummary(self, outputFormat, aai, binIdToBinMarkerSets, bIndividualMarkers, coverageFile, bTabTable, outFile, anaFolder): # redirect output oldStdOut = reassignStdOut(outFile) @@ -265,7 +267,7 @@ def printSummary(self, outputFormat, aai, binIdToBinMarkerSets, bIndividualMarke seqsReported = 0 for binId in sorted(self.results.keys()): - seqsReported += self.results[binId].printSummary(outputFormat, aai, binIdToBinMarkerSets[binId], bIndividualMarkers, coverageBinProfiles, pTable) + seqsReported += self.results[binId].printSummary(outputFormat, aai, binIdToBinMarkerSets[binId], bIndividualMarkers, coverageBinProfiles, pTable, anaFolder) if outputFormat in [6, 7] and seqsReported == 0: print('[No marker genes satisfied the reporting criteria.]') @@ -622,7 +624,8 @@ def getSummary(self, binMarkerSets, bIndividualMarkers, outputFormat=1): return summary - def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, coverageBinProfiles=None, table=None): + def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, coverageBinProfiles=None, table=None, anaFolder=None): + """Print out information about bin.""" if outputFormat == 1: selectedMarkerSet = binMarkerSets.selectedMarkerSet() @@ -789,13 +792,66 @@ def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, cov rowStr += '\t' + hit.query_accession + ',' + str(hit.ali_from) + ',' + str(hit.ali_to) print(rowStr) + # Hunter Cameron, May 29, 2015 - print a fasta of marker genes + elif outputFormat == 10: + # tabular of bin_id, marker, contig_id + + # check for the analyze folder for later use + if anaFolder is None: + raise ValueError("AnaFolder must not be None for outputFormat 10") + + ### build a dict to link target_names with marker gene alignment information + markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes() + hitInfo = {} + for marker, hit_list in self.markerHits.items(): + if marker not in markerGenes: + continue + + for hit in hit_list: + name = hit.target_name + hitInfo[name] = { + "marker": marker, + "ali_from": hit.ali_from, + "ali_to": hit.ali_to + } + + + ### Open genes.faa and print the ones that were found with some descriptive info in the header + path_to_genes = "/".join([anaFolder, "bins", self.binId, "genes.faa"]) + for header, seq in readFasta(path_to_genes, trimHeader=False).iteritems(): + elems = header.split(" # ") + gene_name = elems[0] + if gene_name in hitInfo: + + # remove the gene number from Prodigal to get the original contig name + contig_name, gene_num = gene_name.rsplit("_", 1) + + # parse some info about the gene from the header line + gene_start = elems[1] + gene_end = elems[2] + gene_strand = elems[3] + + gene_info = "geneId={};start={};end={};strand={};protlen={}".format( + gene_num, gene_start, gene_end, gene_strand, str(len(seq))) + + marker_info = "marker={};mstart={};mend={}".format( + hitInfo[gene_name]["marker"], + hitInfo[gene_name]["ali_from"], + hitInfo[gene_name]["ali_to"]) + + # new header will be the bin name, contig name, gene info, and marker info separated by spaces + new_header = ">" + " ".join([self.binId, contig_name, gene_info, marker_info]) + + print(new_header, seq, sep="\n") + + else: self.logger.error("Unknown output format: %d", outputFormat) return 0 ''' - elif outputFormat == 9: + elif outputFormat == 10: markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes() markersInScaffold = {} diff --git a/checkm/util/seqUtils.py b/checkm/util/seqUtils.py index e083afa..e2efa46 100644 --- a/checkm/util/seqUtils.py +++ b/checkm/util/seqUtils.py @@ -24,7 +24,7 @@ import logging -def readFasta(fastaFile): +def readFasta(fastaFile, trimHeader=True): '''Read sequences from FASTA file.''' try: if fastaFile.endswith('.gz'): @@ -39,7 +39,10 @@ def readFasta(fastaFile): continue if line[0] == '>': - seqId = line[1:].split(None, 1)[0] + if trimHeader: + seqId = line[1:].split(None, 1)[0] + else: + seqId = line[1:].rstrip() seqs[seqId] = [] else: seqs[seqId].append(line[0:-1]) From 0b29f9de2403c9adc813485033619ee07c0c9c4e Mon Sep 17 00:00:00 2001 From: Hunter Cameron Date: Wed, 3 Jun 2015 12:27:44 -0400 Subject: [PATCH 2/3] Added argparse option for out 10 --- bin/checkm | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bin/checkm b/bin/checkm index 1a98f0d..730f021 100755 --- a/bin/checkm +++ b/bin/checkm @@ -247,9 +247,10 @@ Example: checkm qa ./output/lineage.ms ./output 5. list of bin id, marker gene id, gene id 6. list of marker genes present multiple times in a bin 7. list of marker genes present multiple times on the same scaffold - 8. list indicating position of each marker gene within a bin''', + 8. list indicating position of each marker gene within a bin + 10. FASTA of marker genes identified in each bin''', # 9. scaffold statistics: scaffold id, bin id, length, GC, ..., marker gene(s)''', - default=1, choices=[1, 2, 3, 4, 5, 6, 7, 8]) + default=1, choices=[1, 2, 3, 4, 5, 6, 7, 8, 10]) qa_parser.add_argument('--exclude_markers', default=None, help="file specifying marker to exclude from marker sets") qa_parser.add_argument('--individual_markers', dest='bIndividualMarkers', action="store_true", default=False, help="treat marker as independent (i.e., ignore co-located set structure)") qa_parser.add_argument('--skip_adj_correction', dest='bSkipAdjCorrection', action="store_true", default=False, help="do not exclude adjacent marker genes when estimating contamination") From 377205313a53db8c0f420d10cdef99a0ac9234c7 Mon Sep 17 00:00:00 2001 From: Hunter Cameron Date: Thu, 4 Jun 2015 11:58:51 -0400 Subject: [PATCH 3/3] added table output to QA format 10 --- checkm/resultsParser.py | 77 +++++++++++++++++++++++++++++++---------- 1 file changed, 59 insertions(+), 18 deletions(-) diff --git a/checkm/resultsParser.py b/checkm/resultsParser.py index 17ab280..df90284 100644 --- a/checkm/resultsParser.py +++ b/checkm/resultsParser.py @@ -196,7 +196,7 @@ def parseHmmerResults(self, fileName, resultsManager, bSkipAdjCorrection): except IOError as detail: sys.stderr.write(str(detail) + "\n") - def __getHeader(self, outputFormat, binMarkerSets, coverageBinProfiles=None): + def __getHeader(self, outputFormat, binMarkerSets, coverageBinProfiles=None, table=None): """Get header for requested output table.""" # keep count of single, double, triple genes etc... @@ -235,7 +235,10 @@ def __getHeader(self, outputFormat, binMarkerSets, coverageBinProfiles=None): elif outputFormat == 9: header = ['Scaffold Id', 'Bin Id', 'Length', '# contigs', 'GC', '# ORFs', 'Coding density', 'Marker Ids'] elif outputFormat == 10: - header = None + if table is not None: + header = ['Bin Id', 'Contig', 'Gene Number', 'Gene Start', 'Gene End', 'Gene Strand', 'Prot Length', 'Marker Id', 'Align Start', 'Align End', 'Sequence'] + else: + header = " " return header @@ -248,9 +251,9 @@ def printSummary(self, outputFormat, aai, binIdToBinMarkerSets, bIndividualMarke coverage = Coverage(1) coverageBinProfiles = coverage.binProfiles(coverageFile) - prettyTableFormats = [1, 2, 3] + prettyTableFormats = [1, 2, 3, 10] - header = self.__getHeader(outputFormat, binIdToBinMarkerSets[binIdToBinMarkerSets.keys()[0]], coverageBinProfiles) + header = self.__getHeader(outputFormat, binIdToBinMarkerSets[binIdToBinMarkerSets.keys()[0]], coverageBinProfiles, bTabTable) if bTabTable or outputFormat not in prettyTableFormats: bTabTable = True pTable = None @@ -276,7 +279,9 @@ def printSummary(self, outputFormat, aai, binIdToBinMarkerSets, bIndividualMarke if outputFormat in [1, 2]: print(pTable.get_string(sortby='Completeness', reversesort=True)) else: - print(pTable.get_string()) + # only print if there are rows + if pTable.get_string(print_empty=False): + print(pTable.get_string(print_empty=False)) # restore stdout restoreStdOut(outFile, oldStdOut) @@ -811,28 +816,49 @@ def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, cov name = hit.target_name hitInfo[name] = { "marker": marker, - "ali_from": hit.ali_from, - "ali_to": hit.ali_to + "ali_from": str(hit.ali_from), + "ali_to": str(hit.ali_to) } ### Open genes.faa and print the ones that were found with some descriptive info in the header path_to_genes = "/".join([anaFolder, "bins", self.binId, "genes.faa"]) - for header, seq in readFasta(path_to_genes, trimHeader=False).iteritems(): + + + # get only the seqs we need and their information as a dict + seqs = readFasta(path_to_genes, trimHeader=False) + + filt_seqs = [] + # remove seqs without markers + for header in seqs.keys(): + gene_name = header.split(" # ")[0] + if gene_name in hitInfo: + filt_seqs.append(header) + + + def sort_header(header): + """ sorts headers by contig and gene number """ + name = header.split(" # ")[0] + ctg_name, gene_num = name.rsplit("_", 1) + return ctg_name, int(gene_num) + + + for header in sorted(filt_seqs, key=sort_header): elems = header.split(" # ") gene_name = elems[0] - if gene_name in hitInfo: - # remove the gene number from Prodigal to get the original contig name - contig_name, gene_num = gene_name.rsplit("_", 1) + # remove the gene number from Prodigal to get the original contig name + contig_name, gene_num = gene_name.rsplit("_", 1) - # parse some info about the gene from the header line - gene_start = elems[1] - gene_end = elems[2] - gene_strand = elems[3] + # parse some info about the gene from the header line + gene_start = elems[1] + gene_end = elems[2] + gene_strand = elems[3] + # if table output not specified, print FASTA + if table != None: gene_info = "geneId={};start={};end={};strand={};protlen={}".format( - gene_num, gene_start, gene_end, gene_strand, str(len(seq))) + gene_num, gene_start, gene_end, gene_strand, str(len(seqs[header]))) marker_info = "marker={};mstart={};mend={}".format( hitInfo[gene_name]["marker"], @@ -842,9 +868,24 @@ def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, cov # new header will be the bin name, contig name, gene info, and marker info separated by spaces new_header = ">" + " ".join([self.binId, contig_name, gene_info, marker_info]) - print(new_header, seq, sep="\n") - + print(new_header, seqs[header], sep="\n") + # otherwise, print a table + else: + print("\t".join([ + self.binId, + contig_name, + gene_num, + gene_start, + gene_end, + gene_strand, + str(len(seqs[header])), + hitInfo[gene_name]["marker"], + hitInfo[gene_name]["ali_from"], + hitInfo[gene_name]["ali_to"], + seqs[header] + ])) + else: self.logger.error("Unknown output format: %d", outputFormat)