From c4b5ff435f220257d9871b7fe6c198b245b325ef Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Thu, 23 Mar 2023 18:54:00 -0700 Subject: [PATCH 01/21] The SAM_reader class now uses pysam for the initial parsing step. The SamSqValidator class has been updated accordingly. SAM header validation is delegated to pysam's AlignmentHeader.to_dict() method. When the user elects to have decollapsed SAM files produced, alignments are buffered as AlignedSegment objects because they use about 50% of the memory that the string representation does, and much less for the previous tuple representation. On the other hand, using pysam to parse alignments is 3.5x slower in _parse_alignments() and results in an overall runtime increase of about 20%. This might pay off in the form of BAM file support, but after looking over pysam's codebase I don't think there's a lot I can do to speed this up without writing our own in Cython. Ultimately it might be worthwhile to resurrect the old code and just have two separate parsing functions for SAM and BAM --- tiny/rna/counter/hts_parsing.py | 153 +++++++++++++------------------- tiny/rna/counter/validation.py | 9 +- 2 files changed, 68 insertions(+), 94 deletions(-) diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py index 3b28f265..2c6ee3b3 100644 --- a/tiny/rna/counter/hts_parsing.py +++ b/tiny/rna/counter/hts_parsing.py @@ -1,6 +1,7 @@ import functools import os.path import HTSeq +import pysam import sys import re @@ -22,10 +23,6 @@ Bundle = Tuple[List[AlignmentDict], int] _re_tiny = r"\d+_count=\d+" _re_fastx = r"seq\d+_x(\d+)" -_re_header = re.compile(r"^@(HD|SQ|RG|PG)(\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$|^@CO\t.*") - -# For Alignment -complement = {ord('A'): 'T', ord('T'): 'A', ord('G'): 'C', ord('C'): 'G'} class SAM_reader: @@ -39,24 +36,24 @@ def __init__(self, **prefs): self._collapser_token = None self._decollapsed_filename = None self._decollapsed_reads = [] - self._header_lines = [] - 
self._header_dict = defaultdict(list) + self._header_dict = {} + self._header = None def bundle_multi_alignments(self, file: str) -> Iterator[Bundle]: """Bundles multi-alignments (determined by a shared QNAME) and reports the associated read's count""" self.file = file - with open(file, 'rb') as f: - aln_iter = iter(self._parse_alignments(f)) - bundle, read_count = self._new_bundle(next(aln_iter)) - - for aln in aln_iter: - if aln['Name'] != bundle[0]['Name']: - yield bundle, read_count - bundle, read_count = self._new_bundle(aln) - else: - bundle.append(aln) - yield bundle, read_count + pysam_reader = pysam.AlignmentFile(file) + aln_iter = iter(self._parse_alignments(pysam_reader)) + bundle, read_count = self._new_bundle(next(aln_iter)) + + for aln in aln_iter: + if aln['Name'] != bundle[0]['Name']: + yield bundle, read_count + bundle, read_count = self._new_bundle(aln) + else: + bundle.append(aln) + yield bundle, read_count if self.decollapse and len(self._decollapsed_reads): self._write_decollapsed_sam() @@ -72,92 +69,67 @@ def _new_bundle(self, aln: AlignmentDict) -> Bundle: return [aln], count - def _parse_alignments(self, file_obj: IO) -> Iterator[AlignmentDict]: - """Parses and yields individual SAM alignments from the open file_obj""" + def _parse_alignments(self, reader: pysam.AlignmentFile) -> Iterator[AlignmentDict]: + """Yields alignment dictionaries containing relevant info from each pysam.AlignedSegment""" - line, line_no = self._read_to_first_aln(file_obj) - decollapse_sam = self.decollapse - - try: - while line: - line_no += 1 - cols = line.split(b'\t') + self._gather_metadata(reader) + first_line = len(str(self._header).splitlines()) + 1 + complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'} - if decollapse_sam: - self._decollapsed_reads.append((cols[0], line)) - if len(self._decollapsed_reads) > 100000: + for line_no, aln in enumerate(reader, start=first_line): + try: + if self.decollapse: + self._decollapsed_reads.append(aln) + if 
len(self._decollapsed_reads) > 100_000: self._write_decollapsed_sam() + if aln.is_unmapped: + continue - line = file_obj.readline() # Next line - start = int(cols[3]) - 1 - seq = cols[9] + seq = aln.query_sequence + start = aln.pos length = len(seq) + strand = aln.is_forward - # Note: we assume sRNA sequencing data is NOT reversely stranded - if (int(cols[1]) & 16): - strand = False # - + if strand: + nt5 = seq[0] + else: try: nt5 = complement[seq[-1]] except KeyError: - nt5 = chr(seq[-1]) - else: - strand = True # + - nt5 = chr(seq[0]) + nt5 = seq[-1] + + # Note: we assume sRNA sequencing data is NOT reversely stranded yield { - "Name": cols[0], + "Name": aln.query_name, "Length": length, "Seq": seq, "nt5end": nt5, - "Chrom": cols[2].decode(), + "Chrom": aln.reference_name, "Start": start, "End": start + length, "Strand": strand } - except Exception as e: - # Append to error message while preserving exception provenance and traceback - msg = f"Error occurred on line {line_no} of {self.file}" - append_to_exception(e, msg) - raise - - def _read_to_first_aln(self, file_obj: IO) -> Tuple[bytes, int]: - """Advance file_obj past the SAM header and return the first alignment unparsed""" - - lineno = 1 - line = file_obj.readline() - while line[0] == ord('@'): - self._parse_header_line(line.decode('utf-8')) - line = file_obj.readline() - lineno += 1 + except Exception as e: + # Append to error message while preserving exception provenance and traceback + msg = f"Error occurred on line {line_no} of {self.file}" + append_to_exception(e, msg) + raise - self._determine_collapser_type(line.decode()) - if self.decollapse: self._write_header_for_decollapsed_sam() + def _gather_metadata(self, reader: pysam.AlignmentFile) -> None: + """Saves header information, determines which collapser utility (if any) + was used before alignment, and copies the input file's header to the + decollapsed output file if necessary.""" - return line, lineno + header = reader.header + first_aln = 
next(reader.head(1)) - def _parse_header_line(self, header_line: str) -> None: - """Parses and saves information from the provided header line""" - - self.validate_header_line(header_line) - self._header_lines.append(header_line) - - # Header dict - rec_type = header_line[:3] - fields = header_line[3:].strip().split('\t') - - if rec_type == "@CO": - self._header_dict[rec_type].append(fields[0]) - else: - self._header_dict[rec_type].append( - {field[:2]: field[3:].strip() for field in fields} - ) + self._header = header + self._header_dict = header.to_dict() # performs validation + self._determine_collapser_type(first_aln.query_name) - def validate_header_line(self, line): - if not re.match(_re_header, line): - msg = "Invalid SAM header" - msg += f" in {os.path.basename(self.file)}" if self.file else "" - msg += ':\n\t' + f'"{line}"' - raise ValueError(msg) + if self.decollapse: + self._write_header_for_decollapsed_sam(str(header)) def _determine_collapser_type(self, first_aln_line: str) -> None: """Attempts to determine the collapsing utility that was used before producing the @@ -169,7 +141,7 @@ def _determine_collapser_type(self, first_aln_line: str) -> None: elif re.match(_re_fastx, first_aln_line) is not None: self.collapser_type = "fastx" - sort_order = self._header_dict.get('@HD', [{}])[0].get('SO', None) + sort_order = self._header_dict.get('HD', {}).get('SO', None) if sort_order is None or sort_order != "queryname": raise ValueError("SAM files from fastx collapsed outputs must be sorted by queryname\n" "(and the @HD [...] 
SO header must be set accordingly).") @@ -189,12 +161,12 @@ def _get_decollapsed_filename(self) -> str: self._decollapsed_filename = make_filename([basename, "decollapsed"], ext='.sam') return self._decollapsed_filename - def _write_header_for_decollapsed_sam(self) -> None: - """Writes the input file's header lines to the decollapsed output file""" + def _write_header_for_decollapsed_sam(self, header_str) -> None: + """Writes the provided header to the decollapsed output file""" assert self.collapser_type is not None with open(self._get_decollapsed_filename(), 'w') as f: - f.writelines(self._header_lines) + f.write(header_str) def _write_decollapsed_sam(self) -> None: """Expands the collapsed alignments in the _decollapsed_reads buffer @@ -202,12 +174,13 @@ def _write_decollapsed_sam(self) -> None: assert self.collapser_type is not None aln_out, prev_name, seq_count = [], None, 0 - for name, line in self._decollapsed_reads: + for aln in self._decollapsed_reads: # Parse count just once per multi-alignment + name = aln.query_name if name != prev_name: seq_count = int(name.split(self.collapser_token)[-1]) - aln_out.extend([line] * seq_count) + aln_out.extend([aln.to_string()] * seq_count) prev_name = name with open(self._get_decollapsed_filename(), 'ab') as sam_o: @@ -220,9 +193,9 @@ def collapser_token(self) -> bytes: if self._collapser_token is None: self._collapser_token = { - "tiny-collapse": b"=", - "fastx": b"_x", - "BioSeqZip": b":" + "tiny-collapse": "=", + "fastx": "_x", + "BioSeqZip": ":" }[self.collapser_type] return self._collapser_token diff --git a/tiny/rna/counter/validation.py b/tiny/rna/counter/validation.py index 43fd2af5..58b67d0b 100644 --- a/tiny/rna/counter/validation.py +++ b/tiny/rna/counter/validation.py @@ -1,5 +1,6 @@ import functools import subprocess +import pysam import sys import os @@ -341,8 +342,8 @@ def generate_identifier_report(self, duplicate, ambiguous): def read_sq_headers(self): for file in self.sam_files: - with open(file, 
'rb') as f: - reader = SAM_reader() - reader._read_to_first_aln(f) + sr = SAM_reader() + pr = pysam.AlignmentFile(file, check_sq=True) + sr._gather_metadata(pr) - self.sq_headers[file] = reader._header_dict.get('@SQ', []) \ No newline at end of file + self.sq_headers[file] = sr._header_dict.get('SQ', []) \ No newline at end of file From 50af6e30e0c0a86e40d8f18d0e73c74a9b72a5a5 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Fri, 31 Mar 2023 14:10:11 -0700 Subject: [PATCH 02/21] Adding some optimizations to the pure-python implementation of _parse_alignments(). It's better but not quite enough. Need to move this function into Cython space for any further gains --- tiny/rna/counter/hts_parsing.py | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py index 2c6ee3b3..60afaec2 100644 --- a/tiny/rna/counter/hts_parsing.py +++ b/tiny/rna/counter/hts_parsing.py @@ -73,22 +73,26 @@ def _parse_alignments(self, reader: pysam.AlignmentFile) -> Iterator[AlignmentDi """Yields alignment dictionaries containing relevant info from each pysam.AlignedSegment""" self._gather_metadata(reader) + decollapse, has_nm = self.decollapse, self.has_nm first_line = len(str(self._header).splitlines()) + 1 complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'} + aln: pysam.AlignedSegment for line_no, aln in enumerate(reader, start=first_line): try: - if self.decollapse: + if decollapse: self._decollapsed_reads.append(aln) if len(self._decollapsed_reads) > 100_000: self._write_decollapsed_sam() - if aln.is_unmapped: - continue + + flag = aln.flag + if flag & 0x4: + continue # Unmapped seq = aln.query_sequence - start = aln.pos - length = len(seq) - strand = aln.is_forward + start = aln.reference_start + length = aln.query_length + strand = not (flag & 16) # Note: we assume sRNA sequencing data is NOT reversely stranded if strand: nt5 = seq[0] @@ -98,17 +102,26 @@ 
def _parse_alignments(self, reader: pysam.AlignmentFile) -> Iterator[AlignmentDi except KeyError: nt5 = seq[-1] - # Note: we assume sRNA sequencing data is NOT reversely stranded + if has_nm: + try: + mismatches = aln.get_tag(b"NM") + except KeyError: + # If the first alignment had an NM tag, assume missing tag means NM:i:0 + mismatches = 0 + else: + # Calculate mismatches using the CIGAR string's I, D, and X operations + mismatches = sum(length for op, length in aln.cigartuples if op in (1, 2, 8)) yield { "Name": aln.query_name, "Length": length, "Seq": seq, "nt5end": nt5, - "Chrom": aln.reference_name, + "Chrom": self.references[aln.reference_id], "Start": start, "End": start + length, - "Strand": strand + "Strand": strand, + "Mismatches": mismatches } except Exception as e: # Append to error message while preserving exception provenance and traceback @@ -127,6 +140,8 @@ def _gather_metadata(self, reader: pysam.AlignmentFile) -> None: self._header = header self._header_dict = header.to_dict() # performs validation self._determine_collapser_type(first_aln.query_name) + self.has_nm = first_aln.has_tag("NM") + self.references = header.references if self.decollapse: self._write_header_for_decollapsed_sam(str(header)) From 7edae91c0c4d618979c78e049ca09b559a607759 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Fri, 31 Mar 2023 14:24:43 -0700 Subject: [PATCH 03/21] The _parse_alignments() method of SAM_reader has been replaced by a dedicated Cython extension class. The extension acts as an iterable over pysam alignments that have been converted to dictionary form on the fly. Importantly, it allows us to eliminate nearly all calls to the Python-space API for pysam and instead retrieve information directly from Cython space. It is much faster. The runtime cost of switching to pysam is now a mere 5% slowdown (down from 20-30% initially). 
In the spirit of minimizing Cython footprint due to debugging complications, the extension class still accumulates alignments for decollapsing, but uses a callback method from SAM_reader's Python space to actually write the decollapsed alignments --- setup.py | 12 ++-- tiny/rna/counter/hts_parsing.py | 74 +++---------------- tiny/rna/counter/parsing/__init__.py | 1 + tiny/rna/counter/parsing/alignments.pyx | 95 +++++++++++++++++++++++++ 4 files changed, 115 insertions(+), 67 deletions(-) create mode 100644 tiny/rna/counter/parsing/__init__.py create mode 100644 tiny/rna/counter/parsing/alignments.pyx diff --git a/setup.py b/setup.py index dd1d966d..72089139 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,7 @@ #!/usr/bin/env python import os import sys +import pysam import setuptools from setuptools.command.install import install @@ -40,9 +41,10 @@ def get_cython_extension_defs(): error out if there are build issues, and therefore must be used as optional imports.""" pyx_files = [ - # (file path, optional) - ('tiny/rna/counter/stepvector/_stepvector.pyx', True), - ('tests/cython_tests/stepvector/test_cython.pyx', True) + # (file path, optional, include) + ('tiny/rna/counter/stepvector/_stepvector.pyx', True, []), + ('tests/cython_tests/stepvector/test_cython.pyx', True, []), + ('tiny/rna/counter/parsing/alignments.pyx', False, pysam.get_include()) ] cxx_extension_args = { @@ -61,9 +63,10 @@ def get_cython_extension_defs(): return [setuptools.Extension( pyx_filename.replace('./', '').replace('/', '.').rstrip('.pyx'), sources=[pyx_filename], + include_dirs=include, optional=optional, **cxx_extension_args) - for pyx_filename, optional in pyx_files] + for pyx_filename, optional, include in pyx_files] def get_macos_sdk_path(): @@ -120,6 +123,7 @@ def get_macos_sdk_path(): ext_modules=cythonize( get_cython_extension_defs(), compiler_directives={'language_level': '3'}, + include_path=pysam.get_include(), gdb_debug=False ), scripts=scripts, diff --git 
a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py index 60afaec2..0692c3b5 100644 --- a/tiny/rna/counter/hts_parsing.py +++ b/tiny/rna/counter/hts_parsing.py @@ -11,6 +11,7 @@ from urllib.parse import unquote from inspect import stack +from rna.counter.parsing.alignments import AlignmentIter from tiny.rna.counter.matching import Wildcard from tiny.rna.util import report_execution_time, make_filename, ReportFormatter, append_to_exception @@ -32,19 +33,26 @@ def __init__(self, **prefs): self.decollapse = prefs.get("decollapse", False) self.out_prefix = prefs.get("out_prefix", None) self.collapser_type = None + self.references = [] + self.has_nm = None self.file = None + self._collapser_token = None - self._decollapsed_filename = None - self._decollapsed_reads = [] self._header_dict = {} self._header = None + self._decollapsed_callback = self._write_decollapsed_sam if self.decollapse else None + self._decollapsed_filename = None + self._decollapsed_reads = [] + def bundle_multi_alignments(self, file: str) -> Iterator[Bundle]: """Bundles multi-alignments (determined by a shared QNAME) and reports the associated read's count""" self.file = file pysam_reader = pysam.AlignmentFile(file) - aln_iter = iter(self._parse_alignments(pysam_reader)) + self._gather_metadata(pysam_reader) + + aln_iter = AlignmentIter(pysam_reader, self.has_nm, self._decollapsed_callback, self._decollapsed_reads) bundle, read_count = self._new_bundle(next(aln_iter)) for aln in aln_iter: @@ -69,66 +77,6 @@ def _new_bundle(self, aln: AlignmentDict) -> Bundle: return [aln], count - def _parse_alignments(self, reader: pysam.AlignmentFile) -> Iterator[AlignmentDict]: - """Yields alignment dictionaries containing relevant info from each pysam.AlignedSegment""" - - self._gather_metadata(reader) - decollapse, has_nm = self.decollapse, self.has_nm - first_line = len(str(self._header).splitlines()) + 1 - complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'} - aln: pysam.AlignedSegment - - 
for line_no, aln in enumerate(reader, start=first_line): - try: - if decollapse: - self._decollapsed_reads.append(aln) - if len(self._decollapsed_reads) > 100_000: - self._write_decollapsed_sam() - - flag = aln.flag - if flag & 0x4: - continue # Unmapped - - seq = aln.query_sequence - start = aln.reference_start - length = aln.query_length - strand = not (flag & 16) # Note: we assume sRNA sequencing data is NOT reversely stranded - - if strand: - nt5 = seq[0] - else: - try: - nt5 = complement[seq[-1]] - except KeyError: - nt5 = seq[-1] - - if has_nm: - try: - mismatches = aln.get_tag(b"NM") - except KeyError: - # If the first alignment had an NM tag, assume missing tag means NM:i:0 - mismatches = 0 - else: - # Calculate mismatches using the CIGAR string's I, D, and X operations - mismatches = sum(length for op, length in aln.cigartuples if op in (1, 2, 8)) - - yield { - "Name": aln.query_name, - "Length": length, - "Seq": seq, - "nt5end": nt5, - "Chrom": self.references[aln.reference_id], - "Start": start, - "End": start + length, - "Strand": strand, - "Mismatches": mismatches - } - except Exception as e: - # Append to error message while preserving exception provenance and traceback - msg = f"Error occurred on line {line_no} of {self.file}" - append_to_exception(e, msg) - raise - def _gather_metadata(self, reader: pysam.AlignmentFile) -> None: """Saves header information, determines which collapser utility (if any) was used before alignment, and copies the input file's header to the diff --git a/tiny/rna/counter/parsing/__init__.py b/tiny/rna/counter/parsing/__init__.py new file mode 100644 index 00000000..38f362d1 --- /dev/null +++ b/tiny/rna/counter/parsing/__init__.py @@ -0,0 +1 @@ +from .sam_reader import AlignmentIter \ No newline at end of file diff --git a/tiny/rna/counter/parsing/alignments.pyx b/tiny/rna/counter/parsing/alignments.pyx new file mode 100644 index 00000000..c335b1b4 --- /dev/null +++ b/tiny/rna/counter/parsing/alignments.pyx @@ -0,0 +1,95 @@ 
+# cython: language_level = 3 +# cython: profile=False + +cimport cython +from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment +from pysam.libcalignedsegment cimport pysam_bam_get_qname +from pysam.libchtslib cimport bam_aux_get, bam_aux2i, BAM_FUNMAP, BAM_FREVERSE + +cdef tuple cigar_mismatch = (1, 2, 8) + +cdef class AlignmentIter: + + cdef AlignmentFile reader + cdef tuple references + cdef object dc_callback + cdef list dc_queue + cdef bint decollapse + cdef bint has_nm + + def __init__(self, pysam_reader, has_nm, dc_callback, dc_queue): + self.reader = pysam_reader + self.references = pysam_reader.header.references + self.decollapse = dc_callback is not None + self.dc_callback = dc_callback + self.dc_queue = dc_queue + self.has_nm = has_nm + + def __iter__(self): + return self + + def __next__(self): + return self._next_alignment() + + cdef dict _next_alignment(self): + cdef: + AlignedSegment aln = next(self.reader) # Equivalent API calls: + int flag = aln._delegate.core.flag # aln.flag + str seq = aln.query_sequence # aln.query_sequence + int start = aln._delegate.core.pos # aln.reference_start + int length = aln._delegate.core.l_qseq # aln.query_length + bint strand = (flag & BAM_FREVERSE) == 0 # aln.is_forward + str nt5 = self._get_nt5(seq, strand) + str name = pysam_bam_get_qname(aln._delegate).decode() # aln.query_name + + if (flag & BAM_FUNMAP) != 0: # aln.is_unmapped + return self._next_alignment() + + if self.decollapse: + self._append_to_dc_queue(aln) + + if self.has_nm: + mismatches = AlignmentIter._get_mismatches_from_nm(aln) # aln.get_tag("NM") + else: + mismatches = AlignmentIter._get_mismatches_from_cigar(aln) + + return { + "Name": name, + "Length": length, + "Seq": seq, + "nt5end": nt5, + "Chrom": self.references[aln._delegate.core.tid], # aln.reference_name + "Start": start, + "End": start + length, + "Strand": strand, + "NM": mismatches + } + + @staticmethod + cdef _get_mismatches_from_nm(AlignedSegment aln): + nm = 
bam_aux_get(aln._delegate, b"NM") + if nm == NULL: return 0 # Assume missing tag means NM:i:0 + return bam_aux2i(nm) + + @staticmethod + cdef _get_mismatches_from_cigar(AlignedSegment aln): + # Calculate mismatches using the CIGAR string's I, D, and X operations + return sum(op_len for op, op_len in aln.cigartuples if op in cigar_mismatch) + + cdef str _get_nt5(self, str seq, bint strand): + cdef str nt + + if strand: + return seq[0] + else: + nt = seq[-1] + if nt == "A": return "T" + elif nt == "T": return "A" + elif nt == "G": return "C" + elif nt == "C": return "G" + else: return nt + + cdef _append_to_dc_queue(self, AlignedSegment aln): + self.dc_queue.append(aln) + if len(self.dc_queue) > 100_000: + self.dc_callback() \ No newline at end of file From e8b897fe0651ab2eeddb5de8a99a7a919479692b Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Fri, 31 Mar 2023 14:40:50 -0700 Subject: [PATCH 04/21] The Mismatches selector has been added to tiny-count, the CSVReader, and the Features Sheet. An additional class has not been added to matching.py for this selector because it is most efficiently achieved using the existing NumericalMatch class. The Mismatch selector is embedded in the feature's match tuples alongside the Overlap selector. It is unpacked from the match tuple and evaluated in Stage 2 selection. 
--- START_HERE/features.csv | 14 +++++++------- tiny/rna/configuration.py | 13 +++++++++++++ tiny/rna/counter/features.py | 7 ++++--- tiny/rna/counter/hts_parsing.py | 4 ++-- 4 files changed, 26 insertions(+), 12 deletions(-) diff --git a/START_HERE/features.csv b/START_HERE/features.csv index 012e1130..9b053b11 100644 --- a/START_HERE/features.csv +++ b/START_HERE/features.csv @@ -1,7 +1,7 @@ -Select for...,with value...,Classify as...,Source Filter,Type Filter,Hierarchy,Strand,5' End Nucleotide,Length,Overlap -Class,mask,masked,,,1,both,all,all,Partial -Class,miRNA,miRNA,,,2,sense,all,16-24,5' anchored -Class,piRNA,piRNA-5'A,,,2,both,A,24-32,Nested -Class,piRNA,piRNA-5'T,,,2,both,T,24-32,Nested -Class,siRNA,siRNA,,,2,both,all,15-22,Nested -Class,unk,unknown,,,3,both,all,all,Nested +Select for...,with value...,Classify as...,Source Filter,Type Filter,Hierarchy,Overlap,Mismatches,Strand,5' End Nucleotide,Length +Class,mask,masked,,,1,Partial,,both,all,all +Class,miRNA,miRNA,,,2,5' anchored,,sense,all,16-24 +Class,piRNA,piRNA-5'A,,,2,Nested,,both,A,24-32 +Class,piRNA,piRNA-5'T,,,2,Nested,,both,T,24-32 +Class,siRNA,siRNA,,,2,Nested,,both,all,15-22 +Class,unk,unknown,,,3,Nested,,both,all,all \ No newline at end of file diff --git a/tiny/rna/configuration.py b/tiny/rna/configuration.py index 2f0308da..995f6d9f 100644 --- a/tiny/rna/configuration.py +++ b/tiny/rna/configuration.py @@ -771,6 +771,7 @@ class CSVReader(csv.DictReader): "5' End Nucleotide": "nt5end", "Length": "Length", "Overlap": "Overlap", + "Mismatches": "Mismatch" }), "Samples Sheet": OrderedDict({ "FASTQ/SAM Files": "File", @@ -858,6 +859,10 @@ def validate_csv_header(self, header: OrderedDict): self.fieldnames = tuple(doc_ref_lowercase[key] for key in header_lowercase.values()) def check_backward_compatibility(self, header_vals): + """Catch column differences from old versions so a helpful error can be provided""" + + assert all(val.isupper() for val in header_vals), "Lowercase headers expected." 
+ compat_errors = [] if self.doctype == "Features Sheet": if len(header_vals & {'alias by...', 'feature source'}): @@ -888,6 +893,14 @@ def check_backward_compatibility(self, header_vals): '"Classify as..." to avoid this error.' ])) + if 'mismatches' not in header_vals: + compat_errors.append('\n'.join([ + "It looks like you're using a Features Sheet from an earlier version of", + 'tinyRNA. An additional column, "Mismatches", is now expected. Please review' + "the Stage 2 section in tiny-count's documentation for more info, then add" + "the new column to your Features Sheet to avoid this error." + ])) + if compat_errors: raise ValueError('\n\n'.join(compat_errors)) diff --git a/tiny/rna/counter/features.py b/tiny/rna/counter/features.py index 5eb40d16..517f61d1 100644 --- a/tiny/rna/counter/features.py +++ b/tiny/rna/counter/features.py @@ -136,8 +136,8 @@ def choose(cls, candidates: Set[feature_record_tuple], alignment: dict) -> Mappi # Stage 2 hits = [(rank, rule, feat, strand) for feat, strand, matches in candidates - for rule, rank, iv_match in matches - if alignment in iv_match] + for rule, rank, iv_match, nm_match in matches + if alignment in iv_match and alignment['NM'] in nm_match] if not hits: return {} hits.sort(key=lambda x: x[0]) @@ -178,6 +178,7 @@ def build_selectors(rules_table) -> List[dict]: "Strand": StrandMatch, "nt5end": NtMatch, "Length": NumericalMatch, + "Mismatch": NumericalMatch, "Identity": lambda x:x } @@ -261,7 +262,7 @@ def build_interval_selectors(self, iv: 'HTSeq.GenomicInterval', match_tuples: Li continue # Replace the match tuple's definition with selector - built_match_tuple = (match[0], match[1], selector_obj) + built_match_tuple = (match[0], match[1], selector_obj, match[3]) matches_by_interval[match_iv].append(built_match_tuple) return matches_by_interval diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py index 0692c3b5..a2d4ac22 100644 --- a/tiny/rna/counter/hts_parsing.py +++ 
b/tiny/rna/counter/hts_parsing.py @@ -561,10 +561,10 @@ def get_matches(self, row: HTSeq.GenomicFeature) -> DefaultDict: if self.column_filter_match(row, rule): # Unclassified matches are pooled under '' empty string identity_matches[rule['Class']].add( - (index, rule['Hierarchy'], rule['Overlap']) + (index, rule['Hierarchy'], rule['Overlap'], rule['Mismatch']) ) - # -> identity_matches: {classifier: {(rule, rank, overlap), ...}, ...} + # -> identity_matches: {classifier: {(rule, rank, overlap, mismatch), ...}, ...} return identity_matches def add_feature(self, feature_id: str, row, matches: defaultdict) -> str: From 34801f2827277dc1dd84fe071e47f1d08b9a2dc6 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Mon, 3 Apr 2023 15:45:41 -0700 Subject: [PATCH 05/21] Fixing package import issues for tiny.rna.counter.parsing and adding type annotations to the AlignmentIter constructor --- tiny/rna/counter/parsing/__init__.py | 2 +- tiny/rna/counter/parsing/alignments.pxd | 0 tiny/rna/counter/parsing/alignments.pyx | 4 +++- 3 files changed, 4 insertions(+), 2 deletions(-) create mode 100644 tiny/rna/counter/parsing/alignments.pxd diff --git a/tiny/rna/counter/parsing/__init__.py b/tiny/rna/counter/parsing/__init__.py index 38f362d1..9c7015c7 100644 --- a/tiny/rna/counter/parsing/__init__.py +++ b/tiny/rna/counter/parsing/__init__.py @@ -1 +1 @@ -from .sam_reader import AlignmentIter \ No newline at end of file +from .alignments import AlignmentIter \ No newline at end of file diff --git a/tiny/rna/counter/parsing/alignments.pxd b/tiny/rna/counter/parsing/alignments.pxd new file mode 100644 index 00000000..e69de29b diff --git a/tiny/rna/counter/parsing/alignments.pyx b/tiny/rna/counter/parsing/alignments.pyx index c335b1b4..0b7fbb48 100644 --- a/tiny/rna/counter/parsing/alignments.pyx +++ b/tiny/rna/counter/parsing/alignments.pyx @@ -6,6 +6,8 @@ from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment from pysam.libcalignedsegment cimport 
pysam_bam_get_qname from pysam.libchtslib cimport bam_aux_get, bam_aux2i, BAM_FUNMAP, BAM_FREVERSE +from typing import Callable + cdef tuple cigar_mismatch = (1, 2, 8) cdef class AlignmentIter: @@ -17,7 +19,7 @@ cdef class AlignmentIter: cdef bint decollapse cdef bint has_nm - def __init__(self, pysam_reader, has_nm, dc_callback, dc_queue): + def __init__(self, pysam_reader: AlignmentFile, has_nm: bool, dc_callback: Callable, dc_queue: list): self.reader = pysam_reader self.references = pysam_reader.header.references self.decollapse = dc_callback is not None From f2ad1a9c70b270eeae3a638ff5d6696c82584e39 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Mon, 3 Apr 2023 16:05:04 -0700 Subject: [PATCH 06/21] - Small refactors to make unit testing a little easier - QNAME splitting with the collapser token is slightly more reliable - Updating type hints --- tiny/rna/counter/hts_parsing.py | 47 +++++++++++++-------------------- 1 file changed, 18 insertions(+), 29 deletions(-) diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py index a2d4ac22..b18c4fd0 100644 --- a/tiny/rna/counter/hts_parsing.py +++ b/tiny/rna/counter/hts_parsing.py @@ -11,7 +11,7 @@ from urllib.parse import unquote from inspect import stack -from rna.counter.parsing.alignments import AlignmentIter +from tiny.rna.counter.parsing import AlignmentIter from tiny.rna.counter.matching import Wildcard from tiny.rna.util import report_execution_time, make_filename, ReportFormatter, append_to_exception @@ -20,7 +20,7 @@ _re_attr_empty = re.compile(r"^\s*$") # For SAM_reader -AlignmentDict = Dict[str, Union[str, int, bytes]] +AlignmentDict = Dict[str, Union[str, int]] Bundle = Tuple[List[AlignmentDict], int] _re_tiny = r"\d+_count=\d+" _re_fastx = r"seq\d+_x(\d+)" @@ -40,6 +40,7 @@ def __init__(self, **prefs): self._collapser_token = None self._header_dict = {} self._header = None + self._iter = None self._decollapsed_callback = self._write_decollapsed_sam if 
self.decollapse else None self._decollapsed_filename = None @@ -48,14 +49,14 @@ def __init__(self, **prefs): def bundle_multi_alignments(self, file: str) -> Iterator[Bundle]: """Bundles multi-alignments (determined by a shared QNAME) and reports the associated read's count""" - self.file = file pysam_reader = pysam.AlignmentFile(file) - self._gather_metadata(pysam_reader) + self.file = file - aln_iter = AlignmentIter(pysam_reader, self.has_nm, self._decollapsed_callback, self._decollapsed_reads) - bundle, read_count = self._new_bundle(next(aln_iter)) + self._gather_metadata(pysam_reader) + self._iter = AlignmentIter(pysam_reader, self.has_nm, self._decollapsed_callback, self._decollapsed_reads) + bundle, read_count = self._new_bundle(next(self._iter)) - for aln in aln_iter: + for aln in self._iter: if aln['Name'] != bundle[0]['Name']: yield bundle, read_count bundle, read_count = self._new_bundle(aln) @@ -70,17 +71,16 @@ def _new_bundle(self, aln: AlignmentDict) -> Bundle: """Wraps the provided alignment in a list and reports the read's count""" if self.collapser_type is not None: - token = self.collapser_token - count = int(aln['Name'].split(token)[-1]) + token = self._collapser_token + count = int(aln['Name'].rsplit(token, 1)[1]) else: count = 1 return [aln], count def _gather_metadata(self, reader: pysam.AlignmentFile) -> None: - """Saves header information, determines which collapser utility (if any) - was used before alignment, and copies the input file's header to the - decollapsed output file if necessary.""" + """Saves header information, examines the first alignment to determine which collapser utility was used (if any), and whether the alignment has a query sequence and NM tag. 
+ The input file's header is written to the decollapsed output file if necessary.""" header = reader.header first_aln = next(reader.head(1)) @@ -94,15 +94,17 @@ def _gather_metadata(self, reader: pysam.AlignmentFile) -> None: if self.decollapse: self._write_header_for_decollapsed_sam(str(header)) - def _determine_collapser_type(self, first_aln_line: str) -> None: + def _determine_collapser_type(self, qname: str) -> None: """Attempts to determine the collapsing utility that was used before producing the input alignment file, then checks basic requirements for the utility's outputs.""" - if re.match(_re_tiny, first_aln_line) is not None: + if re.match(_re_tiny, qname) is not None: self.collapser_type = "tiny-collapse" + self._collapser_token = "=" - elif re.match(_re_fastx, first_aln_line) is not None: + elif re.match(_re_fastx, qname) is not None: self.collapser_type = "fastx" + self._collapser_token = "_x" sort_order = self._header_dict.get('HD', {}).get('SO', None) if sort_order is None or sort_order != "queryname": @@ -141,7 +143,7 @@ def _write_decollapsed_sam(self) -> None: # Parse count just once per multi-alignment name = aln.query_name if name != prev_name: - seq_count = int(name.split(self.collapser_token)[-1]) + seq_count = int(name.rsplit(self._collapser_token, 1)[1]) aln_out.extend([aln.to_string()] * seq_count) prev_name = name @@ -150,19 +152,6 @@ def _write_decollapsed_sam(self) -> None: sam_o.writelines(aln_out) self._decollapsed_reads.clear() - @property - def collapser_token(self) -> bytes: - """Returns the split token to be used for determining read count from the QNAME field""" - - if self._collapser_token is None: - self._collapser_token = { - "tiny-collapse": "=", - "fastx": "_x", - "BioSeqZip": ":" - }[self.collapser_type] - - return self._collapser_token - def infer_strandedness(sam_file: str, intervals: dict) -> str: """Infers strandedness from a sample SAM file and intervals from a parsed GFF file From 
49347937b9c805e2e06240c4b5e1e4c7ba001a10 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Mon, 3 Apr 2023 16:06:18 -0700 Subject: [PATCH 07/21] Adding a check to _gather_metadata(). The first alignment is inspected to make sure it has a read sequence. If it doesn't it's an error --- tiny/rna/counter/hts_parsing.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py index b18c4fd0..1717f42a 100644 --- a/tiny/rna/counter/hts_parsing.py +++ b/tiny/rna/counter/hts_parsing.py @@ -85,6 +85,9 @@ def _gather_metadata(self, reader: pysam.AlignmentFile) -> None: header = reader.header first_aln = next(reader.head(1)) + if first_aln.query_sequence is None: + raise ValueError("SAM alignments must include the read sequence.") + self._header = header self._header_dict = header.to_dict() # performs validation self._determine_collapser_type(first_aln.query_name) @@ -459,9 +462,9 @@ class ReferenceFeatures(ReferenceBase): Children of the root ancestor are otherwise not stored in the reference tables. Match-tuples are created for each Features Sheet rule which matches a feature's attributes. - They are structured as (rank, rule, overlap). "Rank" is the hierarchy value of the matching - rule, "rule" is the index of that rule in FeatureSelector's rules table, and "overlap" is the - IntervalSelector per the rule's overlap requirements. + They are structured as (rank, rule, overlap, mismatches). "Rank" is the hierarchy value of + the matching rule, "rule" is the index of that rule in FeatureSelector's rules table, and + "overlap" is the IntervalSelector per the rule's overlap requirements. Source and type filters allow the user to define acceptable values for columns 2 and 3 of the GFF, respectively. 
These filters are inclusive (only rows with matching values are parsed), From 60bdbda6a11d96c164b9337face22c0ce39af413 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Mon, 3 Apr 2023 16:08:06 -0700 Subject: [PATCH 08/21] Updating SamSqValidator.read_sq_headers() to use pysam directly (not necessary to use SAM_reader here) --- tiny/rna/counter/validation.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tiny/rna/counter/validation.py b/tiny/rna/counter/validation.py index 58b67d0b..a55b488e 100644 --- a/tiny/rna/counter/validation.py +++ b/tiny/rna/counter/validation.py @@ -342,8 +342,5 @@ def generate_identifier_report(self, duplicate, ambiguous): def read_sq_headers(self): for file in self.sam_files: - sr = SAM_reader() - pr = pysam.AlignmentFile(file, check_sq=True) - sr._gather_metadata(pr) - - self.sq_headers[file] = sr._header_dict.get('SQ', []) \ No newline at end of file + pr = pysam.AlignmentFile(file, check_sq=False) + self.sq_headers[file] = pr.header.to_dict().get('SQ', []) From 035f4b513b15e65a1df163e19286e9e97b9a8180 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Mon, 3 Apr 2023 16:10:22 -0700 Subject: [PATCH 09/21] Backtracking on a recent change to CSVReader.check_backward_compatibility() --- tiny/rna/configuration.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tiny/rna/configuration.py b/tiny/rna/configuration.py index 995f6d9f..20c49984 100644 --- a/tiny/rna/configuration.py +++ b/tiny/rna/configuration.py @@ -861,11 +861,11 @@ def validate_csv_header(self, header: OrderedDict): def check_backward_compatibility(self, header_vals): """Catch column differences from old versions so a helpful error can be provided""" - assert all(val.isupper() for val in header_vals), "Lowercase headers expected." 
+ header_vals_lc = {val.lower() for val in header_vals} compat_errors = [] if self.doctype == "Features Sheet": - if len(header_vals & {'alias by...', 'feature source'}): + if len(header_vals_lc & {'alias by...', 'feature source'}): compat_errors.append('\n'.join([ "It looks like you're using a Features Sheet from an earlier version of", "tinyRNA. Feature aliases and GFF files are now defined in the Paths File.", @@ -874,7 +874,7 @@ def check_backward_compatibility(self, header_vals): "your Features Sheet to avoid this error." ])) - if len(header_vals & {'source filter', 'type filter'}) != 2: + if len(header_vals_lc & {'source filter', 'type filter'}) != 2: compat_errors.append('\n'.join([ "It looks like you're using a Features Sheet from an earlier version of", "tinyRNA. Source and type filters are now defined in the Features Sheet.", @@ -883,7 +883,7 @@ def check_backward_compatibility(self, header_vals): '"Source Filter" and "Type Filter" to your Features Sheet to avoid this error.' ])) - if 'tag' in header_vals: + if 'tag' in header_vals_lc: compat_errors.append('\n'.join([ "It looks like you're using a Features Sheet from a version of tinyRNA", 'that offered "tagged counting". The "Tag" header has been repurposed as a feature', @@ -893,7 +893,7 @@ def check_backward_compatibility(self, header_vals): '"Classify as..." to avoid this error.' ])) - if 'mismatches' not in header_vals: + if 'mismatches' not in header_vals_lc: compat_errors.append('\n'.join([ "It looks like you're using a Features Sheet from an earlier version of", 'tinyRNA. An additional column, "Mismatches", is now expected. 
Please review' From 80aac9d25c616eeaaf9fc1664e063afffc5f4132 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Mon, 3 Apr 2023 16:11:00 -0700 Subject: [PATCH 10/21] Updates for type hints and docstrings --- tiny/rna/counter/features.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tiny/rna/counter/features.py b/tiny/rna/counter/features.py index 517f61d1..512292bd 100644 --- a/tiny/rna/counter/features.py +++ b/tiny/rna/counter/features.py @@ -2,17 +2,19 @@ import sys from collections import defaultdict -from typing import List, Tuple, Set, Dict, Mapping +from typing import List, Tuple, Set, Dict, Mapping, Union -from tiny.rna.counter.hts_parsing import SAM_reader, ReferenceFeatures, ReferenceSeqs +from tiny.rna.counter.hts_parsing import ReferenceFeatures, ReferenceSeqs, SAM_reader from .statistics import LibraryStats from .matching import * from ..util import append_to_exception # Type aliases for human readability -match_tuple = Tuple[int, int, IntervalSelector] # (rank, rule, IntervalSelector) -unbuilt_match_tuple = Tuple[int, int, str] # (rank, rule, interval selector keyword) -feature_record_tuple = Tuple[str, str, Tuple[match_tuple]] # (feature ID, strand, match tuple) +interval_selector = Union[IntervalSelector, Wildcard] +mismatch_selector = Union[NumericalMatch, Wildcard] +match_tuple = Tuple[int, int, interval_selector, mismatch_selector] # (rank, rule, IntervalSelector, NumericalMatch) +unbuilt_match_tuple = Tuple[int, int, str, mismatch_selector] # (rank, rule, overlap, NumericalMatch) +feature_record_tuple = Tuple[str, str, Tuple[match_tuple]] # (feature ID, strand, match tuple) class Features(metaclass=Singleton): @@ -89,14 +91,12 @@ class FeatureSelector: The first round of selection was performed during GFF parsing. - The second round is performed against the hierarchy values and - IntervalSelectors in each candidate's match-tuples. 
+ The second round is performed against the overlap and mismatch selectors + which are stored in each candidate's match tuples. These matches are then + sorted by hierarchy value. If more than one candidate remains, a final round of selection is performed - against sequence attributes: strand, 5' end nucleotide, and length. Rules for - 5' end nucleotides support lists (e.g. C,G,U) and wildcards (e.g. "all"). - Rules for length support lists, wildcards, and ranges (e.g. 20-27) which - may be intermixed in the same rule. + against sequence attributes: strand, 5' end nucleotide, and length. """ rules_table: List[dict] @@ -122,7 +122,7 @@ def choose(cls, candidates: Set[feature_record_tuple], alignment: dict) -> Mappi ( featureID, strand, ( match_tuple, ... ) ) Each match_tuple represents a rule which matched the feature on identity: - ( rule, rank, IntervalSelector ) + ( rule, rank, IntervalSelector, NumericalSelector ) Args: candidates: a list of tuples, each representing features associated with From 01406845f9cb476a5af6173762e99c5840bbbd05 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Mon, 3 Apr 2023 16:13:03 -0700 Subject: [PATCH 11/21] Updating Features Sheets in testdata and template folders. Mismatches column. Column order now matches current selection diagram. 
--- tests/testdata/config_files/features.csv | 14 +++++++------- tiny/templates/features.csv | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/testdata/config_files/features.csv b/tests/testdata/config_files/features.csv index 8a6e12d1..1ad42073 100755 --- a/tests/testdata/config_files/features.csv +++ b/tests/testdata/config_files/features.csv @@ -1,7 +1,7 @@ -Select for...,with value...,Classify as...,Source Filter,Type Filter,Hierarchy,Strand,5' End Nucleotide,Length,Overlap -Class,mask,,,,1,both,all,all,Partial -Class,miRNA,,,,2,sense,all,16-22,Nested -Class,piRNA,5pA,,,2,both,A,24-32,Nested -Class,piRNA,5pT,,,2,both,T,24-32,Nested -Class,siRNA,,,,2,both,all,15-22,Nested -Class,unk,,,,3,both,all,all,Nested \ No newline at end of file +Select for...,with value...,Classify as...,Source Filter,Type Filter,Hierarchy,Overlap,Mismatches,Strand,5' End Nucleotide,Length +Class,mask,,,,1,Partial,0,both,all,all +Class,miRNA,,,,2,Nested,0,sense,all,16-22 +Class,piRNA,5pA,,,2,Nested,0,both,A,24-32 +Class,piRNA,5pT,,,2,Nested,0,both,T,24-32 +Class,siRNA,,,,2,Nested,0,both,all,15-22 +Class,unk,,,,3,Nested,0,both,all,all \ No newline at end of file diff --git a/tiny/templates/features.csv b/tiny/templates/features.csv index 8ced3c8d..067f78dd 100755 --- a/tiny/templates/features.csv +++ b/tiny/templates/features.csv @@ -1,2 +1,2 @@ -Select for...,with value...,Classify as...,Source Filter,Type Filter,Hierarchy,Strand,5' End Nucleotide,Length,Overlap +Select for...,with value...,Classify as...,Source Filter,Type Filter,Hierarchy,Overlap,Mismatches,Strand,5' End Nucleotide,Length ,,,,,,,,, \ No newline at end of file From d6fd8f41fb8f4a266a69779b251167a358507eac Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Mon, 3 Apr 2023 16:15:05 -0700 Subject: [PATCH 12/21] Updating unit tests for the new Features Sheet and match tuple formats --- tests/unit_test_helpers.py | 12 +++++++----- tests/unit_tests_counter.py | 6 ++++-- 
tests/unit_tests_features.py | 22 +++++++++++----------- tests/unit_tests_matching.py | 18 +++++++++--------- 4 files changed, 31 insertions(+), 27 deletions(-) diff --git a/tests/unit_test_helpers.py b/tests/unit_test_helpers.py index 3bbc6579..0b6aaf45 100644 --- a/tests/unit_test_helpers.py +++ b/tests/unit_test_helpers.py @@ -22,9 +22,10 @@ 'Filter_t': "", 'Strand': "both", 'Hierarchy': 0, + 'Overlap': "partial", + 'Mismatch': "", 'nt5end': "all", - 'Length': "all", # A string is expected by FeatureSelector due to support for lists and ranges - 'Overlap': "partial"}] + 'Length': "all",}] # A string is expected by FeatureSelector due to support for lists and ranges def csv_factory(type: str, rows: List[dict], header=()): @@ -130,7 +131,7 @@ def get_dir_checksum_tree(root_path: str) -> dict: return dir_tree -def make_parsed_sam_record(Name="0_count=1", Seq="CAAGACAGAGCTTCACCGTTC", Chrom='I', Start=15064570, Strand:bool = True): +def make_parsed_sam_record(Name="0_count=1", Seq="CAAGACAGAGCTTCACCGTTC", Chrom='I', Start=15064570, Strand=True, NM=0): return { "Name": Name, "Length": len(Seq), @@ -139,7 +140,8 @@ def make_parsed_sam_record(Name="0_count=1", Seq="CAAGACAGAGCTTCACCGTTC", Chrom= "Chrom": Chrom, "Start": Start, "End": Start + len(Seq), - "Strand": Strand + "Strand": Strand, + "NM": NM } @@ -186,7 +188,7 @@ def strand_to_bool(strand): # For performing reverse complement -complement = bytes.maketrans(b'ACGTacgt', b'TGCAtgca') +complement = str.maketrans('ACGTacgt', 'TGCAtgca') class ShellCapture: diff --git a/tests/unit_tests_counter.py b/tests/unit_tests_counter.py index 20d9107c..e32a0fa6 100644 --- a/tests/unit_tests_counter.py +++ b/tests/unit_tests_counter.py @@ -36,10 +36,11 @@ def setUpClass(self): 'Filter_s': "", 'Filter_t': "", 'Hierarchy': "1", + 'Overlap': "Partial", + 'Mismatch': "", 'Strand': "antisense", "nt5end": '"C,G,U"', # Needs to be double-quoted due to commas 'Length': "all", - 'Overlap': "Partial" } # Represents the parsed Features 
Sheet row above @@ -51,10 +52,11 @@ def setUpClass(self): 'Filter_s': _row['Filter_s'], 'Filter_t': _row['Filter_t'], 'Hierarchy': int(_row['Hierarchy']), + 'Overlap': _row['Overlap'].lower(), + 'Mismatch': _row['Mismatch'], 'Strand': _row['Strand'], 'nt5end': _row["nt5end"].upper().translate({ord('U'): 'T'}), 'Length': _row['Length'], - 'Overlap': _row['Overlap'].lower() }] # Represents an unparsed Samples Sheet row diff --git a/tests/unit_tests_features.py b/tests/unit_tests_features.py index 55c41410..236be93e 100644 --- a/tests/unit_tests_features.py +++ b/tests/unit_tests_features.py @@ -27,7 +27,7 @@ def make_feature_for_interval_test(self, iv_rule, feat_id, chrom, strand: str, s fs = FeatureSelector(rules) # Feature with specified coordinates, matching rule 0 with hierarchy 0 and the appropriate selector for iv_rule - selectors = fs.build_interval_selectors(feat_iv, [(0, 0, iv_rule)]) + selectors = fs.build_interval_selectors(feat_iv, [(0, 0, iv_rule, Wildcard())]) match_tuple = (selectors[feat_iv][0],) feat = {(feat_id, strand_to_bool(strand), match_tuple)} @@ -398,19 +398,19 @@ def test_build_interval_selectors_grouping(self): fs = FeatureSelector(deepcopy(rules_template)) iv = HTSeq.GenomicInterval('I', 10, 20, '+') - match_tuples = [('n/a', 'n/a', 'partial'), - ('n/a', 'n/a', 'nested'), - ('n/a', 'n/a', 'exact'), - ('n/a', 'n/a', "5' anchored"), - ('n/a', 'n/a', "3' anchored"), + match_tuples = [('n/a', 'n/a', 'partial', 'n/a'), + ('n/a', 'n/a', 'nested', 'n/a'), + ('n/a', 'n/a', 'exact', 'n/a'), + ('n/a', 'n/a', "5' anchored", 'n/a'), + ('n/a', 'n/a', "3' anchored", 'n/a'), # iv_shifted_1 Shift values: - ('n/a', 'n/a', 'partial, -5, 5'), # 5': -5 3': 5 - ('n/a', 'n/a', 'nested, -5, 5'), # 5': -5 3': 5 + ('n/a', 'n/a', 'partial, -5, 5', 'n/a'), # 5': -5 3': 5 + ('n/a', 'n/a', 'nested, -5, 5', 'n/a'), # 5': -5 3': 5 # iv_shifted_2 - ('n/a', 'n/a', 'exact, -10, 10'), # 5': -10 3': 10 + ('n/a', 'n/a', 'exact, -10, 10', 'n/a'), # 5': -10 3': 10 # 
iv_shifted_3 - ('n/a', 'n/a', "5' anchored, -1, -1"), # 5': -1 3': -1 - ('n/a', 'n/a', "3' anchored, -1, -1")] # 5': -1 3': -1 + ('n/a', 'n/a', "5' anchored, -1, -1", 'n/a'), # 5': -1 3': -1 + ('n/a', 'n/a', "3' anchored, -1, -1", 'n/a')] # 5': -1 3': -1 iv_shifted_1 = HTSeq.GenomicInterval('I', 5, 25, '+') iv_shifted_2 = HTSeq.GenomicInterval('I', 0, 30, '+') diff --git a/tests/unit_tests_matching.py b/tests/unit_tests_matching.py index 25f618dd..aa335349 100644 --- a/tests/unit_tests_matching.py +++ b/tests/unit_tests_matching.py @@ -20,15 +20,15 @@ def test_feature_selector_interval_build(self): iv = HTSeq.GenomicInterval('I', 0, 10, '+') # Match tuples formed during GFF parsing - match_tuples = [('n/a', 'n/a', 'partial'), - ('n/a', 'n/a', 'nested'), - ('n/a', 'n/a', 'exact'), - ('n/a', 'n/a', 'anchored'), - ('n/a', 'n/a', "5' anchored"), - ('n/a', 'n/a', "3' anchored"), - ('n/a', 'n/a', "5'anchored"), # spaces should be optional - ('n/a', 'n/a', "3'anchored")] - match_tuples += [('n/a', 'n/a', kwd) for kwd in Wildcard.kwds] + match_tuples = [('n/a', 'n/a', 'partial', Wildcard()), + ('n/a', 'n/a', 'nested', Wildcard()), + ('n/a', 'n/a', 'exact', Wildcard()), + ('n/a', 'n/a', 'anchored', Wildcard()), + ('n/a', 'n/a', "5' anchored", Wildcard()), + ('n/a', 'n/a', "3' anchored", Wildcard()), + ('n/a', 'n/a', "5'anchored", Wildcard()), # spaces should be optional + ('n/a', 'n/a', "3'anchored", Wildcard())] + match_tuples += [('n/a', 'n/a', kwd, Wildcard()) for kwd in Wildcard.kwds] result = fs.build_interval_selectors(iv, match_tuples) From b6857f867be2102166039940089423a4eca49047 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Mon, 3 Apr 2023 16:15:50 -0700 Subject: [PATCH 13/21] Adding a unit test for SamSqValidator.read_sq_headers() --- tests/unit_tests_validation.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/tests/unit_tests_validation.py b/tests/unit_tests_validation.py index 7df9cac5..32a0845f 
100644 --- a/tests/unit_tests_validation.py +++ b/tests/unit_tests_validation.py @@ -6,7 +6,7 @@ from glob import glob from unittest.mock import patch, mock_open -from rna.counter.validation import GFFValidator, ReportFormatter, SamSqValidator +from tiny.rna.counter.validation import GFFValidator, ReportFormatter, SamSqValidator class GFFValidatorTest(unittest.TestCase): @@ -315,6 +315,23 @@ def make_parsed_sq_header(self, chroms): {'LN': chrom[1]} for chrom in chroms] + """Does read_sq_headers() return the expected data?""" + + def test_read_sq_headers(self): + sam_file = 'testdata/counter/validation/sam/sq_headers.sam' + expected = { + sam_file: [ + {'SN': 'I', 'LN': 123}, + {'SN': 'II', 'LN': 456}, + {'SN': 'III'} + ], + } + + validator = SamSqValidator([sam_file]) + validator.read_sq_headers() + + self.assertDictEqual(validator.sq_headers, expected) + """Does SamSqValidator correctly identify missing @SQ headers?""" def test_missing_sq_headers(self): From ee74985777bff4ff09ab4f8d07cd3911c4818cf6 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Mon, 3 Apr 2023 16:19:01 -0700 Subject: [PATCH 14/21] Updating non-SAM_reader tests to use the new match tuple format --- tests/unit_tests_hts_parsing.py | 59 +++++++++++++++++---------------- 1 file changed, 30 insertions(+), 29 deletions(-) diff --git a/tests/unit_tests_hts_parsing.py b/tests/unit_tests_hts_parsing.py index 587fdee0..70ddb75e 100644 --- a/tests/unit_tests_hts_parsing.py +++ b/tests/unit_tests_hts_parsing.py @@ -15,6 +15,7 @@ import unit_test_helpers as helpers resources = "./testdata/counter" +wc = Wildcard() # used in many tests as a 'n/a' value # To run all test suites if __name__ == '__main__': @@ -256,7 +257,7 @@ def test_ref_tables_single_feature(self): steps = list(feats[iv].array[iv.start:iv.end].get_steps(values_only=True)) self.assertEqual((type(feats), type(alias), type(tags)), (HTSeq.GenomicArrayOfSets, dict, defaultdict)) - self.assertEqual(steps, 
[{(("Gene:WBGene00023193", 'tag'), False, ((1, 2, IntervalPartialMatch(iv)),))}]) + self.assertEqual(steps, [{(("Gene:WBGene00023193", 'tag'), False, ((1, 2, IntervalPartialMatch(iv), wc),))}]) self.assertEqual(alias, {"Gene:WBGene00023193": ('Y74C9A.6',)}) self.assertDictEqual(tags, {"Gene:WBGene00023193": {('Gene:WBGene00023193', 'tag')}}) @@ -280,7 +281,7 @@ def test_ref_tables_single_feature_all_features_false(self): steps = list(feats[iv].array[iv.start:iv.end].get_steps(values_only=True)) self.assertEqual((type(feats), type(alias), type(tags)), (HTSeq.GenomicArrayOfSets, dict, defaultdict)) - self.assertEqual(steps, [{(("Gene:WBGene00023193", 'tag'), False, ((1, 2, IntervalPartialMatch(iv)),))}]) + self.assertEqual(steps, [{(("Gene:WBGene00023193", 'tag'), False, ((1, 2, IntervalPartialMatch(iv), wc),))}]) self.assertDictEqual(alias, {"Gene:WBGene00023193": ('Y74C9A.6',)}) self.assertDictEqual(tags, {"Gene:WBGene00023193": {('Gene:WBGene00023193', 'tag')}}) @@ -365,7 +366,7 @@ def test_ref_tables_identity_matches_multisource_concat(self): expected_matches = [ set(), - {(('Gene:WBGene00023193', ''), False, ((0, 1, ivm), (1, 2, ivm), (2, 3, ivm)))}, + {(('Gene:WBGene00023193', ''), False, ((0, 1, ivm, wc), (1, 2, ivm, wc), (2, 3, ivm, wc)))}, set() ] @@ -466,10 +467,10 @@ def test_ref_tables_discontinuous_identity_matches(self): sib_ivs = [HTSeq.GenomicInterval('I', 99, 110, '-'), HTSeq.GenomicInterval('I', 110, 120, '-'), HTSeq.GenomicInterval('I', 139, 150, '-')] - rule1_gp = {f"{iv.start}:{iv.end}": ((0, 2, IntervalPartialMatch(iv)),) for iv in gp_ivs} - rule1_p2 = {f"{iv.start}:{iv.end}": ((0, 2, IntervalPartialMatch(iv)),) for iv in p2_ivs} - rule2_sib = {f"{iv.start}:{iv.end}": (1, 3, IntervalPartialMatch(iv)) for iv in sib_ivs} - rule3_sib = {f"{iv.start}:{iv.end}": (2, 0, IntervalPartialMatch(iv)) for iv in sib_ivs} + rule1_gp = {f"{iv.start}:{iv.end}": ((0, 2, IntervalPartialMatch(iv), wc),) for iv in gp_ivs} + rule1_p2 = {f"{iv.start}:{iv.end}": ((0, 
2, IntervalPartialMatch(iv), wc),) for iv in p2_ivs} + rule2_sib = {f"{iv.start}:{iv.end}": (1, 3, IntervalPartialMatch(iv), wc) for iv in sib_ivs} + rule3_sib = {f"{iv.start}:{iv.end}": (2, 0, IntervalPartialMatch(iv), wc) for iv in sib_ivs} # For tables that store features in tagged form GrandParent, Parent2, Sibling = ('GrandParent',''), ('Parent2',''), ('Sibling','') @@ -502,7 +503,7 @@ def test_ref_tables_source_filter(self): child2_iv = HTSeq.GenomicInterval('I', 39, 50, '-') exp_alias = {'Child2': ('Child2Name',)} - exp_feats = [set(), {(('Child2', ''), False, ((0, 0, IntervalPartialMatch(child2_iv)),))}, set()] + exp_feats = [set(), {(('Child2', ''), False, ((0, 0, IntervalPartialMatch(child2_iv), wc),))}, set()] exp_intervals = {'Child2': [child2_iv]} exp_classes = {'Child2': ('NA',)} exp_filtered = {"GrandParent", "ParentWithGrandparent", "Parent2", "Child1", "Sibling"} @@ -527,7 +528,7 @@ def test_ref_tables_type_filter(self): child1_iv = HTSeq.GenomicInterval('I', 29, 40, '-') exp_alias = {'Child1': ('SharedName',)} - exp_feats = [set(), {(('Child1', ''), False, ((0, 0, IntervalPartialMatch(child1_iv)),))}, set()] + exp_feats = [set(), {(('Child1', ''), False, ((0, 0, IntervalPartialMatch(child1_iv), wc),))}, set()] exp_intervals = {'Child1': [child1_iv]} exp_classes = {'Child1': ('NA',)} exp_filtered = {"GrandParent", "ParentWithGrandparent", "Parent2", "Child2", "Sibling"} @@ -574,8 +575,8 @@ def test_ref_tables_tagged_match_single(self): iv = IntervalPartialMatch(HTSeq.GenomicInterval('n/a', 3746, 3909)) expected_feats = [ set(), { - ((feat_id, 'tagged_match'), False, ((0, 1, iv),)), - ((feat_id, ''), False, ((1, 2, iv),)) + ((feat_id, 'tagged_match'), False, ((0, 1, iv, wc),)), + ((feat_id, ''), False, ((1, 2, iv, wc),)) }, set() ] @@ -606,12 +607,12 @@ def test_ref_tables_tagged_match_merging(self): Child2_iv = IntervalPartialMatch(HTSeq.GenomicInterval('n/a', 39, 50)) expected_feats = [ set(), { - (('Parent2', 'shared'), False, ((0, 1, 
Parent2_iv), (1, 2, Parent2_iv))), - (('Parent2', ''), False, ((2, 3, Parent2_iv),)), + (('Parent2', 'shared'), False, ((0, 1, Parent2_iv, wc), (1, 2, Parent2_iv, wc))), + (('Parent2', ''), False, ((2, 3, Parent2_iv, wc),)), }, set(), { - (('Parent2', 'shared'), False, ((0, 1, Child2_iv), (1, 2, Child2_iv))), - (('Parent2', ''), False, ((2, 3, Child2_iv),)) + (('Parent2', 'shared'), False, ((0, 1, Child2_iv, wc), (1, 2, Child2_iv, wc))), + (('Parent2', ''), False, ((2, 3, Child2_iv, wc),)) }, set() ] @@ -651,7 +652,7 @@ def get_steps(rs, chrom): def test_add_reference_seq_single(self): seq_id = "seq" seq_len = 10 - matches = {'': [(0, 0, "partial")]} + matches = {'': [(0, 0, "partial", wc)]} rs = self.ReferenceSeqs_add(seq_id, seq_len, matches) actual = self.get_steps(rs, seq_id) @@ -660,7 +661,7 @@ def test_add_reference_seq_single(self): # and the overlap selector should have the same interval. # For these selectors, same interval on both strands. iv = HTSeq.GenomicInterval(seq_id, 0, seq_len) - match_tuple = ((0, 0, IntervalPartialMatch(iv)),) + match_tuple = ((0, 0, IntervalPartialMatch(iv), wc),) expected = { (0, 10): { @@ -677,7 +678,7 @@ def test_add_reference_seq_single(self): def test_add_reference_seq_shared_classifier(self): seq_id = "seq" seq_len = 10 - matches = {'shared': [(0, 0, "partial"), (1, 1, "nested")]} + matches = {'shared': [(0, 0, "partial", wc), (1, 1, "nested", wc)]} rs = self.ReferenceSeqs_add(seq_id, seq_len, matches) actual = self.get_steps(rs, seq_id) @@ -686,7 +687,7 @@ def test_add_reference_seq_shared_classifier(self): # and the overlap selector should have the same interval. # For these selectors, same interval on both strands. 
iv = HTSeq.GenomicInterval(seq_id, 0, seq_len) - match_tuples = (0, 0, IntervalPartialMatch(iv)), (1, 1, IntervalNestedMatch(iv)) + match_tuples = (0, 0, IntervalPartialMatch(iv), wc), (1, 1, IntervalNestedMatch(iv), wc) expected = { (0, 10): { @@ -704,7 +705,7 @@ def test_add_reference_seq_shared_classifier(self): def test_add_reference_seq_shared_iv(self): seq_id = "seq" seq_len = 10 - matches = {'exact': [(0, 0, "exact, 2, -2")], 'nested': [(1, 1, "nested, 2, -2")]} + matches = {'exact': [(0, 0, "exact, 2, -2", wc)], 'nested': [(1, 1, "nested, 2, -2", wc)]} rs = self.ReferenceSeqs_add(seq_id, seq_len, matches) actual = self.get_steps(rs, seq_id) @@ -713,8 +714,8 @@ def test_add_reference_seq_shared_iv(self): # and the overlap selector should have the same interval. # For these selectors, same interval on both strands. iv = HTSeq.GenomicInterval(seq_id, 2, seq_len - 2) - match_exact = ((0, 0, IntervalExactMatch(iv)),) - match_nested = ((1, 1, IntervalNestedMatch(iv)),) + match_exact = ((0, 0, IntervalExactMatch(iv), wc),) + match_nested = ((1, 1, IntervalNestedMatch(iv), wc),) expected = { (0, 2): set(), @@ -737,8 +738,8 @@ def test_add_reference_seq_complex(self): seq_id = "seq" seq_len = 10 matches = { - "class1": [(0, 0, "nested, 1, -1"), (0, 0, "exact, 5, 0")], - "class2": [(0, 0, "5' anchored, 5, 0")] + "class1": [(0, 0, "nested, 1, -1", wc), (0, 0, "exact, 5, 0", wc)], + "class2": [(0, 0, "5' anchored, 5, 0", wc)] } rs = self.ReferenceSeqs_add(seq_id, seq_len, matches) @@ -746,19 +747,19 @@ def test_add_reference_seq_complex(self): # Since nested shift is symmetric, iv is same on both strands iv_n = HTSeq.GenomicInterval(seq_id, 1, seq_len-1) - match_nested = ((0, 0, IntervalNestedMatch(iv_n)),) + match_nested = ((0, 0, IntervalNestedMatch(iv_n), wc),) # Since exact and anchored shift is asymmetric and by the same # amount, iv differs per strand but is shared by both selectors. 
# If they both had the same classifier then these match tuples # would share the same record tuple on both strands iv_e5_s = HTSeq.GenomicInterval(seq_id, 5, seq_len, '+') - match_exact_sense = ((0, 0, IntervalExactMatch(iv_e5_s)),) - match_5anch_sense = ((0, 0, Interval5pMatch(iv_e5_s)),) + match_exact_sense = ((0, 0, IntervalExactMatch(iv_e5_s), wc),) + match_5anch_sense = ((0, 0, Interval5pMatch(iv_e5_s), wc),) iv_e5_a = HTSeq.GenomicInterval(seq_id, 0, seq_len-5, '-') - match_exact_antis = ((0, 0, IntervalExactMatch(iv_e5_a)),) - match_5anch_antis = ((0, 0, Interval5pMatch(iv_e5_a)),) + match_exact_antis = ((0, 0, IntervalExactMatch(iv_e5_a), wc),) + match_5anch_antis = ((0, 0, Interval5pMatch(iv_e5_a), wc),) expected = { (0, 1): { From fc7039ed8fe9e420fd5e7d91d245cd86c71d50d3 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Mon, 3 Apr 2023 16:20:09 -0700 Subject: [PATCH 15/21] Updating SAM_reader tests for the new alignment dictionary format (byte strings are now just strings) --- tests/unit_tests_hts_parsing.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/unit_tests_hts_parsing.py b/tests/unit_tests_hts_parsing.py index 70ddb75e..ab0e6c48 100644 --- a/tests/unit_tests_hts_parsing.py +++ b/tests/unit_tests_hts_parsing.py @@ -45,8 +45,8 @@ def test_sam_reader(self): self.assertEqual(sam_record['Start'], 15064569) self.assertEqual(sam_record['End'], 15064590) self.assertEqual(sam_record['Strand'], False) - self.assertEqual(sam_record['Name'], b"0_count=5") - self.assertEqual(sam_record['Seq'], b"CAAGACAGAGCTTCACCGTTC") + self.assertEqual(sam_record['Name'], "0_count=5") + self.assertEqual(sam_record['Seq'], "CAAGACAGAGCTTCACCGTTC") self.assertEqual(sam_record['Length'], 21) self.assertEqual(sam_record['nt5end'], 'G') @@ -69,13 +69,13 @@ def test_sam_parser_comparison(self): self.assertEqual(our['Chrom'], their.iv.chrom) self.assertEqual(our['Start'], their.iv.start) self.assertEqual(our['End'], 
their.iv.end) - self.assertEqual(our['Name'].decode(), their.read.name) + self.assertEqual(our['Name'], their.read.name) self.assertEqual(our['nt5end'], chr(their.read.seq[0])) # See note above self.assertEqual(our['Strand'], helpers.strand_to_bool(their.iv.strand)) if our['Strand'] is False: # See note above - self.assertEqual(our['Seq'][::-1].translate(helpers.complement), their.read.seq) + self.assertEqual(our['Seq'][::-1].translate(helpers.complement), their.read.seq.decode()) else: - self.assertEqual(our['Seq'], their.read.seq) + self.assertEqual(our['Seq'], their.read.seq.decode()) """Does SAM_reader._get_decollapsed_filename() create an appropriate filename?""" From 414991b50e7e7536ad7dd7fd4ca45eae7961a431 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Mon, 3 Apr 2023 16:20:47 -0700 Subject: [PATCH 16/21] Updating SAM_reader tests for the new class design --- tests/unit_tests_hts_parsing.py | 91 +++++++++++++++++++-------------- 1 file changed, 52 insertions(+), 39 deletions(-) diff --git a/tests/unit_tests_hts_parsing.py b/tests/unit_tests_hts_parsing.py index ab0e6c48..8312e028 100644 --- a/tests/unit_tests_hts_parsing.py +++ b/tests/unit_tests_hts_parsing.py @@ -87,38 +87,60 @@ def test_SAM_reader_get_decollapsed_filename(self): self.assertEqual(sam_out, "sam_file_decollapsed.sam") - """Does SAM_reader._read_to_first_aln() correctly identify header lines and write them to the decollapsed file?""" + """Does SAM_reader._new_bundle report the correct read count for different collapser types?""" - def test_SAM_reader_read_thru_header(self): + def test_SAM_reader_new_bundle(self): + qnames = ["0_count=3", "seq0_x5", "non-collapsed"] + counts = [3, 5, 1] + + reader = SAM_reader() + reader._header_dict = {'HD': {'SO': 'queryname'}} + + for qname, expected in zip(qnames, counts): + reader._determine_collapser_type(qname) + _, read_count = reader._new_bundle({'Name': qname}) + self.assertEqual(read_count, expected) + + """Does 
SAM_reader._gather_metadata() correctly identify metadata and write the decollapsed file header?""" + + def test_SAM_reader_gather_metadata(self): reader = SAM_reader(decollapse=True) reader._decollapsed_filename = "mock_outfile_name.sam" - with open(self.short_sam_file, 'rb') as sam_in: - with patch('builtins.open', mock_open()) as sam_out: - line = reader._read_to_first_aln(sam_in) + sam_in = pysam.AlignmentFile(self.short_sam_file) + with patch('builtins.open', mock_open()) as sam_out: + reader._gather_metadata(sam_in) expected_writelines = [ call('mock_outfile_name.sam', 'w'), call().__enter__(), - call().writelines(["@SQ SN:I LN:21\n"]), + call().write("@SQ\tSN:I\tLN:21\n"), call().__exit__(None, None, None) ] sam_out.assert_has_calls(expected_writelines) - self.assertTrue(len(reader._header_lines) == 1) + self.assertEqual(reader.collapser_type, 'tiny-collapse') + self.assertDictEqual(reader._header_dict, {'SQ': [{'SN': "I", 'LN': 21}]}) + self.assertEqual(reader.references, ('I',)) + self.assertTrue(reader.has_nm) """Does SAM_reader._write_decollapsed_sam() write the correct number of duplicates to the decollapsed file?""" def test_SAM_reader_write_decollapsed_sam(self): + header = pysam.AlignmentHeader() + alignment = pysam.AlignedSegment(header) + alignment.query_name = "0_count=5" + reader = SAM_reader(decollapse=True) reader.collapser_type = "tiny-collapse" - reader._decollapsed_reads = [(b"0_count=5", b"mock line from SAM file")] + reader._collapser_token = "=" + reader._decollapsed_reads = [alignment] reader._decollapsed_filename = "mock_outfile_name.sam" expected_writelines = [ call('mock_outfile_name.sam', 'ab'), call().__enter__(), - call().writelines([b"mock line from SAM file"] * 5), + call().writelines([alignment.to_string()] * 5), call().__exit__(None, None, None) ] @@ -131,37 +153,28 @@ def test_SAM_reader_write_decollapsed_sam(self): """Does SAM_reader._parse_alignments() save lines and write them to the decollapsed file when appropriate?""" 
def test_SAM_reader_parse_alignments_decollapse(self): - with patch.object(SAM_reader, "_write_decollapsed_sam") as write_fn, \ - patch('tiny.rna.counter.hts_parsing.open', new_callable=mock_open) as mopen: + with patch.object(SAM_reader, "_write_decollapsed_sam") as write_fn: + # Set up SAM_reader class reader = SAM_reader(decollapse=True) - reader._decollapsed_reads = [0] * 99999 # At 100,001, buffer will be written - reader.file = self.short_sam_file # File with single alignment - - with open(self.short_sam_file, 'rb') as sam_in: - self.exhaust_iterator(reader._parse_alignments(sam_in)) - write_fn.assert_not_called() - - # Rewind and add one more alignment to push it over threshold - sam_in.seek(0) - self.exhaust_iterator(reader._parse_alignments(sam_in)) - write_fn.assert_called_once() - - """Does SAM_reader report a single read count for non-collapsed SAM records?""" - - def test_SAM_reader_single_readcount_non_collapsed_SAM(self): - # Read non-collapsed.sam but duplicate its single record twice - with open(f"{resources}/non-collapsed.sam", 'rb') as f: - sam_lines = f.readlines() - sam_lines.extend([sam_lines[1]] * 2) - mock_file = mock_open(read_data=b''.join(sam_lines)) - - with patch('tiny.rna.counter.hts_parsing.open', new=mock_file): - reader = SAM_reader() - bundle, read_count = next(reader.bundle_multi_alignments('mock_file')) - - self.assertEqual(bundle[0]['Name'], b'NON_COLLAPSED_QNAME') - self.assertEqual(len(bundle), 3) - self.assertEqual(read_count, 1) + reader.collapser_type = "tiny-collapse" + reader._decollapsed_reads = buffer = [0] * 99999 # At 100,001, expect buffer to be written + reader.file = self.short_sam_file # File with single alignment + + # Set up AlignmentIter class + sam_in = pysam.AlignmentFile(reader.file) + callback = reader._write_decollapsed_sam + first_aln_offset = sam_in.tell() + has_nm = True + aln_iter = AlignmentIter(sam_in, has_nm, callback, buffer) + + # Add 100,000th alignment to the buffer + 
self.exhaust_iterator(aln_iter) + write_fn.assert_not_called() + + # Rewind and add one more alignment to push it over threshold + sam_in.seek(first_aln_offset) + self.exhaust_iterator(aln_iter) + write_fn.assert_called_once() """Are decollapsed outputs skipped when non-collapsed SAM files are supplied?""" From a941c630b4622a9fbacce2b8337d1116dcd92fbd Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Mon, 3 Apr 2023 16:21:44 -0700 Subject: [PATCH 17/21] Adding sq_headers.sam testfile for unit_tests_validation.py --- tests/testdata/counter/validation/sam/sq_headers.sam | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 tests/testdata/counter/validation/sam/sq_headers.sam diff --git a/tests/testdata/counter/validation/sam/sq_headers.sam b/tests/testdata/counter/validation/sam/sq_headers.sam new file mode 100644 index 00000000..390d94cf --- /dev/null +++ b/tests/testdata/counter/validation/sam/sq_headers.sam @@ -0,0 +1,4 @@ +@HD VN:1.0 SO:unsorted +@SQ SN:I LN:123 +@SQ SN:II LN:456 +@SQ SN:III \ No newline at end of file From c1459545d2db358cb8b6bc60382030f5c6d0d44b Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Mon, 3 Apr 2023 17:45:53 -0700 Subject: [PATCH 18/21] Bugfixes for decollapsed outputs and sequence-based counting mode --- tiny/rna/counter/hts_parsing.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py index 1717f42a..469e184f 100644 --- a/tiny/rna/counter/hts_parsing.py +++ b/tiny/rna/counter/hts_parsing.py @@ -151,8 +151,8 @@ def _write_decollapsed_sam(self) -> None: aln_out.extend([aln.to_string()] * seq_count) prev_name = name - with open(self._get_decollapsed_filename(), 'ab') as sam_o: - sam_o.writelines(aln_out) + with open(self._get_decollapsed_filename(), 'a') as sam_o: + sam_o.write('\n'.join(aln_out)) self._decollapsed_reads.clear() @@ -773,7 +773,7 @@ def get_matches(self) -> Dict[str, List[Tuple[int, int, 
str]]]: matches_by_classifier = defaultdict(list) for idx, rule in enumerate(self.selector.rules_table): - match_tuple = (idx, rule['Hierarchy'], rule['Overlap']) + match_tuple = (idx, rule['Hierarchy'], rule['Overlap'], rule['Mismatch']) matches_by_classifier[rule['Class']].append(match_tuple) return matches_by_classifier From 2a1c80e7fa206da4a6783529a94cf8a1c670089e Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Tue, 4 Apr 2023 17:49:51 -0700 Subject: [PATCH 19/21] Final polish updates to AlignmentIter and a related unit test --- tests/unit_tests_hts_parsing.py | 4 ++-- tiny/rna/counter/parsing/alignments.pyx | 9 +++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/tests/unit_tests_hts_parsing.py b/tests/unit_tests_hts_parsing.py index 8312e028..4af12415 100644 --- a/tests/unit_tests_hts_parsing.py +++ b/tests/unit_tests_hts_parsing.py @@ -138,9 +138,9 @@ def test_SAM_reader_write_decollapsed_sam(self): reader._decollapsed_filename = "mock_outfile_name.sam" expected_writelines = [ - call('mock_outfile_name.sam', 'ab'), + call('mock_outfile_name.sam', 'a'), call().__enter__(), - call().writelines([alignment.to_string()] * 5), + call().write('\n'.join([alignment.to_string()] * 5)), call().__exit__(None, None, None) ] diff --git a/tiny/rna/counter/parsing/alignments.pyx b/tiny/rna/counter/parsing/alignments.pyx index 0b7fbb48..0cb7ce4b 100644 --- a/tiny/rna/counter/parsing/alignments.pyx +++ b/tiny/rna/counter/parsing/alignments.pyx @@ -35,13 +35,13 @@ cdef class AlignmentIter: cdef dict _next_alignment(self): cdef: - AlignedSegment aln = next(self.reader) # Equivalent API calls: + AlignedSegment aln = next(self.reader) # Equivalent Python API calls: int flag = aln._delegate.core.flag # aln.flag - str seq = aln.query_sequence # aln.query_sequence + str seq = aln.query_sequence int start = aln._delegate.core.pos # aln.reference_start int length = aln._delegate.core.l_qseq # aln.query_length bint strand = (flag & BAM_FREVERSE) 
== 0 # aln.is_forward - str nt5 = self._get_nt5(seq, strand) + str nt5 = AlignmentIter._get_nt5(seq, strand) str name = pysam_bam_get_qname(aln._delegate).decode() # aln.query_name if (flag & BAM_FUNMAP) != 0: # aln.is_unmapped @@ -78,7 +78,8 @@ cdef class AlignmentIter: # Calculate mismatches using the CIGAR string's I, D, and X operations return sum(op_len for op, op_len in aln.cigartuples if op in cigar_mismatch) - cdef str _get_nt5(self, str seq, bint strand): + @staticmethod + cdef str _get_nt5(str seq, bint strand): cdef str nt if strand: From ee1bfcbc645a10944a413130c5854ffb2727a9f4 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Tue, 4 Apr 2023 17:50:53 -0700 Subject: [PATCH 20/21] Documentation updates for the Mismatches selector --- doc/Configuration.md | 1 + doc/tiny-count.md | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/doc/Configuration.md b/doc/Configuration.md index fc5ce86f..44026262 100644 --- a/doc/Configuration.md +++ b/doc/Configuration.md @@ -157,6 +157,7 @@ Selectors in the Features Sheet can be specified as a single value, a list of co | `Type Filter` | ✓ | ✓ | ✓ | | | `Hierarchy` | | ✓ | | | | `Overlap` | ✓ | ✓ | | | +| `Mismatches` | ✓ | ✓ | ✓ | ✓ | | `Strand` | ✓ | ✓ | | | | `5' nt` | ✓ | ✓ | ✓ | | | `Length` | ✓ | ✓ | ✓ | ✓ | diff --git a/doc/tiny-count.md b/doc/tiny-count.md index 96438d47..a1fe0550 100644 --- a/doc/tiny-count.md +++ b/doc/tiny-count.md @@ -86,6 +86,12 @@ selector, M, N #### Unstranded Features If these features match rules with `5' anchored` and `3' anchored` overlap selectors, they will be downgraded to `anchored` selectors. Alignments overlapping these features are evaluated for shared start and/or end coordinates, but 5' and 3' ends are not distinguished. +### Mismatches +The Mismatch column allows you to place constraints the edit distance, or the number of mismatches and indels, from the alignment to the reference. 
The Mismatch definition is explicit, i.e., a value of 3 means exactly 3, not 3 or less. Definitions support ranges (e.g., 0-3), lists (e.g., 1, 3), wildcards, and single values. + +#### Edit Distance Determination +An alignment's edit distance is determined from its NM tag. If the first alignment in a SAM file doesn't have an NM tag, then the edit distance is calculated from the CIGAR string for all subsequent alignments in the file. If the first alignment has an NM tag then any subsequent alignments missing the tag will have a default edit distance of 0. + ### Hierarchy Each rule must be assigned a hierarchy value. This value is used to sort Stage 2 matches so that matches with smaller hierarchy values take precedence in Stage 3. - Each feature can have multiple hierarchy values if it matched more than one rule during Stage 1 selection From 4d87ff180e948aec61790fb3bc353bf4e8f7e547 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Tue, 4 Apr 2023 17:54:42 -0700 Subject: [PATCH 21/21] Typo correction --- doc/tiny-count.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/tiny-count.md b/doc/tiny-count.md index a1fe0550..93af9b64 100644 --- a/doc/tiny-count.md +++ b/doc/tiny-count.md @@ -87,7 +87,7 @@ selector, M, N If these features match rules with `5' anchored` and `3' anchored` overlap selectors, they will be downgraded to `anchored` selectors. Alignments overlapping these features are evaluated for shared start and/or end coordinates, but 5' and 3' ends are not distinguished. ### Mismatches -The Mismatch column allows you to place constraints the edit distance, or the number of mismatches and indels, from the alignment to the reference. The Mismatch definition is explicit, i.e., a value of 3 means exactly 3, not 3 or less. Definitions support ranges (e.g., 0-3), lists (e.g., 1, 3), wildcards, and single values. 
+The Mismatches column allows you to place constraints on the edit distance, or the number of mismatches and indels, from the alignment to the reference. The Mismatch definition is explicit, i.e., a value of 3 means exactly 3, not 3 or less. Definitions support ranges (e.g., 0-3), lists (e.g., 1, 3), wildcards, and single values.
 
 #### Edit Distance Determination
 An alignment's edit distance is determined from its NM tag. If the first alignment in a SAM file doesn't have an NM tag, then the edit distance is calculated from the CIGAR string for all subsequent alignments in the file. If the first alignment has an NM tag then any subsequent alignments missing the tag will have a default edit distance of 0.