diff --git a/doc/tiny-count.md b/doc/tiny-count.md index 219af31a..96438d47 100644 --- a/doc/tiny-count.md +++ b/doc/tiny-count.md @@ -27,7 +27,7 @@ Selection occurs in three stages, with the output of each stage as input to the ![Feature Selection Diagram](../images/tiny-count_selection.png) ## Sequence-Based Counting Mode -If GFF files aren't specified in your Paths File, Stage 1 selection is skipped and reads are counted by sequence rather than by feature. Reference sequence names and lengths are determined from the `@SQ` headers of input SAM files. In Stages 2 and 3, these reference sequences behave like features that had matched every rule in Stage 1. +If GFF files aren't specified in your Paths File, Stage 1 selection is skipped and reads are counted by sequence rather than by feature. Reference sequence names and lengths are determined from the `@SQ` headers of input SAM files. In Stages 2 and 3, these reference sequences behave like features that had matched every rule in Stage 1. Selection takes place for both the sense and antisense copy of these sequences, and read counts are subset per-sequence if any `Classify as...` values are provided. ## Stage 1: Feature Attribute Parameters | _Features Sheet Selectors:_ | Select for... | with value... | Classify as... | Source Filter | Type Filter | diff --git a/tests/unit_tests_hts_parsing.py b/tests/unit_tests_hts_parsing.py index 3f8e9ea6..587fdee0 100644 --- a/tests/unit_tests_hts_parsing.py +++ b/tests/unit_tests_hts_parsing.py @@ -9,47 +9,27 @@ from unittest.mock import patch, mock_open, call from tiny.rna.counter.features import FeatureSelector -from tiny.rna.counter.matching import IntervalPartialMatch -from tiny.rna.counter.statistics import LibraryStats +from tiny.rna.counter.matching import * from tiny.rna.counter.hts_parsing import * import unit_test_helpers as helpers resources = "./testdata/counter" +# To run all test suites +if __name__ == '__main__': + unittest.main() -class MyTestCase(unittest.TestCase): +class SamReaderTests(unittest.TestCase): @classmethod def setUpClass(self): - self.gff_file = f"{resources}/identity_choice_test.gff3" - self.short_gff_file = f"{resources}/single.gff3" - self.short_gff = helpers.read(self.short_gff_file) - self.sam_file = f"{resources}/identity_choice_test.sam" self.short_sam_file = f"{resources}/single.sam" self.short_sam = helpers.read(self.short_sam_file) - self.maxDiff = None - - # === HELPERS === - - def get_gff_attr_string(self, gff_line): - return gff_line.split('\t')[-1] - - def parse_gff_attr(self, gff_file_content): - attr_str = self.get_gff_attr_string(gff_file_content) - return parse_GFF_attribute_string(attr_str) - - def selector_with_template(self, updates_list): - """Returns a MockFeatureSelector with the specified updates to the default rule template""" - - rules = [deepcopy(helpers.rules_template[0]) for _ in range(len(updates_list))] - for changes, template in zip(updates_list, rules): - template.update(changes) - return FeatureSelector(rules) - - def exhaust_iterator(self, it): + @staticmethod + def exhaust_iterator(it): collections.deque(it, maxlen=0) # === TESTS === @@ -70,7 +50,7 @@ def test_sam_reader(self): self.assertEqual(sam_record['nt5end'], 'G') """Does our custom SAM parser produce the same pertinent info as HTSeq's BAM_reader? - + A note on SAM files: reads are always stored 5' to 3', so antisense reads are actually recorded in reverse complement. HTSeq automatically performs this conversion, but we are only really concerned about a sequence's 5' end NT, so our alignment dicts performs @@ -91,11 +71,145 @@ def test_sam_parser_comparison(self): self.assertEqual(our['Name'].decode(), their.read.name) self.assertEqual(our['nt5end'], chr(their.read.seq[0])) # See note above self.assertEqual(our['Strand'], helpers.strand_to_bool(their.iv.strand)) - if our['Strand'] is False: # See note above + if our['Strand'] is False: # See note above self.assertEqual(our['Seq'][::-1].translate(helpers.complement), their.read.seq) else: self.assertEqual(our['Seq'], their.read.seq) + """Does SAM_reader._get_decollapsed_filename() create an appropriate filename?""" + + def test_SAM_reader_get_decollapsed_filename(self): + reader = SAM_reader() + reader.file = "~/path/to/input/sam_file.sam" + + sam_out = reader._get_decollapsed_filename() + + self.assertEqual(sam_out, "sam_file_decollapsed.sam") + + """Does SAM_reader._read_to_first_aln() correctly identify header lines and write them to the decollapsed file?""" + + def test_SAM_reader_read_thru_header(self): + reader = SAM_reader(decollapse=True) + reader._decollapsed_filename = "mock_outfile_name.sam" + + with open(self.short_sam_file, 'rb') as sam_in: + with patch('builtins.open', mock_open()) as sam_out: + line = reader._read_to_first_aln(sam_in) + + expected_writelines = [ + call('mock_outfile_name.sam', 'w'), + call().__enter__(), + call().writelines(["@SQ SN:I LN:21\n"]), + call().__exit__(None, None, None) + ] + + sam_out.assert_has_calls(expected_writelines) + self.assertTrue(len(reader._header_lines) == 1) + + """Does SAM_reader._write_decollapsed_sam() write the correct number of duplicates to the decollapsed file?""" + + def test_SAM_reader_write_decollapsed_sam(self): + reader = SAM_reader(decollapse=True) + reader.collapser_type = "tiny-collapse" + reader._decollapsed_reads = [(b"0_count=5", b"mock line from SAM file")] + reader._decollapsed_filename = "mock_outfile_name.sam" + + expected_writelines = [ + call('mock_outfile_name.sam', 'ab'), + call().__enter__(), + call().writelines([b"mock line from SAM file"] * 5), + call().__exit__(None, None, None) + ] + + with patch('builtins.open', mock_open()) as outfile: + reader._write_decollapsed_sam() + + outfile.assert_has_calls(expected_writelines) + self.assertTrue(len(reader._decollapsed_reads) == 0) + + """Does SAM_reader._parse_alignments() save lines and write them to the decollapsed file when appropriate?""" + + def test_SAM_reader_parse_alignments_decollapse(self): + with patch.object(SAM_reader, "_write_decollapsed_sam") as write_fn, \ + patch('tiny.rna.counter.hts_parsing.open', new_callable=mock_open) as mopen: + reader = SAM_reader(decollapse=True) + reader._decollapsed_reads = [0] * 99999 # At 100,001, buffer will be written + reader.file = self.short_sam_file # File with single alignment + + with open(self.short_sam_file, 'rb') as sam_in: + self.exhaust_iterator(reader._parse_alignments(sam_in)) + write_fn.assert_not_called() + + # Rewind and add one more alignment to push it over threshold + sam_in.seek(0) + self.exhaust_iterator(reader._parse_alignments(sam_in)) + write_fn.assert_called_once() + + """Does SAM_reader report a single read count for non-collapsed SAM records?""" + + def test_SAM_reader_single_readcount_non_collapsed_SAM(self): + # Read non-collapsed.sam but duplicate its single record twice + with open(f"{resources}/non-collapsed.sam", 'rb') as f: + sam_lines = f.readlines() + sam_lines.extend([sam_lines[1]] * 2) + mock_file = mock_open(read_data=b''.join(sam_lines)) + + with patch('tiny.rna.counter.hts_parsing.open', new=mock_file): + reader = SAM_reader() + bundle, read_count = next(reader.bundle_multi_alignments('mock_file')) + + self.assertEqual(bundle[0]['Name'], b'NON_COLLAPSED_QNAME') + self.assertEqual(len(bundle), 3) + self.assertEqual(read_count, 1) + + """Are decollapsed outputs skipped when non-collapsed SAM files are supplied?""" + + def test_SAM_reader_no_decollapse_non_collapsed_SAM_files(self): + stdout_capture = io.StringIO() + with patch.object(SAM_reader, "_write_decollapsed_sam") as write_sam, \ + patch.object(SAM_reader, "_write_header_for_decollapsed_sam") as write_header: + with contextlib.redirect_stderr(stdout_capture): + reader = SAM_reader(decollapse=True) + records = reader.bundle_multi_alignments(f"{resources}/non-collapsed.sam") + self.exhaust_iterator(records) + + write_sam.assert_not_called() + write_header.assert_not_called() + self.assertEqual(reader.collapser_type, None) + self.assertEqual(stdout_capture.getvalue(), + "Alignments do not appear to be derived from a supported collapser input. " + "Decollapsed SAM files will therefore not be produced.\n") + + +class ReferenceFeaturesTests(unittest.TestCase): + + @classmethod + def setUpClass(self): + self.gff_file = f"{resources}/identity_choice_test.gff3" + self.short_gff_file = f"{resources}/single.gff3" + self.short_gff = helpers.read(self.short_gff_file) + + self.maxDiff = None + + # === HELPERS === + + def get_gff_attr_string(self, gff_line): + return gff_line.split('\t')[-1] + + def parse_gff_attr(self, gff_file_content): + attr_str = self.get_gff_attr_string(gff_file_content) + return parse_GFF_attribute_string(attr_str) + + def selector_with_template(self, updates_list): + """Returns a MockFeatureSelector with the specified updates to the default rule template""" + + rules = [deepcopy(helpers.rules_template[0]) for _ in range(len(updates_list))] + for changes, template in zip(updates_list, rules): + template.update(changes) + return FeatureSelector(rules) + + # === TESTS === + """Were only the correct attribute keys present in the parser result?""" def test_gff_attr_keys(self): @@ -509,111 +623,171 @@ def test_ref_tables_tagged_match_merging(self): self.assertDictEqual(aliases, expected_aliases) self.assertDictEqual(tags, expected_tags) - """Does SAM_reader._get_decollapsed_filename() create an appropriate filename?""" - def test_SAM_reader_get_decollapsed_filename(self): - reader = SAM_reader() - reader.file = "~/path/to/input/sam_file.sam" +class ReferenceSequencesTests(unittest.TestCase): - sam_out = reader._get_decollapsed_filename() + @classmethod + def setUpClass(cls) -> None: + cls.maxDiff = None - self.assertEqual(sam_out, "sam_file_decollapsed.sam") + # === HELPERS === - """Does SAM_reader._read_to_first_aln() correctly identify header lines and write them to the decollapsed file?""" + @staticmethod + def ReferenceSeqs_add(seq_id, seq_len, matches): + rs = ReferenceSeqs({seq_id, seq_len}) + rs.selector = FeatureSelector([]) + rs.add_reference_seq(seq_id, seq_len, matches) + return rs - def test_SAM_reader_read_thru_header(self): - reader = SAM_reader(decollapse=True) - reader._decollapsed_filename = "mock_outfile_name.sam" + @staticmethod + def get_steps(rs, chrom): + return {(step[0], step[1]): step[2] + for step in rs.feats[chrom]['.'].array.get_steps()} - with open(self.short_sam_file, 'rb') as sam_in: - with patch('builtins.open', mock_open()) as sam_out: - line = reader._read_to_first_aln(sam_in) + # === TESTS === - expected_writelines = [ - call('mock_outfile_name.sam', 'w'), - call().__enter__(), - call().writelines(["@SQ SN:I LN:21\n"]), - call().__exit__(None, None, None) - ] + """Does add_reference_seq() produce the expected GenomicArray for an untagged rule matching a single feature?""" - sam_out.assert_has_calls(expected_writelines) - self.assertTrue(len(reader._header_lines) == 1) + def test_add_reference_seq_single(self): + seq_id = "seq" + seq_len = 10 + matches = {'': [(0, 0, "partial")]} - """Does SAM_reader._write_decollapsed_sam() write the correct number of duplicates to the decollapsed file?""" + rs = self.ReferenceSeqs_add(seq_id, seq_len, matches) + actual = self.get_steps(rs, seq_id) - def test_SAM_reader_write_decollapsed_sam(self): - reader = SAM_reader(decollapse=True) - reader.collapser_type = "tiny-collapse" - reader._decollapsed_reads = [(b"0_count=5", b"mock line from SAM file")] - reader._decollapsed_filename = "mock_outfile_name.sam" + # The record_tuples should have intervals on both strands + # and the overlap selector should have the same interval. + # For these selectors, same interval on both strands. + iv = HTSeq.GenomicInterval(seq_id, 0, seq_len) + match_tuple = ((0, 0, IntervalPartialMatch(iv)),) - expected_writelines = [ - call('mock_outfile_name.sam', 'ab'), - call().__enter__(), - call().writelines([b"mock line from SAM file"] * 5), - call().__exit__(None, None, None) - ] - - with patch('builtins.open', mock_open()) as outfile: - reader._write_decollapsed_sam() - - outfile.assert_has_calls(expected_writelines) - self.assertTrue(len(reader._decollapsed_reads) == 0) - - """Does SAM_reader._parse_alignments() save lines and write them to the decollapsed file when appropriate?""" - - def test_SAM_reader_parse_alignments_decollapse(self): - with patch.object(SAM_reader, "_write_decollapsed_sam") as write_fn, \ - patch('tiny.rna.counter.hts_parsing.open', new_callable=mock_open) as mopen: + expected = { + (0, 10): { + ((seq_id, ''), True, match_tuple), # sense + ((seq_id, ''), False, match_tuple) # antisense + }, + (10, sys.maxsize): set() + } - reader = SAM_reader(decollapse=True) - reader._decollapsed_reads = [0] * 99999 # At 100,001, buffer will be written - reader.file = self.short_sam_file # File with single alignment + self.assertDictEqual(actual, expected) - with open(self.short_sam_file, 'rb') as sam_in: - self.exhaust_iterator(reader._parse_alignments(sam_in)) - write_fn.assert_not_called() + """Does add_reference_seq() produce the expected GenomicArray when rules share a classifier?""" - # Rewind and add one more alignment to push it over threshold - sam_in.seek(0) - self.exhaust_iterator(reader._parse_alignments(sam_in)) - write_fn.assert_called_once() + def test_add_reference_seq_shared_classifier(self): + seq_id = "seq" + seq_len = 10 + matches = {'shared': [(0, 0, "partial"), (1, 1, "nested")]} - """Does SAM_reader report a single read count for non-collapsed SAM records?""" + rs = self.ReferenceSeqs_add(seq_id, seq_len, matches) + actual = self.get_steps(rs, seq_id) - def test_SAM_reader_single_readcount_non_collapsed_SAM(self): - # Read non-collapsed.sam but duplicate its single record twice - with open(f"{resources}/non-collapsed.sam", 'rb') as f: - sam_lines = f.readlines() - sam_lines.extend([sam_lines[1]] * 2) - mock_file = mock_open(read_data=b''.join(sam_lines)) + # The record_tuples will have intervals on both strands + # and the overlap selector should have the same interval. + # For these selectors, same interval on both strands. + iv = HTSeq.GenomicInterval(seq_id, 0, seq_len) + match_tuples = (0, 0, IntervalPartialMatch(iv)), (1, 1, IntervalNestedMatch(iv)) - with patch('tiny.rna.counter.hts_parsing.open', new=mock_file): - reader = SAM_reader() - bundle, read_count = next(reader.bundle_multi_alignments('mock_file')) + expected = { + (0, 10): { + ((seq_id, "shared"), True, match_tuples), # sense + ((seq_id, "shared"), False, match_tuples) # antisense + }, + (10, sys.maxsize): set() + } - self.assertEqual(bundle[0]['Name'], b'NON_COLLAPSED_QNAME') - self.assertEqual(len(bundle), 3) - self.assertEqual(read_count, 1) + self.assertDictEqual(actual, expected) + + """Does add_reference_seq() produce the expected GenomicArray when shift parameters + from different rules produce the same interval?""" + + def test_add_reference_seq_shared_iv(self): + seq_id = "seq" + seq_len = 10 + matches = {'exact': [(0, 0, "exact, 2, -2")], 'nested': [(1, 1, "nested, 2, -2")]} + + rs = self.ReferenceSeqs_add(seq_id, seq_len, matches) + actual = self.get_steps(rs, seq_id) + + # The record_tuples will have intervals on both strands + # and the overlap selector should have the same interval. + # For these selectors, same interval on both strands. + iv = HTSeq.GenomicInterval(seq_id, 2, seq_len - 2) + match_exact = ((0, 0, IntervalExactMatch(iv)),) + match_nested = ((1, 1, IntervalNestedMatch(iv)),) + + expected = { + (0, 2): set(), + (2, 8): { + ((seq_id, "exact"), True, match_exact), # sense + ((seq_id, "exact"), False, match_exact), # antisense + ((seq_id, "nested"), True, match_nested), # sense + ((seq_id, "nested"), False, match_nested) # antisense + }, + (8, sys.maxsize): set() + } - """Are decollapsed outputs skipped when non-collapsed SAM files are supplied?""" + self.assertDictEqual(actual, expected) + + """Does add_reference_seq() produce the expected GenomicArray when there are + multiple tagged rules with different shift intervals, including asymmetric shifts + and an overlap selector that produces different intervals on each strand?""" + + def test_add_reference_seq_complex(self): + seq_id = "seq" + seq_len = 10 + matches = { + "class1": [(0, 0, "nested, 1, -1"), (0, 0, "exact, 5, 0")], + "class2": [(0, 0, "5' anchored, 5, 0")] + } - def test_SAM_reader_no_decollapse_non_collapsed_SAM_files(self): - stdout_capture = io.StringIO() - with patch.object(SAM_reader, "_write_decollapsed_sam") as write_sam, \ - patch.object(SAM_reader, "_write_header_for_decollapsed_sam") as write_header: + rs = self.ReferenceSeqs_add(seq_id, seq_len, matches) + actual = self.get_steps(rs, seq_id) + + # Since nested shift is symmetric, iv is same on both strands + iv_n = HTSeq.GenomicInterval(seq_id, 1, seq_len-1) + match_nested = ((0, 0, IntervalNestedMatch(iv_n)),) + + # Since exact and anchored shift is asymmetric and by the same + # amount, iv differs per strand but is shared by both selectors. + # If they both had the same classifier then these match tuples + # would share the same record tuple on both strands + iv_e5_s = HTSeq.GenomicInterval(seq_id, 5, seq_len, '+') + match_exact_sense = ((0, 0, IntervalExactMatch(iv_e5_s)),) + match_5anch_sense = ((0, 0, Interval5pMatch(iv_e5_s)),) + + iv_e5_a = HTSeq.GenomicInterval(seq_id, 0, seq_len-5, '-') + match_exact_antis = ((0, 0, IntervalExactMatch(iv_e5_a)),) + match_5anch_antis = ((0, 0, Interval5pMatch(iv_e5_a)),) + + expected = { + (0, 1): { + (('seq', 'class2'), False, match_5anch_antis), + (('seq', 'class1'), False, match_exact_antis), + }, + (1, 5): { + (('seq', 'class1'), True, match_nested), + (('seq', 'class2'), False, match_5anch_antis), + (('seq', 'class1'), False, match_exact_antis), + (('seq', 'class1'), False, match_nested) + }, + (5, 9): { + (('seq', 'class1'), True, match_exact_sense), + (('seq', 'class1'), True, match_nested), + (('seq', 'class2'), True, match_5anch_sense), + (('seq', 'class1'), False, match_nested) + }, + (9, 10): { + (('seq', 'class1'), True, match_exact_sense), + (('seq', 'class2'), True, match_5anch_sense) + }, + (10, sys.maxsize): set() + } + + self.assertDictEqual(actual, expected) - with contextlib.redirect_stderr(stdout_capture): - reader = SAM_reader(decollapse=True) - records = reader.bundle_multi_alignments(f"{resources}/non-collapsed.sam") - self.exhaust_iterator(records) - write_sam.assert_not_called() - write_header.assert_not_called() - self.assertEqual(reader.collapser_type, None) - self.assertEqual(stdout_capture.getvalue(), - "Alignments do not appear to be derived from a supported collapser input. " - "Decollapsed SAM files will therefore not be produced.\n") +class CaseInsensitiveAttrsTests(unittest.TestCase): """Does CaseInsensitiveAttrs correctly store, check membership, and retrieve?""" @@ -761,6 +935,3 @@ def test_gff_megazord(self): # The test is passed if this command # completes without throwing errors. rt.get(fs) - -if __name__ == '__main__': - unittest.main() diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py index eefe28d8..cf563aa6 100644 --- a/tiny/rna/counter/hts_parsing.py +++ b/tiny/rna/counter/hts_parsing.py @@ -817,16 +817,16 @@ class ReferenceSeqs(ReferenceBase): def __init__(self, reference_seqs, **prefs): super().__init__(**prefs) + self.tags = defaultdict(set) self.seqs = reference_seqs self.alias = {} - self.tags = {} def get(self, selector): self.selector = selector - match_tuples = self.get_matches() + matches = self.get_matches() for seq_id, seq_len in self.seqs.items(): - self.add_reference_seq(seq_id, seq_len, match_tuples) + self.add_reference_seq(seq_id, seq_len, matches) # GenomicArray is built. Clear cache... self.selector.overlap_cache.clear() @@ -838,30 +838,32 @@ def get(self, selector): aliases = {seq_id: () for seq_id in self.tags} return self.feats, aliases, self.tags - def get_matches(self): + def get_matches(self) -> Dict[str, List[Tuple[int, int, str]]]: """Stage 1 selection is skipped in sequence-based counting. - Simply build match_tuples for all rules. These will be used + Want the reference sequences to enter Stage 2 as though they + were features that matched every rule in Stage 1. Simply return + match_tuples and their classifier for all rules. These are used uniformly in each reference sequence's feature_record_tuple""" - return [(idx, rule['Hierarchy'], rule['Overlap']) - for idx, rule in sorted( - enumerate(self.selector.rules_table), - key=lambda x: x[1]['Hierarchy'] - )] + matches_by_classifier = defaultdict(list) - def add_reference_seq(self, seq_id, seq_len, matches): + for idx, rule in enumerate(self.selector.rules_table): + match_tuple = (idx, rule['Hierarchy'], rule['Overlap']) + matches_by_classifier[rule['Class']].append(match_tuple) - # Features are classified in Reference Tables (Stage 1 selection) - # For compatibility, use the seq_id with an empty classifier (index 1) - tagged_id = (seq_id, '') - self.tags[seq_id] = {tagged_id} + return matches_by_classifier - for strand in ('+', '-'): - iv = HTSeq.GenomicInterval(seq_id, 0, seq_len, strand) - matches_by_shifted_iv = self.selector.build_interval_selectors(iv, matches, tagged_id) - strand = self.map_strand(strand) + def add_reference_seq(self, seq_id, seq_len, matches_by_classifier): + for classifier, matches in matches_by_classifier.items(): + tagged_id = (seq_id, classifier) + self.tags[seq_id].add(tagged_id) - for shifted_iv, built_matches in matches_by_shifted_iv.items(): - # Sort match tuples by rank for more efficient feature selection - sorted_matches = sorted(built_matches, key=lambda x: x[1]) - self.feats[shifted_iv] += (tagged_id, strand, tuple(sorted_matches)) + for strand in ('+', '-'): + iv = HTSeq.GenomicInterval(seq_id, 0, seq_len, strand) + matches_by_shifted_iv = self.selector.build_interval_selectors(iv, matches, tagged_id) + strand = self.map_strand(strand) + + for shifted_iv, built_matches in matches_by_shifted_iv.items(): + # Sort match tuples by rank for more efficient feature selection + sorted_matches = sorted(built_matches, key=lambda x: x[1]) + self.feats[shifted_iv] += (tagged_id, strand, tuple(sorted_matches)) diff --git a/tiny/rna/counter/validation.py b/tiny/rna/counter/validation.py index 45f331f8..43fd2af5 100644 --- a/tiny/rna/counter/validation.py +++ b/tiny/rna/counter/validation.py @@ -33,9 +33,11 @@ def __init__(self, gff_files, rules, ebwt=None, genomes=None, alignments=None): self.gff_files = gff_files def validate(self): - print("Validating annotation files...") + print("Validating annotation files... ", end='') self.parse_and_validate_gffs(self.gff_files) self.validate_chroms(*self.seq_files) + print("done.") + self.report.print_report() if self.report.errors: sys.exit(1) @@ -243,9 +245,11 @@ def __init__(self, sam_files): self.sq_headers = {} def validate(self): - print("Validating sequence identifiers in SAM files...") + print("Validating sequence identifiers in SAM files... ", end='') self.read_sq_headers() self.validate_sq_headers() + print("done.") + self.report.print_report() if self.report.errors: sys.exit(1)