From 7f2087eb54d63222872398e976ae7014181a1a6a Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Sat, 4 Mar 2023 11:57:06 -0800
Subject: [PATCH 1/5] Splitting up the gigantic test class in
 unit_tests_hts_parsing.py so that there's a class for SAM_reader,
 ReferenceFeatures, and CaseInsensitiveAttrs. Also moving the function for
 running all test suites up to the top of the file for convenience

---
 tests/unit_tests_hts_parsing.py | 180 +++++++++++++++++++++++++-------
 1 file changed, 144 insertions(+), 36 deletions(-)

diff --git a/tests/unit_tests_hts_parsing.py b/tests/unit_tests_hts_parsing.py
index 3f8e9ea6..63eb25cc 100644
--- a/tests/unit_tests_hts_parsing.py
+++ b/tests/unit_tests_hts_parsing.py
@@ -17,39 +17,20 @@
 
 resources = "./testdata/counter"
 
+# To run all test suites
+if __name__ == '__main__':
+    unittest.main()
 
-class MyTestCase(unittest.TestCase):
 
+class SamReaderTests(unittest.TestCase):
     @classmethod
     def setUpClass(self):
-        self.gff_file = f"{resources}/identity_choice_test.gff3"
-        self.short_gff_file = f"{resources}/single.gff3"
-        self.short_gff = helpers.read(self.short_gff_file)
-
         self.sam_file = f"{resources}/identity_choice_test.sam"
         self.short_sam_file = f"{resources}/single.sam"
         self.short_sam = helpers.read(self.short_sam_file)
 
-        self.maxDiff = None
-
-    # === HELPERS ===
-
-    def get_gff_attr_string(self, gff_line):
-        return gff_line.split('\t')[-1]
-
-    def parse_gff_attr(self, gff_file_content):
-        attr_str = self.get_gff_attr_string(gff_file_content)
-        return parse_GFF_attribute_string(attr_str)
-
-    def selector_with_template(self, updates_list):
-        """Returns a MockFeatureSelector with the specified updates to the default rule template"""
-
-        rules = [deepcopy(helpers.rules_template[0]) for _ in range(len(updates_list))]
-        for changes, template in zip(updates_list, rules):
-            template.update(changes)
-        return FeatureSelector(rules)
-
-    def exhaust_iterator(self, it):
+    @staticmethod
+    def exhaust_iterator(it):
         collections.deque(it, maxlen=0)
 
     # === TESTS ===
@@ -70,7 +51,7 @@ def test_sam_reader(self):
         self.assertEqual(sam_record['nt5end'], 'G')
 
     """Does our custom SAM parser produce the same pertinent info as HTSeq's BAM_reader?
-    
+
     A note on SAM files: reads are always stored 5' to 3', so antisense reads are actually
     recorded in reverse complement. HTSeq automatically performs this conversion, but we
     are only really concerned about a sequence's 5' end NT, so our alignment dicts performs
@@ -91,11 +72,145 @@ def test_sam_parser_comparison(self):
                 self.assertEqual(our['Name'].decode(), their.read.name)
                 self.assertEqual(our['nt5end'], chr(their.read.seq[0]))  # See note above
                 self.assertEqual(our['Strand'], helpers.strand_to_bool(their.iv.strand))
-                if our['Strand'] is False:                               # See note above
+                if our['Strand'] is False:  # See note above
                     self.assertEqual(our['Seq'][::-1].translate(helpers.complement), their.read.seq)
                 else:
                     self.assertEqual(our['Seq'], their.read.seq)
 
+    """Does SAM_reader._get_decollapsed_filename() create an appropriate filename?"""
+
+    def test_SAM_reader_get_decollapsed_filename(self):
+        reader = SAM_reader()
+        reader.file = "~/path/to/input/sam_file.sam"
+
+        sam_out = reader._get_decollapsed_filename()
+
+        self.assertEqual(sam_out, "sam_file_decollapsed.sam")
+
+    """Does SAM_reader._read_to_first_aln() correctly identify header lines and write them to the decollapsed file?"""
+
+    def test_SAM_reader_read_thru_header(self):
+        reader = SAM_reader(decollapse=True)
+        reader._decollapsed_filename = "mock_outfile_name.sam"
+
+        with open(self.short_sam_file, 'rb') as sam_in:
+            with patch('builtins.open', mock_open()) as sam_out:
+                line = reader._read_to_first_aln(sam_in)
+
+        expected_writelines = [
+            call('mock_outfile_name.sam', 'w'),
+            call().__enter__(),
+            call().writelines(["@SQ	SN:I	LN:21\n"]),
+            call().__exit__(None, None, None)
+        ]
+
+        sam_out.assert_has_calls(expected_writelines)
+        self.assertTrue(len(reader._header_lines) == 1)
+
+    """Does SAM_reader._write_decollapsed_sam() write the correct number of duplicates to the decollapsed file?"""
+
+    def test_SAM_reader_write_decollapsed_sam(self):
+        reader = SAM_reader(decollapse=True)
+        reader.collapser_type = "tiny-collapse"
+        reader._decollapsed_reads = [(b"0_count=5", b"mock line from SAM file")]
+        reader._decollapsed_filename = "mock_outfile_name.sam"
+
+        expected_writelines = [
+            call('mock_outfile_name.sam', 'ab'),
+            call().__enter__(),
+            call().writelines([b"mock line from SAM file"] * 5),
+            call().__exit__(None, None, None)
+        ]
+
+        with patch('builtins.open', mock_open()) as outfile:
+            reader._write_decollapsed_sam()
+
+        outfile.assert_has_calls(expected_writelines)
+        self.assertTrue(len(reader._decollapsed_reads) == 0)
+
+    """Does SAM_reader._parse_alignments() save lines and write them to the decollapsed file when appropriate?"""
+
+    def test_SAM_reader_parse_alignments_decollapse(self):
+        with patch.object(SAM_reader, "_write_decollapsed_sam") as write_fn, \
+                patch('tiny.rna.counter.hts_parsing.open', new_callable=mock_open) as mopen:
+            reader = SAM_reader(decollapse=True)
+            reader._decollapsed_reads = [0] * 99999  # At 100,001, buffer will be written
+            reader.file = self.short_sam_file  # File with single alignment
+
+            with open(self.short_sam_file, 'rb') as sam_in:
+                self.exhaust_iterator(reader._parse_alignments(sam_in))
+                write_fn.assert_not_called()
+
+                # Rewind and add one more alignment to push it over threshold
+                sam_in.seek(0)
+                self.exhaust_iterator(reader._parse_alignments(sam_in))
+                write_fn.assert_called_once()
+
+    """Does SAM_reader report a single read count for non-collapsed SAM records?"""
+
+    def test_SAM_reader_single_readcount_non_collapsed_SAM(self):
+        # Read non-collapsed.sam but duplicate its single record twice
+        with open(f"{resources}/non-collapsed.sam", 'rb') as f:
+            sam_lines = f.readlines()
+            sam_lines.extend([sam_lines[1]] * 2)
+            mock_file = mock_open(read_data=b''.join(sam_lines))
+
+        with patch('tiny.rna.counter.hts_parsing.open', new=mock_file):
+            reader = SAM_reader()
+            bundle, read_count = next(reader.bundle_multi_alignments('mock_file'))
+
+        self.assertEqual(bundle[0]['Name'], b'NON_COLLAPSED_QNAME')
+        self.assertEqual(len(bundle), 3)
+        self.assertEqual(read_count, 1)
+
+    """Are decollapsed outputs skipped when non-collapsed SAM files are supplied?"""
+
+    def test_SAM_reader_no_decollapse_non_collapsed_SAM_files(self):
+        stdout_capture = io.StringIO()
+        with patch.object(SAM_reader, "_write_decollapsed_sam") as write_sam, \
+                patch.object(SAM_reader, "_write_header_for_decollapsed_sam") as write_header:
+            with contextlib.redirect_stderr(stdout_capture):
+                reader = SAM_reader(decollapse=True)
+                records = reader.bundle_multi_alignments(f"{resources}/non-collapsed.sam")
+                self.exhaust_iterator(records)
+
+        write_sam.assert_not_called()
+        write_header.assert_not_called()
+        self.assertEqual(reader.collapser_type, None)
+        self.assertEqual(stdout_capture.getvalue(),
+                         "Alignments do not appear to be derived from a supported collapser input. "
+                         "Decollapsed SAM files will therefore not be produced.\n")
+
+
+class ReferenceFeaturesTests(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(self):
+        self.gff_file = f"{resources}/identity_choice_test.gff3"
+        self.short_gff_file = f"{resources}/single.gff3"
+        self.short_gff = helpers.read(self.short_gff_file)
+
+        self.maxDiff = None
+
+    # === HELPERS ===
+
+    def get_gff_attr_string(self, gff_line):
+        return gff_line.split('\t')[-1]
+
+    def parse_gff_attr(self, gff_file_content):
+        attr_str = self.get_gff_attr_string(gff_file_content)
+        return parse_GFF_attribute_string(attr_str)
+
+    def selector_with_template(self, updates_list):
+        """Returns a MockFeatureSelector with the specified updates to the default rule template"""
+
+        rules = [deepcopy(helpers.rules_template[0]) for _ in range(len(updates_list))]
+        for changes, template in zip(updates_list, rules):
+            template.update(changes)
+        return FeatureSelector(rules)
+
+    # === TESTS ===
+
     """Were only the correct attribute keys present in the parser result?"""
 
     def test_gff_attr_keys(self):
@@ -608,12 +723,8 @@ def test_SAM_reader_no_decollapse_non_collapsed_SAM_files(self):
                 records = reader.bundle_multi_alignments(f"{resources}/non-collapsed.sam")
                 self.exhaust_iterator(records)
 
-        write_sam.assert_not_called()
-        write_header.assert_not_called()
-        self.assertEqual(reader.collapser_type, None)
-        self.assertEqual(stdout_capture.getvalue(),
-                         "Alignments do not appear to be derived from a supported collapser input. "
-                         "Decollapsed SAM files will therefore not be produced.\n")
+
+class CaseInsensitiveAttrsTests(unittest.TestCase):
 
     """Does CaseInsensitiveAttrs correctly store, check membership, and retrieve?"""
 
@@ -761,6 +872,3 @@ def test_gff_megazord(self):
         # The test is passed if this command
         # completes without throwing errors.
         rt.get(fs)
-
-if __name__ == '__main__':
-    unittest.main()

From 7613db137951395ffd26d2bb01a88776b8b1af3a Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Sat, 4 Mar 2023 12:01:19 -0800
Subject: [PATCH 2/5] Adding a test suite for ReferenceSeqs. The diff on this
 commit is wildly out of control because I wrote the new test suite before
 making the previous commit

---
 tests/unit_tests_hts_parsing.py | 223 ++++++++++++++++++++------------
 1 file changed, 143 insertions(+), 80 deletions(-)

diff --git a/tests/unit_tests_hts_parsing.py b/tests/unit_tests_hts_parsing.py
index 63eb25cc..587fdee0 100644
--- a/tests/unit_tests_hts_parsing.py
+++ b/tests/unit_tests_hts_parsing.py
@@ -9,8 +9,7 @@
 from unittest.mock import patch, mock_open, call
 
 from tiny.rna.counter.features import FeatureSelector
-from tiny.rna.counter.matching import IntervalPartialMatch
-from tiny.rna.counter.statistics import LibraryStats
+from tiny.rna.counter.matching import *
 from tiny.rna.counter.hts_parsing import *
 
 import unit_test_helpers as helpers
@@ -624,104 +623,168 @@ def test_ref_tables_tagged_match_merging(self):
         self.assertDictEqual(aliases, expected_aliases)
         self.assertDictEqual(tags, expected_tags)
 
-    """Does SAM_reader._get_decollapsed_filename() create an appropriate filename?"""
-
-    def test_SAM_reader_get_decollapsed_filename(self):
-        reader = SAM_reader()
-        reader.file = "~/path/to/input/sam_file.sam"
-
-        sam_out = reader._get_decollapsed_filename()
-
-        self.assertEqual(sam_out, "sam_file_decollapsed.sam")
 
-    """Does SAM_reader._read_to_first_aln() correctly identify header lines and write them to the decollapsed file?"""
-
-    def test_SAM_reader_read_thru_header(self):
-        reader = SAM_reader(decollapse=True)
-        reader._decollapsed_filename = "mock_outfile_name.sam"
+class ReferenceSequencesTests(unittest.TestCase):
 
-        with open(self.short_sam_file, 'rb') as sam_in:
-            with patch('builtins.open', mock_open()) as sam_out:
-                line = reader._read_to_first_aln(sam_in)
+    @classmethod
+    def setUpClass(cls) -> None:
+        cls.maxDiff = None
 
-        expected_writelines = [
-            call('mock_outfile_name.sam', 'w'),
-            call().__enter__(),
-            call().writelines(["@SQ	SN:I	LN:21\n"]),
-            call().__exit__(None, None, None)
-        ]
+    # === HELPERS ===
 
-        sam_out.assert_has_calls(expected_writelines)
-        self.assertTrue(len(reader._header_lines) == 1)
+    @staticmethod
+    def ReferenceSeqs_add(seq_id, seq_len, matches):
+        rs = ReferenceSeqs({seq_id, seq_len})
+        rs.selector = FeatureSelector([])
+        rs.add_reference_seq(seq_id, seq_len, matches)
+        return rs
 
-    """Does SAM_reader._write_decollapsed_sam() write the correct number of duplicates to the decollapsed file?"""
+    @staticmethod
+    def get_steps(rs, chrom):
+        return {(step[0], step[1]): step[2]
+                  for step in rs.feats[chrom]['.'].array.get_steps()}
 
-    def test_SAM_reader_write_decollapsed_sam(self):
-        reader = SAM_reader(decollapse=True)
-        reader.collapser_type = "tiny-collapse"
-        reader._decollapsed_reads = [(b"0_count=5", b"mock line from SAM file")]
-        reader._decollapsed_filename = "mock_outfile_name.sam"
+    # === TESTS ===
 
-        expected_writelines = [
-            call('mock_outfile_name.sam', 'ab'),
-            call().__enter__(),
-            call().writelines([b"mock line from SAM file"] * 5),
-            call().__exit__(None, None, None)
-        ]
+    """Does add_reference_seq() produce the expected GenomicArray for an untagged rule matching a single feature?"""
 
-        with patch('builtins.open', mock_open()) as outfile:
-            reader._write_decollapsed_sam()
+    def test_add_reference_seq_single(self):
+        seq_id = "seq"
+        seq_len = 10
+        matches = {'': [(0, 0, "partial")]}
 
-        outfile.assert_has_calls(expected_writelines)
-        self.assertTrue(len(reader._decollapsed_reads) == 0)
+        rs = self.ReferenceSeqs_add(seq_id, seq_len, matches)
+        actual = self.get_steps(rs, seq_id)
 
-    """Does SAM_reader._parse_alignments() save lines and write them to the decollapsed file when appropriate?"""
+        # The record_tuples should have intervals on both strands
+        # and the overlap selector should have the same interval.
+        # For these selectors, same interval on both strands.
+        iv = HTSeq.GenomicInterval(seq_id, 0, seq_len)
+        match_tuple = ((0, 0, IntervalPartialMatch(iv)),)
 
-    def test_SAM_reader_parse_alignments_decollapse(self):
-        with patch.object(SAM_reader, "_write_decollapsed_sam") as write_fn, \
-                patch('tiny.rna.counter.hts_parsing.open', new_callable=mock_open) as mopen:
+        expected = {
+            (0, 10): {
+                ((seq_id, ''), True,  match_tuple), # sense
+                ((seq_id, ''), False, match_tuple)  # antisense
+            },
+            (10, sys.maxsize): set()
+        }
 
-            reader = SAM_reader(decollapse=True)
-            reader._decollapsed_reads = [0] * 99999     # At 100,001, buffer will be written
-            reader.file = self.short_sam_file           # File with single alignment
+        self.assertDictEqual(actual, expected)
 
-            with open(self.short_sam_file, 'rb') as sam_in:
-                self.exhaust_iterator(reader._parse_alignments(sam_in))
-                write_fn.assert_not_called()
+    """Does add_reference_seq() produce the expected GenomicArray when rules share a classifier?"""
 
-                # Rewind and add one more alignment to push it over threshold
-                sam_in.seek(0)
-                self.exhaust_iterator(reader._parse_alignments(sam_in))
-                write_fn.assert_called_once()
+    def test_add_reference_seq_shared_classifier(self):
+        seq_id = "seq"
+        seq_len = 10
+        matches = {'shared': [(0, 0, "partial"), (1, 1, "nested")]}
 
-    """Does SAM_reader report a single read count for non-collapsed SAM records?"""
+        rs = self.ReferenceSeqs_add(seq_id, seq_len, matches)
+        actual = self.get_steps(rs, seq_id)
 
-    def test_SAM_reader_single_readcount_non_collapsed_SAM(self):
-        # Read non-collapsed.sam but duplicate its single record twice
-        with open(f"{resources}/non-collapsed.sam", 'rb') as f:
-            sam_lines = f.readlines()
-            sam_lines.extend([sam_lines[1]] * 2)
-            mock_file = mock_open(read_data=b''.join(sam_lines))
-
-        with patch('tiny.rna.counter.hts_parsing.open', new=mock_file):
-            reader = SAM_reader()
-            bundle, read_count = next(reader.bundle_multi_alignments('mock_file'))
+        # The record_tuples will have intervals on both strands
+        # and the overlap selector should have the same interval.
+        # For these selectors, same interval on both strands.
+        iv = HTSeq.GenomicInterval(seq_id, 0, seq_len)
+        match_tuples = (0, 0, IntervalPartialMatch(iv)), (1, 1, IntervalNestedMatch(iv))
 
-        self.assertEqual(bundle[0]['Name'], b'NON_COLLAPSED_QNAME')
-        self.assertEqual(len(bundle), 3)
-        self.assertEqual(read_count, 1)
+        expected = {
+            (0, 10): {
+                ((seq_id, "shared"), True,  match_tuples),  # sense
+                ((seq_id, "shared"), False, match_tuples)   # antisense
+            },
+            (10, sys.maxsize): set()
+        }
 
-    """Are decollapsed outputs skipped when non-collapsed SAM files are supplied?"""
+        self.assertDictEqual(actual, expected)
+
+    """Does add_reference_seq() produce the expected GenomicArray when shift parameters
+    from different rules produce the same interval?"""
+
+    def test_add_reference_seq_shared_iv(self):
+        seq_id = "seq"
+        seq_len = 10
+        matches = {'exact': [(0, 0, "exact, 2, -2")], 'nested': [(1, 1, "nested, 2, -2")]}
+
+        rs = self.ReferenceSeqs_add(seq_id, seq_len, matches)
+        actual = self.get_steps(rs, seq_id)
+
+        # The record_tuples will have intervals on both strands
+        # and the overlap selector should have the same interval.
+        # For these selectors, same interval on both strands.
+        iv = HTSeq.GenomicInterval(seq_id, 2, seq_len - 2)
+        match_exact =  ((0, 0, IntervalExactMatch(iv)),)
+        match_nested = ((1, 1, IntervalNestedMatch(iv)),)
+
+        expected = {
+            (0, 2): set(),
+            (2, 8): {
+                ((seq_id, "exact"),  True,  match_exact),   # sense
+                ((seq_id, "exact"),  False, match_exact),   # antisense
+                ((seq_id, "nested"), True,  match_nested),  # sense
+                ((seq_id, "nested"), False, match_nested)   # antisense
+            },
+            (8, sys.maxsize): set()
+        }
 
-    def test_SAM_reader_no_decollapse_non_collapsed_SAM_files(self):
-        stdout_capture = io.StringIO()
-        with patch.object(SAM_reader, "_write_decollapsed_sam") as write_sam, \
-                patch.object(SAM_reader, "_write_header_for_decollapsed_sam") as write_header:
+        self.assertDictEqual(actual, expected)
+    
+    """Does add_reference_seq() produce the expected GenomicArray when there are
+    multiple tagged rules with different shift intervals, including asymmetric shifts 
+    and an overlap selector that produces different intervals on each strand?"""
+
+    def test_add_reference_seq_complex(self):
+        seq_id = "seq"
+        seq_len = 10
+        matches = {
+            "class1": [(0, 0, "nested, 1, -1"), (0, 0, "exact, 5, 0")],
+            "class2": [(0, 0, "5' anchored, 5, 0")]
+        }
 
-            with contextlib.redirect_stderr(stdout_capture):
-                reader = SAM_reader(decollapse=True)
-                records = reader.bundle_multi_alignments(f"{resources}/non-collapsed.sam")
-                self.exhaust_iterator(records)
+        rs = self.ReferenceSeqs_add(seq_id, seq_len, matches)
+        actual = self.get_steps(rs, seq_id)
+
+        # Since nested shift is symmetric, iv is same on both strands
+        iv_n = HTSeq.GenomicInterval(seq_id, 1, seq_len-1)
+        match_nested = ((0, 0, IntervalNestedMatch(iv_n)),)
+
+        # Since exact and anchored shift is asymmetric and by the same
+        # amount, iv differs per strand but is shared by both selectors.
+        # If they both had the same classifier then these match tuples
+        # would share the same record tuple on both strands
+        iv_e5_s = HTSeq.GenomicInterval(seq_id, 5, seq_len, '+')
+        match_exact_sense = ((0, 0, IntervalExactMatch(iv_e5_s)),)
+        match_5anch_sense = ((0, 0, Interval5pMatch(iv_e5_s)),)
+
+        iv_e5_a = HTSeq.GenomicInterval(seq_id, 0, seq_len-5, '-')
+        match_exact_antis = ((0, 0, IntervalExactMatch(iv_e5_a)),)
+        match_5anch_antis = ((0, 0, Interval5pMatch(iv_e5_a)),)
+
+        expected = {
+            (0, 1): {
+                (('seq', 'class2'), False, match_5anch_antis),
+                (('seq', 'class1'), False, match_exact_antis),
+            },
+            (1, 5): {
+                (('seq', 'class1'), True,  match_nested),
+                (('seq', 'class2'), False, match_5anch_antis),
+                (('seq', 'class1'), False, match_exact_antis),
+                (('seq', 'class1'), False, match_nested)
+            },
+            (5, 9): {
+                (('seq', 'class1'), True,  match_exact_sense),
+                (('seq', 'class1'), True,  match_nested),
+                (('seq', 'class2'), True,  match_5anch_sense),
+                (('seq', 'class1'), False, match_nested)
+            },
+            (9, 10): {
+                (('seq', 'class1'), True,  match_exact_sense),
+                (('seq', 'class2'), True,  match_5anch_sense)
+            },
+            (10, sys.maxsize): set()
+        }
+        
+        self.assertDictEqual(actual, expected)
 
 
 class CaseInsensitiveAttrsTests(unittest.TestCase):

From 9bdce0da680c04c987620026d18f08d535a0de0f Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Sat, 4 Mar 2023 12:31:50 -0800
Subject: [PATCH 3/5] Adding support for classifiers in sequence-based
 counting. The output format of get_matches() is consistent with that of
 ReferenceFeatures.get_matches(), though that doesn't factor into the current
 design. The procedure in add_reference_seq() closely resembles the
 established procedure used by ReferenceFeatures in _add_subinterval_matches()

---
 tiny/rna/counter/hts_parsing.py | 48 +++++++++++++++++----------------
 1 file changed, 25 insertions(+), 23 deletions(-)

diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py
index eefe28d8..cf563aa6 100644
--- a/tiny/rna/counter/hts_parsing.py
+++ b/tiny/rna/counter/hts_parsing.py
@@ -817,16 +817,16 @@ class ReferenceSeqs(ReferenceBase):
 
     def __init__(self, reference_seqs, **prefs):
         super().__init__(**prefs)
+        self.tags = defaultdict(set)
         self.seqs = reference_seqs
         self.alias = {}
-        self.tags = {}
 
     def get(self, selector):
         self.selector = selector
-        match_tuples = self.get_matches()
+        matches = self.get_matches()
 
         for seq_id, seq_len in self.seqs.items():
-            self.add_reference_seq(seq_id, seq_len, match_tuples)
+            self.add_reference_seq(seq_id, seq_len, matches)
 
         # GenomicArray is built. Clear cache...
         self.selector.overlap_cache.clear()
@@ -838,30 +838,32 @@ def get(self, selector):
         aliases = {seq_id: () for seq_id in self.tags}
         return self.feats, aliases, self.tags
 
-    def get_matches(self):
+    def get_matches(self) -> Dict[str, List[Tuple[int, int, str]]]:
         """Stage 1 selection is skipped in sequence-based counting.
-        Simply build match_tuples for all rules. These will be used
+        Want the reference sequences to enter Stage 2 as though they
+        were features that matched every rule in Stage 1. Simply return
+        match_tuples and their classifier for all rules. These are used
         uniformly in each reference sequence's feature_record_tuple"""
 
-        return [(idx, rule['Hierarchy'], rule['Overlap'])
-                for idx, rule in sorted(
-                    enumerate(self.selector.rules_table),
-                    key=lambda x: x[1]['Hierarchy']
-                )]
+        matches_by_classifier = defaultdict(list)
 
-    def add_reference_seq(self, seq_id, seq_len, matches):
+        for idx, rule in enumerate(self.selector.rules_table):
+            match_tuple = (idx, rule['Hierarchy'], rule['Overlap'])
+            matches_by_classifier[rule['Class']].append(match_tuple)
 
-        # Features are classified in Reference Tables (Stage 1 selection)
-        # For compatibility, use the seq_id with an empty classifier (index 1)
-        tagged_id = (seq_id, '')
-        self.tags[seq_id] = {tagged_id}
+        return matches_by_classifier
 
-        for strand in ('+', '-'):
-            iv = HTSeq.GenomicInterval(seq_id, 0, seq_len, strand)
-            matches_by_shifted_iv = self.selector.build_interval_selectors(iv, matches, tagged_id)
-            strand = self.map_strand(strand)
+    def add_reference_seq(self, seq_id, seq_len, matches_by_classifier):
+        for classifier, matches in matches_by_classifier.items():
+            tagged_id = (seq_id, classifier)
+            self.tags[seq_id].add(tagged_id)
 
-            for shifted_iv, built_matches in matches_by_shifted_iv.items():
-                # Sort match tuples by rank for more efficient feature selection
-                sorted_matches = sorted(built_matches, key=lambda x: x[1])
-                self.feats[shifted_iv] += (tagged_id, strand, tuple(sorted_matches))
+            for strand in ('+', '-'):
+                iv = HTSeq.GenomicInterval(seq_id, 0, seq_len, strand)
+                matches_by_shifted_iv = self.selector.build_interval_selectors(iv, matches, tagged_id)
+                strand = self.map_strand(strand)
+
+                for shifted_iv, built_matches in matches_by_shifted_iv.items():
+                    # Sort match tuples by rank for more efficient feature selection
+                    sorted_matches = sorted(built_matches, key=lambda x: x[1])
+                    self.feats[shifted_iv] += (tagged_id, strand, tuple(sorted_matches))

From 435ca6f8cb34d0daf03b61b53af1de663c4f6277 Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Sat, 4 Mar 2023 12:46:22 -0800
Subject: [PATCH 4/5] The validation step for GFFs and SAM @SQ headers now
 reports when validation is complete

---
 tiny/rna/counter/validation.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tiny/rna/counter/validation.py b/tiny/rna/counter/validation.py
index 45f331f8..43fd2af5 100644
--- a/tiny/rna/counter/validation.py
+++ b/tiny/rna/counter/validation.py
@@ -33,9 +33,11 @@ def __init__(self, gff_files, rules, ebwt=None, genomes=None, alignments=None):
         self.gff_files = gff_files
 
     def validate(self):
-        print("Validating annotation files...")
+        print("Validating annotation files... ", end='')
         self.parse_and_validate_gffs(self.gff_files)
         self.validate_chroms(*self.seq_files)
+        print("done.")
+
         self.report.print_report()
         if self.report.errors:
             sys.exit(1)
@@ -243,9 +245,11 @@ def __init__(self, sam_files):
         self.sq_headers = {}
 
     def validate(self):
-        print("Validating sequence identifiers in SAM files...")
+        print("Validating sequence identifiers in SAM files... ", end='')
         self.read_sq_headers()
         self.validate_sq_headers()
+        print("done.")
+
         self.report.print_report()
         if self.report.errors:
             sys.exit(1)

From 6b0913a43e33d4f2df493442deda3c8f288741d1 Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Sat, 4 Mar 2023 12:48:05 -0800
Subject: [PATCH 5/5] Updating documentation to note that classifiers are used
 during sequence-based counting

---
 doc/tiny-count.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/tiny-count.md b/doc/tiny-count.md
index 219af31a..96438d47 100644
--- a/doc/tiny-count.md
+++ b/doc/tiny-count.md
@@ -27,7 +27,7 @@ Selection occurs in three stages, with the output of each stage as input to the
 ![Feature Selection Diagram](../images/tiny-count_selection.png)
 
 ## Sequence-Based Counting Mode
-If GFF files aren't specified in your Paths File, Stage 1 selection is skipped and reads are counted by sequence rather than by feature. Reference sequence names and lengths are determined from the `@SQ` headers of input SAM files. In Stages 2 and 3, these reference sequences behave like features that had matched every rule in Stage 1.
+If GFF files aren't specified in your Paths File, Stage 1 selection is skipped and reads are counted by sequence rather than by feature. Reference sequence names and lengths are determined from the `@SQ` headers of input SAM files. In Stages 2 and 3, these reference sequences behave like features that had matched every rule in Stage 1. Selection takes place for both the sense and antisense copy of these sequences, and read counts are subset per-sequence if any `Classify as...` values are provided.
  
 ## Stage 1: Feature Attribute Parameters
 | _Features Sheet Selectors:_ | Select for... | with value... | Classify as... | Source Filter | Type Filter |