From 8b1114a9691652e4877bfdb138ea4445e5136b0d Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Fri, 27 Jan 2023 14:00:37 -0800
Subject: [PATCH 01/21] Updating PathsFile class to no longer treat gff_files
 as required, and to skip empty paths under the gff_files key

---
 tiny/rna/configuration.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/tiny/rna/configuration.py b/tiny/rna/configuration.py
index e2cdb2db..88f00a85 100644
--- a/tiny/rna/configuration.py
+++ b/tiny/rna/configuration.py
@@ -449,7 +449,7 @@ class PathsFile(ConfigBase):
     """
 
     # Parameter types
-    required = ('samples_csv', 'features_csv', 'gff_files')
+    required = ('samples_csv', 'features_csv')
     single =   ('samples_csv', 'features_csv', 'plot_style_sheet', 'adapter_fasta')
     groups =   ('reference_genome_files', 'gff_files')
     prefix =   ('ebwt', 'run_directory', 'tmp_directory')
@@ -508,9 +508,10 @@ def validate_paths(self):
             "The following parameters are required in {selfname}: {params}" \
             .format(selfname=self.basename, params=', '.join(self.required))
 
-        assert any(gff.get('path') for gff in self['gff_files']), \
-            "At least one GFF file path must be specified under gff_files in {selfname}" \
-            .format(selfname=self.basename)
+        if not any(gff.get('path') for gff in self['gff_files']):
+            print("No GFF files were specified in {selfname} so "
+                  "reads will be counted in non-genomic mode."
+                  .format(selfname=self.basename))
 
         # Some entries in Paths File are omitted from tiny-count's working directory during
         #  pipeline runs. There is no advantage to checking file existence here vs. in load_*
@@ -526,6 +527,7 @@ def validate_paths(self):
         for key in self.groups:
             for entry in self[key]:
                 if isinstance(entry, dict): entry = entry['path']
+                if not entry: continue
                 assert os.path.isfile(entry), \
                     "The following file provided under {key} in {selfname} could not be found:\n\t{file}" \
                     .format(key=key, selfname=self.basename, file=entry)

From 0b3cc407ba0e77c91a190e6d25c947ce0e1cdd0f Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Fri, 27 Jan 2023 14:02:41 -0800
Subject: [PATCH 02/21] Minor refactor to match the order of enumerate()'s
 (index, element) outputs since this code is essentially an enumerate() with
 an upper limit

---
 tiny/rna/counter/validation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tiny/rna/counter/validation.py b/tiny/rna/counter/validation.py
index dbc6d5cc..ae8c757c 100644
--- a/tiny/rna/counter/validation.py
+++ b/tiny/rna/counter/validation.py
@@ -241,7 +241,7 @@ def alignment_chroms_mismatch_heuristic(self, sam_files: List[str], subset_size=
         for file in sam_files:
             file_chroms = set()
             with open(file, 'rb') as f:
-                for line, i in zip(f, range(subset_size)):
+                for i, line in zip(range(subset_size), f):
                     if line[0] == ord('@'): continue
                     file_chroms.add(line.split(b'\t')[2].strip().decode())
                     if i % 10000 == 0 and len(file_chroms & self.chrom_set): break

From bd168cca3cb596be1d6a06a0d0fe1b9c6ea0b542 Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Fri, 27 Jan 2023 14:04:28 -0800
Subject: [PATCH 03/21] Updating tiny-count's load_gff_files() to treat GFF
 files as optional

---
 tiny/rna/counter/counter.py | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/tiny/rna/counter/counter.py b/tiny/rna/counter/counter.py
index a20dd3a1..1d4cbe41 100644
--- a/tiny/rna/counter/counter.py
+++ b/tiny/rna/counter/counter.py
@@ -178,23 +178,21 @@ def load_gff_files(paths: PathsFile, libraries: List[dict], rules: List[dict]) -
 
     # Build dictionary of unique files and allowed aliases
     for gff in paths['gff_files']:
-        gff_files[gff['path']].extend(
-            alias for alias in gff.get('alias', ())
-            if alias.lower() not in ignore_alias
-        )
+        if gff['path'] is not None:
+            gff_files[gff['path']].extend(
+                alias for alias in gff.get('alias', ())
+                if alias.lower() not in ignore_alias
+            )
 
     # Remove duplicate aliases (per file), keep original order
     for file, alias in gff_files.items():
         gff_files[file] = sorted(set(alias), key=alias.index)
 
-    # Prepare supporting file inputs for GFF validation
-    sam_files = [lib['File'] for lib in libraries]
+    if gff_files:
+        # Prepare supporting file inputs for GFF validation
+        sam_files = [lib['File'] for lib in libraries]
+        GFFValidator(gff_files, rules, alignments=sam_files).validate()
 
-    # Todo: update GFFValidator so that genomes and ebwt can be safely passed here
-    # ref_genomes = [map_path(fasta) for fasta in paths['reference_genome_files']]
-    # ebwt_prefix = map_path(paths['ebwt'])
-
-    GFFValidator(gff_files, rules, alignments=sam_files).validate()
     return gff_files
 
 
@@ -235,7 +233,7 @@ def main():
 
         # global for multiprocessing
         global counter
-        counter = FeatureCounter(gff_files, selection, **args)
+        counter = FeatureCounter(gff_files, libraries, selection, **args)
 
         # Assign and count features using multiprocessing and merge results
         merged_counts = map_and_reduce(libraries, paths, args)

From 46ffd44af336f0a846dc7b1cd365f02e217dfb0f Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Fri, 27 Jan 2023 14:05:47 -0800
Subject: [PATCH 04/21] Updating SAM_reader to properly store header data when
 multiple header lines start with the same flag

---
 tiny/rna/counter/hts_parsing.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py
index 031f6e7b..39341791 100644
--- a/tiny/rna/counter/hts_parsing.py
+++ b/tiny/rna/counter/hts_parsing.py
@@ -38,7 +38,7 @@ def __init__(self, **prefs):
         self._decollapsed_filename = None
         self._decollapsed_reads = []
         self._header_lines = []
-        self._header_dict = {}
+        self._header_dict = defaultdict(list)
 
     def bundle_multi_alignments(self, file: str) -> Iterator[Bundle]:
         """Bundles multi-alignments (determined by a shared QNAME) and reports the associated read's count"""
@@ -142,10 +142,11 @@ def _parse_header_line(self, header_line: str) -> None:
         fields = header_line[3:].strip().split('\t')
 
         if rec_type == "@CO":
-            self._header_dict[rec_type] = fields[0]
+            self._header_dict[rec_type].append(fields[0])
         else:
-            self._header_dict[rec_type] = \
+            self._header_dict[rec_type].append(
                 {field[:2]: field[3:].strip() for field in fields}
+            )
 
     def _determine_collapser_type(self, first_aln_line: str) -> None:
         """Attempts to determine the collapsing utility that was used before producing the
@@ -157,7 +158,7 @@ def _determine_collapser_type(self, first_aln_line: str) -> None:
         elif re.match(_re_fastx, first_aln_line) is not None:
             self.collapser_type = "fastx"
 
-            sort_order = self._header_dict.get('@HD', {}).get('SO', None)
+            sort_order = self._header_dict.get('@HD', [{}])[0].get('SO', None)
             if sort_order is None or sort_order != "queryname":
                 raise ValueError("SAM files from fastx collapsed outputs must be sorted by queryname\n"
                                  "(and the @HD [...] SO header must be set accordingly).")

From e7c6c7e89021c3803b8ef06225ac359e70e5b856 Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Fri, 27 Jan 2023 14:08:33 -0800
Subject: [PATCH 05/21] Refactoring ReferenceTables to use a base class for
 functionality that is shared between it and the new class for non-genomic
 references. The overlap between the two is pretty much the
 GenomicArray/StepVector and related functions

---
 tiny/rna/counter/hts_parsing.py | 115 +++++++++++++++++---------------
 1 file changed, 62 insertions(+), 53 deletions(-)

diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py
index 39341791..0769a393 100644
--- a/tiny/rna/counter/hts_parsing.py
+++ b/tiny/rna/counter/hts_parsing.py
@@ -6,6 +6,7 @@
 
 from collections import Counter, defaultdict
 from typing import Tuple, List, Dict, Iterator, Optional, DefaultDict, Set, Union, IO, Callable
+from abc import ABC, abstractmethod
 from urllib.parse import unquote
 from inspect import stack
 
@@ -409,6 +410,58 @@ def not_implemented(self):
         raise NotImplementedError(f"CaseInsensitiveAttrs does not support {stack()[1].function}")
 
 
+class AnnotationParsing(ABC):
+    def __init__(self, feature_selector, **prefs):
+        self.stepvector = prefs.get('stepvector', 'HTSeq')
+        self.selector = feature_selector
+        self.feats = self._init_genomic_array()
+
+    @abstractmethod
+    def get(self): pass
+
+    def _init_genomic_array(self):
+        if self.stepvector == 'Cython':
+            try:
+                from tiny.rna.counter.stepvector import StepVector
+                setattr(HTSeq.StepVector, 'StepVector', StepVector)
+                return HTSeq.GenomicArray("auto", stranded=False)
+            except ModuleNotFoundError:
+                self.stepvector = 'HTSeq'
+                print("Failed to import Cython StepVector\n"
+                      "Falling back to HTSeq's StepVector",
+                      file=sys.stderr)
+
+        if self.stepvector == 'HTSeq':
+            return HTSeq.GenomicArrayOfSets("auto", stranded=False)
+
+    def chrom_vector_setdefault(self, chrom):
+        """Behaves like dict.setdefault() for chrom_vectors. Even though chrom_vectors are
+        dictionaries themselves, calling setdefault on them will break the GenomicArrayOfSets"""
+
+        if chrom not in self.feats.chrom_vectors:
+            self.feats.add_chrom(chrom)
+
+    def get_feats_table_size(self) -> int:
+        """Returns the sum of features across all chromosomes and strands"""
+
+        total_feats = 0
+        empty_size = {"Cython": 1, "HTSeq": 3}[self.stepvector]
+        for chrom in self.feats.chrom_vectors:
+            for strand in self.feats.chrom_vectors[chrom]:
+                total_feats += self.feats.chrom_vectors[chrom][strand].array.num_steps() - empty_size
+
+        return total_feats
+
+    @staticmethod
+    def map_strand(htseq_value: str):
+        """Maps HTSeq's strand representation (+/-/.) to
+        tinyRNA's strand representation (True/False/None)"""
+
+        return {
+            '+': True,
+            '-': False,
+        }.get(htseq_value, None)
+
 def parse_gff(file, row_fn: Callable, alias_keys=None):
     if alias_keys is not None:
         row_fn = functools.partial(row_fn, alias_keys=alias_keys)
@@ -433,8 +486,7 @@ def parse_gff(file, row_fn: Callable, alias_keys=None):
 TagTable = DefaultDict[str, Set[Tuple[str, str]]]
 GenomicArray = HTSeq.GenomicArrayOfSets
 
-
-class ReferenceTables:
+class ReferenceTables(AnnotationParsing):
     """A GFF parser which builds feature, alias, and class reference tables
 
     Discontinuous features are supported, and comma-separated attribute values (GFF3 column 9)
@@ -462,17 +514,16 @@ class ReferenceTables:
     type_filter = []
 
     def __init__(self, gff_files: Dict[str, list], feature_selector, **prefs):
+        super().__init__(feature_selector, **prefs)
         self.all_features = prefs.get('all_features', False)
-        self.stepvector = prefs.get('stepvector', 'HTSeq')
-        self.selector = feature_selector
         self.gff_files = gff_files
-        # ----------------------------------------------------------- Primary Key:
-        self.feats = self._init_genomic_array()                         # Root Match ID
-        self.parents, self.filtered = {}, set()                         # Original Feature ID
-        self.intervals = defaultdict(list)                              # Root Feature ID
-        self.matches = defaultdict(set)                                 # Root Match ID
-        self.alias = defaultdict(set)                                   # Root Feature ID
-        self.tags = defaultdict(set)                                    # Root Feature ID -> Root Match ID
+        # ----------------------------------------------- Primary Key:
+        # self.feats                                        # Root Match ID
+        self.parents, self.filtered = {}, set()             # Original Feature ID
+        self.intervals = defaultdict(list)                  # Root Feature ID
+        self.matches = defaultdict(set)                     # Root Match ID
+        self.alias = defaultdict(set)                       # Root Feature ID
+        self.tags = defaultdict(set)                        # Root Feature ID -> Root Match ID
 
         # Patch the GFF attribute parser to support comma separated attribute value lists
         setattr(HTSeq.features.GFF_Reader, 'parse_GFF_attribute_string', staticmethod(parse_GFF_attribute_string))
@@ -664,26 +715,6 @@ def _merge_adjacent_subintervals(unmerged_sub_ivs: List[HTSeq.GenomicInterval])
 
         return merged_ivs
 
-    def get_feats_table_size(self) -> int:
-        """Returns the sum of features across all chromosomes and strands"""
-
-        total_feats = 0
-        empty_size = {"Cython": 1, "HTSeq": 3}[self.stepvector]
-        for chrom in self.feats.chrom_vectors:
-            for strand in self.feats.chrom_vectors[chrom]:
-                total_feats += self.feats.chrom_vectors[chrom][strand].array.num_steps() - empty_size
-
-        return total_feats
-
-    def map_strand(self, gff_value: str):
-        """Maps HTSeq's strand representation (+/-/.) to
-        tinyRNA's strand representation (True/False/None)"""
-
-        return {
-            '+': True,
-            '-': False,
-        }.get(gff_value, None)
-
     def was_matched(self, untagged_id):
         """Checks if the feature ID previously matched on identity, regardless of whether
         the matching rule was tagged or untagged."""
@@ -691,13 +722,6 @@ def was_matched(self, untagged_id):
         # any() will short circuit on first match when provided a generator function
         return any(tagged_id in self.matches for tagged_id in self.tags.get(untagged_id, ()))
 
-    def chrom_vector_setdefault(self, chrom):
-        """Behaves like dict.setdefault() for chrom_vectors. Even though chrom_vectors are
-        dictionaries themselves, calling setdefault on them will break the GenomicArrayOfSets"""
-
-        if chrom not in self.feats.chrom_vectors:
-            self.feats.add_chrom(chrom)
-
     @staticmethod
     def get_feature_id(row):
         id_collection = row.attr.get('ID') \
@@ -713,21 +737,6 @@ def get_feature_id(row):
 
         return id_collection[0]
 
-    def _init_genomic_array(self):
-        if self.stepvector == 'Cython':
-            try:
-                from tiny.rna.counter.stepvector import StepVector
-                setattr(HTSeq.StepVector, 'StepVector', StepVector)
-                return HTSeq.GenomicArray("auto", stranded=False)
-            except ModuleNotFoundError:
-                self.stepvector = 'HTSeq'
-                print("Failed to import Cython StepVector\n"
-                      "Falling back to HTSeq's StepVector",
-                      file=sys.stderr)
-
-        if self.stepvector == 'HTSeq':
-            return HTSeq.GenomicArrayOfSets("auto", stranded=False)
-
     @staticmethod
     def column_filter_match(row, rule):
         """Checks if the GFF row passes the inclusive filter(s)

From 9841d6e7995131767740472bd83d3b9254289b8e Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Fri, 27 Jan 2023 14:15:12 -0800
Subject: [PATCH 06/21] Adding a new class that is essentially the equivalent
 of ReferenceTables but for non-GFF read counting. It produces a GenomicArray
 of reference sequences, where there is one "chromosome" for each sequence,
 and each chromosome contains an entry for the sequence on the + and - strand.

Tags and aliases don't apply in this mode, but they are returned nonetheless for compatibility
---
 tiny/rna/counter/hts_parsing.py | 74 +++++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)

diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py
index 0769a393..c89250ae 100644
--- a/tiny/rna/counter/hts_parsing.py
+++ b/tiny/rna/counter/hts_parsing.py
@@ -743,3 +743,77 @@ def column_filter_match(row, rule):
         If both filters are defined then they must both evaluate to true for a match"""
 
         return row.source in rule['Filter_s'] and row.type in rule['Filter_t']
+
+
+class NonGenomicAnnotations(AnnotationParsing):
+
+    def __init__(self, sam_files, feature_selector, **prefs):
+        super().__init__(feature_selector, **prefs)
+        self.sam_files = sam_files
+        self.alias = {}
+        self.tags = {}
+
+    @report_execution_time("Non-genomic annotations parsing")
+    def get(self):
+        ref_seqs = self.get_reference_seq_definitions()
+        unbuilt_match_tuples = self.get_matches()
+
+        for seq_id, seq_len in ref_seqs.items():
+            self.add_reference_seq(seq_id, seq_len, unbuilt_match_tuples)
+
+        # Aliases are irrelevant for non-GFF references
+        aliases = {seq_id: () for seq_id in self.tags}
+        return self.feats, aliases, self.tags
+
+    def get_reference_seq_definitions(self) -> Dict[str,int]:
+        reader = SAM_reader()
+        for sam in self.sam_files:
+            with open(sam['File'], 'rb') as f:
+                reader._read_to_first_aln(f)
+
+        ref_seqs = defaultdict(set)
+        for sq in reader._header_dict['@SQ']:
+            seq_id = sq['SN']
+            seq_len = int(sq['LN'])
+            ref_seqs[seq_id].add(seq_len)
+
+        return self.get_seq_len_consensus(ref_seqs)
+
+    @staticmethod
+    def get_seq_len_consensus(ref_seqs):
+        consensus = {}
+        for seq_id, lengths in ref_seqs.items():
+            if len(lengths) > 1:
+                msg = f"Reference sequence identifier {seq_id} " \
+                      "was given multiple lengths in your SAM files. " \
+                      "This might indicate that your files were produced " \
+                      "by alignments that used different indexes."
+                raise ValueError(msg)
+            consensus[seq_id] = lengths.pop()
+
+        return consensus
+
+    def get_matches(self):
+        """Stage 1 selection is skipped in non-genomic counting.
+        Simply build match_tuples for all rules. These will be used
+        uniformly in each reference sequence's feature_record_tuple"""
+
+        return [(idx, rule['Hierarchy'], rule['Overlap'])
+                for idx, rule in sorted(
+                    enumerate(self.selector.rules_table),
+                    key=lambda x: x[1]['Hierarchy']
+                )]
+
+    def add_reference_seq(self, seq_id, seq_len, unbuilt_match_tuples):
+
+        # Features are classified in Reference Tables (Stage 1 selection)
+        # For compatibility, use the seq_id with an empty classifier (index 1)
+        tagged_id = (seq_id,)
+        self.tags[seq_id] = {tagged_id}
+
+        for strand in ('+', '-'):
+            iv = HTSeq.GenomicInterval(seq_id, 0, seq_len, strand)
+            match_tuples = self.selector.build_interval_selectors(iv, unbuilt_match_tuples.copy())
+            tinyrna_strand = self.map_strand(strand)
+
+            self.feats[iv] += (tagged_id, tinyrna_strand, tuple(match_tuples))
\ No newline at end of file

From 3f88265b88e0f1f8340d6f619f32573c4b932c8f Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Fri, 27 Jan 2023 14:17:22 -0800
Subject: [PATCH 07/21] Updating FeatureCounter to switch between the two
 reference annotation classes based on the presence of GFF files. Also
 changing the order in which assign_features and count_reads occur in the
 class because my preference has changed since it was written

---
 tiny/rna/counter/features.py | 46 ++++++++++++++++++++----------------
 1 file changed, 26 insertions(+), 20 deletions(-)

diff --git a/tiny/rna/counter/features.py b/tiny/rna/counter/features.py
index b17c0cdc..73cd540f 100644
--- a/tiny/rna/counter/features.py
+++ b/tiny/rna/counter/features.py
@@ -4,7 +4,7 @@
 from collections import defaultdict
 from typing import List, Tuple, Set, Dict, Mapping
 
-from tiny.rna.counter.hts_parsing import ReferenceTables, SAM_reader
+from tiny.rna.counter.hts_parsing import ReferenceTables, NonGenomicAnnotations, SAM_reader
 from .statistics import LibraryStats
 from .matching import *
 
@@ -27,30 +27,20 @@ def __init__(_, features: HTSeq.GenomicArrayOfSets, aliases: dict, classes: dict
 
 class FeatureCounter:
 
-    def __init__(self, gff_file_set, selection_rules, **prefs):
+    def __init__(self, gff_file_set, sam_file_set, selection_rules, **prefs):
         self.stats = LibraryStats(**prefs)
         self.sam_reader = SAM_reader(**prefs)
         self.selector = FeatureSelector(selection_rules, self.stats, **prefs)
 
-        reference_tables = ReferenceTables(gff_file_set, self.selector, **prefs)
-        Features(*reference_tables.get())
-        self.prefs = prefs
+        if gff_file_set:
+            self.mode = "genomic"
+            annotations = ReferenceTables(gff_file_set, self.selector, **prefs)
+        else:
+            self.mode = "non-genomic"
+            annotations = NonGenomicAnnotations(sam_file_set, self.selector, **prefs)
 
-    def assign_features(self, al: dict) -> Tuple[dict, int]:
-        """Determines features associated with the interval then performs rule-based feature selection"""
-
-        try:
-            feat_matches = set().union(
-                            *Features.chrom_vectors[al['Chrom']]['.']  # GenomicArrayOfSets -> ChromVector
-                                     .array[al['Start']:al['End']]     # ChromVector -> StepVector
-                                     .get_steps(values_only=True))     # StepVector -> {features}
-        except KeyError as ke:
-            self.stats.chrom_misses[ke.args[0]] += 1
-            return {}, 0
-
-        # If features are associated with the alignment interval, perform selection
-        assignment = self.selector.choose(feat_matches, al) if feat_matches else {}
-        return assignment, len(feat_matches)
+        Features(*annotations.get())
+        self.prefs = prefs
 
     def count_reads(self, library: dict):
         """Collects statistics on features assigned to each alignment associated with each read"""
@@ -71,6 +61,22 @@ def count_reads(self, library: dict):
 
         return self.stats
 
+    def assign_features(self, al: dict) -> Tuple[dict, int]:
+        """Determines features associated with the interval then performs rule-based feature selection"""
+
+        try:
+            feat_matches = set().union(
+                            *Features.chrom_vectors[al['Chrom']]['.']  # GenomicArrayOfSets -> ChromVector
+                                     .array[al['Start']:al['End']]     # ChromVector -> StepVector
+                                     .get_steps(values_only=True))     # StepVector -> {features}
+        except KeyError as ke:
+            self.stats.chrom_misses[ke.args[0]] += 1
+            return {}, 0
+
+        # If features are associated with the alignment interval, perform selection
+        assignment = self.selector.choose(feat_matches, al) if feat_matches else {}
+        return assignment, len(feat_matches)
+
 
 class FeatureSelector:
     """Performs hierarchical selection given a set of candidate features for a locus

From f59ef1fc2d56f8602d0e55887650169ef2f2b3ec Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Fri, 3 Feb 2023 16:44:04 -0800
Subject: [PATCH 08/21] SAM file headers need some basic validation now that
 we're relying on them for non-genomic counting. The validating regex pattern
 was copied from the Aug. '22 version of the SAM v1 specification

---
 tiny/rna/counter/hts_parsing.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py
index c89250ae..c9343c32 100644
--- a/tiny/rna/counter/hts_parsing.py
+++ b/tiny/rna/counter/hts_parsing.py
@@ -21,7 +21,8 @@
 AlignmentDict = Dict[str, Union[str, int, bytes]]
 Bundle = Tuple[List[AlignmentDict], int]
 _re_tiny = r"\d+_count=\d+"
-_re_fastx = r'seq\d+_x(\d+)'
+_re_fastx = r"seq\d+_x(\d+)"
+_re_header = re.compile(r"^@(HD|SQ|RG|PG)(\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$|^@CO\t.*")
 
 # For Alignment
 complement = {ord('A'): 'T', ord('T'): 'A', ord('G'): 'C', ord('C'): 'G'}
@@ -136,6 +137,7 @@ def _read_to_first_aln(self, file_obj: IO) -> Tuple[bytes, int]:
     def _parse_header_line(self, header_line: str) -> None:
         """Parses and saves information from the provided header line"""
 
+        self.validate_header_line(header_line)
         self._header_lines.append(header_line)
 
         # Header dict
@@ -149,6 +151,13 @@ def _parse_header_line(self, header_line: str) -> None:
                 {field[:2]: field[3:].strip() for field in fields}
             )
 
+    def validate_header_line(self, line):
+        if not re.match(_re_header, line):
+            msg = "Invalid SAM header"
+            msg += f" in {os.path.basename(self.file)}" if self.file else ""
+            msg += ':\n\t' + f'"{line}"'
+            raise ValueError(msg)
+
     def _determine_collapser_type(self, first_aln_line: str) -> None:
         """Attempts to determine the collapsing utility that was used before producing the
         input alignment file, then checks basic requirements for the utility's outputs."""

From 15e27952c13757236ed27333b0e827c82bfe8483 Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Fri, 3 Feb 2023 16:52:30 -0800
Subject: [PATCH 09/21] Restructuring AnnotationParsing classes so that
 self.selector is assigned during calls to .get(). This would allow us to
 create groups of GFF files, each with a distinct Stage 1 ruleset, that would
 be pooled in the same tables.

Also changing the approach with validation/parsing of SAM headers for non-genomic counting. Since @SQ headers could be very abundant, I'm reusing the parsing results from validation rather than reparsing. NonGenomicAnnotations therefore is now constructed with a dictionary of {sequence_id: sequence_length} which is produced during successful validation.
---
 tiny/rna/counter/hts_parsing.py | 63 +++++++++++----------------------
 tiny/rna/counter/validation.py  |  2 +-
 2 files changed, 22 insertions(+), 43 deletions(-)

diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py
index c9343c32..3c5b2ea4 100644
--- a/tiny/rna/counter/hts_parsing.py
+++ b/tiny/rna/counter/hts_parsing.py
@@ -420,13 +420,18 @@ def not_implemented(self):
 
 
 class AnnotationParsing(ABC):
-    def __init__(self, feature_selector, **prefs):
+    def __init__(self, **prefs):
         self.stepvector = prefs.get('stepvector', 'HTSeq')
-        self.selector = feature_selector
         self.feats = self._init_genomic_array()
 
+        # The selector is assigned whenever get() is called.
+        # While it isn't the current use case, this would allow
+        # for groups of GFF files to be processed with different
+        # Stage 1 selection rules and pooled into the same tables
+        self.selector = None
+
     @abstractmethod
-    def get(self): pass
+    def get(self, feature_selector): pass
 
     def _init_genomic_array(self):
         if self.stepvector == 'Cython':
@@ -471,6 +476,7 @@ def map_strand(htseq_value: str):
             '-': False,
         }.get(htseq_value, None)
 
+
 def parse_gff(file, row_fn: Callable, alias_keys=None):
     if alias_keys is not None:
         row_fn = functools.partial(row_fn, alias_keys=alias_keys)
@@ -522,8 +528,8 @@ class ReferenceTables(AnnotationParsing):
     source_filter = []
     type_filter = []
 
-    def __init__(self, gff_files: Dict[str, list], feature_selector, **prefs):
-        super().__init__(feature_selector, **prefs)
+    def __init__(self, gff_files: Dict[str, list], **prefs):
+        super().__init__(**prefs)
         self.all_features = prefs.get('all_features', False)
         self.gff_files = gff_files
         # ----------------------------------------------- Primary Key:
@@ -538,9 +544,10 @@ def __init__(self, gff_files: Dict[str, list], feature_selector, **prefs):
         setattr(HTSeq.features.GFF_Reader, 'parse_GFF_attribute_string', staticmethod(parse_GFF_attribute_string))
 
     @report_execution_time("GFF parsing")
-    def get(self) -> Tuple[GenomicArray, AliasTable, TagTable]:
+    def get(self, feature_selector) -> Tuple[GenomicArray, AliasTable, TagTable]:
         """Initiates GFF parsing and returns complete feature, alias, and tag tables"""
 
+        self.selector = feature_selector
         for file, alias_keys in self.gff_files.items():
             parse_gff(file, self.parse_row, alias_keys=alias_keys)
 
@@ -756,52 +763,24 @@ def column_filter_match(row, rule):
 
 class NonGenomicAnnotations(AnnotationParsing):
 
-    def __init__(self, sam_files, feature_selector, **prefs):
-        super().__init__(feature_selector, **prefs)
-        self.sam_files = sam_files
+    def __init__(self, reference_seqs, **prefs):
+        super().__init__(**prefs)
+        self.seqs = reference_seqs
         self.alias = {}
         self.tags = {}
 
     @report_execution_time("Non-genomic annotations parsing")
-    def get(self):
-        ref_seqs = self.get_reference_seq_definitions()
-        unbuilt_match_tuples = self.get_matches()
+    def get(self, selector):
+        self.selector = selector
+        match_tuples = self.get_matches()
 
-        for seq_id, seq_len in ref_seqs.items():
-            self.add_reference_seq(seq_id, seq_len, unbuilt_match_tuples)
+        for seq_id, seq_len in self.seqs.items():
+            self.add_reference_seq(seq_id, seq_len, match_tuples)
 
         # Aliases are irrelevant for non-GFF references
         aliases = {seq_id: () for seq_id in self.tags}
         return self.feats, aliases, self.tags
 
-    def get_reference_seq_definitions(self) -> Dict[str,int]:
-        reader = SAM_reader()
-        for sam in self.sam_files:
-            with open(sam['File'], 'rb') as f:
-                reader._read_to_first_aln(f)
-
-        ref_seqs = defaultdict(set)
-        for sq in reader._header_dict['@SQ']:
-            seq_id = sq['SN']
-            seq_len = int(sq['LN'])
-            ref_seqs[seq_id].add(seq_len)
-
-        return self.get_seq_len_consensus(ref_seqs)
-
-    @staticmethod
-    def get_seq_len_consensus(ref_seqs):
-        consensus = {}
-        for seq_id, lengths in ref_seqs.items():
-            if len(lengths) > 1:
-                msg = f"Reference sequence identifier {seq_id} " \
-                      "was given multiple lengths in your SAM files. " \
-                      "This might indicate that your files were produced " \
-                      "by alignments that used different indexes."
-                raise ValueError(msg)
-            consensus[seq_id] = lengths.pop()
-
-        return consensus
-
     def get_matches(self):
         """Stage 1 selection is skipped in non-genomic counting.
         Simply build match_tuples for all rules. These will be used
diff --git a/tiny/rna/counter/validation.py b/tiny/rna/counter/validation.py
index ae8c757c..e2825920 100644
--- a/tiny/rna/counter/validation.py
+++ b/tiny/rna/counter/validation.py
@@ -87,7 +87,7 @@ class GFFValidator:
     }
 
     def __init__(self, gff_files, rules, ebwt=None, genomes=None, alignments=None):
-        self.ReferenceTables = ReferenceTables(gff_files, None)
+        self.ReferenceTables = ReferenceTables(gff_files)
         self.column_filters = self.build_column_filters(rules)
         self.report = ReportFormatter(self.targets)
         self.chrom_set = set()

From a44743f76201670954503c5632cbcd240a888728 Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Fri, 3 Feb 2023 17:00:36 -0800
Subject: [PATCH 10/21] Adding SamSqValidator which follows the same design
 patterns as GFFValidator. @SQ headers are first validated for syntax (error
 on missing or incomplete @SQ header). Next they are checked for duplicate
 entries under the same identifier in each file, and for inconsistent length
 definitions between files. This is intended to catch issues that might arise
 from standalone use of tiny-count with third party SAM files, which may have
 been produced by more than one alignment event using different indexes

---
 tiny/rna/counter/validation.py | 125 ++++++++++++++++++++++++++++++++-
 1 file changed, 122 insertions(+), 3 deletions(-)

diff --git a/tiny/rna/counter/validation.py b/tiny/rna/counter/validation.py
index e2825920..bf83f40a 100644
--- a/tiny/rna/counter/validation.py
+++ b/tiny/rna/counter/validation.py
@@ -6,9 +6,9 @@
 from collections import Counter, defaultdict
 from typing import List, Dict
 
-from tiny.rna.counter.hts_parsing import parse_gff, ReferenceTables
+from tiny.rna.counter.hts_parsing import parse_gff, ReferenceTables, SAM_reader
 from tiny.rna.counter.features import FeatureSelector
-from tiny.rna.util import sorted_natural, gzip_open
+from tiny.rna.util import sorted_natural, gzip_open, report_execution_time
 
 
 class ReportFormatter:
@@ -286,4 +286,123 @@ def build_column_filters(rules):
             for selector in ["Filter_s", "Filter_t"]
         }
 
-        return FeatureSelector.build_selectors([selector_defs])[0]
\ No newline at end of file
+        return FeatureSelector.build_selectors([selector_defs])[0]
+
+
+class SamSqValidator:
+    """Validates @SQ headers for tiny-count's non-genomic counting mode"""
+
+    targets = {
+        "inter sq": "Sequence identifiers with inconsistent lengths",
+        "intra sq": "SAM files with repeated sequence identifiers",
+        "incomplete sq": "SAM files with incomplete @SQ headers",
+        "missing sq": "SAM files that lack @SQ headers"
+    }
+
+    def __init__(self, sam_files):
+        self.report = ReportFormatter(self.targets)
+        self.sam_files = sam_files
+        self.reference_seqs = {}
+        self.sq_headers = {}
+
+    @report_execution_time("Non-genomic annotations validation")
+    def validate(self):
+        print("Validating sequence identifiers in SAM files...")
+        self.read_sq_headers()
+        self.validate_sq_headers()
+        self.report.print_report()
+        if self.report.errors:
+            sys.exit(1)
+
+    def validate_sq_headers(self):
+        """Performs validation tests in the required order
+        It is important to check for and return upon syntax error, otherwise
+        the remaining tests are likely to raise an exception before the report
+        has a chance to be printed"""
+
+        # First verify @SQ header syntax
+        missing = self.get_missing_headers()
+        incomplete = self.get_incomplete_sq_headers()
+        if missing or incomplete:
+            self.generate_header_syntax_report(missing, incomplete)
+            return
+
+        # Next verify identifier definitions
+        duplicate = self.get_duplicate_identifiers()
+        ambiguous = self.get_ambiguous_lengths()
+        if duplicate or ambiguous:
+            self.generate_identifier_report(duplicate, ambiguous)
+
+    def get_ambiguous_lengths(self) -> List[str]:
+        """Returns a list of sequence IDs that have inconsistent length definitions.
+        Sequences with consistent length definitions are added to self.reference_seqs"""
+
+        seq_lengths = defaultdict(set)
+        for sam in self.sam_files:
+            for sq in self.sq_headers[sam]:
+                seq_id = sq['SN']
+                seq_len = int(sq['LN'])
+                seq_lengths[seq_id].add(seq_len)
+
+        bad_seqs = []
+        for seq_id, lengths in seq_lengths.items():
+            if len(lengths) == 1:
+                lengths = lengths.pop()
+                self.reference_seqs[seq_id] = lengths
+            else:
+                bad_seqs.append(seq_id)
+
+        return bad_seqs
+
+    def get_duplicate_identifiers(self) -> Dict[str, List[str]]:
+        """Returns a dictionary of SAM files that contain duplicate sequence identifiers"""
+
+        bad_files = {}
+        for file in self.sam_files:
+            sq = self.sq_headers[file]
+            id_count = Counter(seq['SN'] for seq in sq)
+            duplicates = [seq_id for seq_id, count in id_count.items() if count > 1]
+            if duplicates: bad_files[file] = duplicates
+
+        return bad_files
+
+    def get_incomplete_sq_headers(self) -> List[str]:
+        """Returns a list of SAM files that have incomplete @SQ headers"""
+
+        return [file for file, sqs in self.sq_headers.items()
+                if not all("SN" in sq and "LN" in sq for sq in sqs)]
+
+    def get_missing_headers(self) -> List[str]:
+        """Returns a list of SAM files that lack @SQ headers"""
+
+        return [file for file, sqs in self.sq_headers.items()
+                if len(sqs) == 0]
+
+    def generate_header_syntax_report(self, missing, incomplete):
+        report = {}
+        if missing:
+            report['missing sq'] = sorted_natural(missing)
+        if incomplete:
+            report['incomplete sq'] = sorted_natural(incomplete)
+
+        header = "Every SAM file must have complete @SQ headers with SN and LN\n" \
+                 "fields when counting in non-genomic mode.\n"
+        self.report.add_error_section(header, report)
+
+    def generate_identifier_report(self, duplicate, ambiguous):
+        report = {}
+        if duplicate:
+            report['intra sq'] = sorted_natural(duplicate)
+        if ambiguous:
+            report['inter sq'] = sorted_natural(ambiguous)
+
+        header = "Sequence identifiers must be unique and have consistent length definitions.\n"
+        self.report.add_error_section(header, report)
+
+    def read_sq_headers(self):
+        for file in self.sam_files:
+            with open(file, 'rb') as f:
+                reader = SAM_reader()
+                reader._read_to_first_aln(f)
+
+            self.sq_headers[file] = reader._header_dict.get('@SQ', [])
\ No newline at end of file

From cbf0ab3fed2457c39928ec8b781aa70ee90dc383 Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Fri, 3 Feb 2023 17:09:35 -0800
Subject: [PATCH 11/21] Replacing load_gff_files() with load_references(). This
 is now where the reference parsing object is constructed, and also where
 references are validated. It just made sense for these to be among the
 initial operations when running tiny-count. The routine for reorganizing the
 YAML representation of gff_files is now part of the PathsFile class. I'm
 happy with how this has cleaned up the code.

---
 tiny/rna/configuration.py   | 71 +++++++++++++++++++++++++++++--------
 tiny/rna/counter/counter.py | 51 ++++++++++++--------------
 2 files changed, 79 insertions(+), 43 deletions(-)

diff --git a/tiny/rna/configuration.py b/tiny/rna/configuration.py
index 88f00a85..61cf919a 100644
--- a/tiny/rna/configuration.py
+++ b/tiny/rna/configuration.py
@@ -136,6 +136,22 @@ def from_here(self, destination: Union[str,dict,None], origin: Union[str,dict,No
         else:
             return destination
 
+    @staticmethod
+    def is_path_dict(val, empty_ok=False):
+        return (isinstance(val, dict) and
+                isinstance(val.get("path"), (str, bytes)) and
+                (len(val['path']) or empty_ok))
+
+    @staticmethod
+    def is_path_str(val, empty_ok=False):
+        return (isinstance(val, (str, bytes)) and
+                (len(val) or empty_ok))
+
+    @classmethod
+    def is_path(cls, val, empty_ok=False):
+        return (cls.is_path_dict(val, empty_ok) or
+                cls.is_path_str(val, empty_ok))
+
     def setup_step_inputs(self):
         """For now, only tiny-plot requires additional setup for step inputs
         This function is called at both startup and resume"""
@@ -340,17 +356,15 @@ def get_bt_index_files(self):
     def validate_inputs(self):
         """For now, only GFF files are validated here"""
 
-        selection_rules = self.process_features_sheet()
-        gff_files = {gff['path']: [] for gff in self['gff_files']}
-        ebwt = self.paths['ebwt'] if not self['run_bowtie_build'] else None
+        gff_files = self.paths.get_gff_config()
 
-        GFFValidator(
-            gff_files,
-            selection_rules,
-            ebwt=ebwt,
-            genomes=self.paths['reference_genome_files'],
-            alignments=None  # Used in tiny-count standalone runs
-        ).validate()
+        if gff_files:
+            GFFValidator(
+                gff_files,
+                self.process_features_sheet(),
+                self.paths['ebwt'] if not self['run_bowtie_build'] else None,
+                self.paths['reference_genome_files']
+            ).validate()
 
     def execute_post_run_tasks(self, return_code):
         if self['run_bowtie_build']:
@@ -469,9 +483,9 @@ def from_pipeline(value):
         """When tiny-count runs as a pipeline step, all file inputs are
         sourced from the working directory regardless of original path."""
 
-        if isinstance(value, dict) and value.get("path") is not None:
+        if ConfigBase.is_path_dict(value, empty_ok=True):
             return dict(value, path=os.path.basename(value['path']))
-        elif isinstance(value, (str, bytes)):
+        elif ConfigBase.is_path_str(value, empty_ok=True):
             return os.path.basename(value)
         else:
             return value
@@ -497,7 +511,7 @@ def as_cwl_file_obj(self, key: str):
         elif key in self.single:
             return self.cwl_file(val)
         elif key in self.groups:
-            return [self.cwl_file(sub) for sub in val if sub]
+            return [self.cwl_file(sub) for sub in val if self.is_path(sub)]
         elif key in self.prefix:
             raise ValueError(f"The parameter {key} isn't meant to be a CWL file object.")
         else:
@@ -526,7 +540,7 @@ def validate_paths(self):
 
         for key in self.groups:
             for entry in self[key]:
-                if isinstance(entry, dict): entry = entry['path']
+                if isinstance(entry, dict): entry = entry.get('path')
                 if not entry: continue
                 assert os.path.isfile(entry), \
                     "The following file provided under {key} in {selfname} could not be found:\n\t{file}" \
@@ -538,6 +552,35 @@ def check_backward_compatibility(self):
             "that you are using a Paths File from an earlier version of tinyRNA. Please " \
             "check the release notes and update your configuration files."
 
+    def get_gff_config(self) -> Dict[str, list]:
+        """Restructures GFF input info so that it can be more easily handled.
+        To be clear, the Paths File YAML could be structured to match the desired output,
+        but the current format was chosen because it's more readable with more forgiving syntax.
+
+        The YAML format is [{"path": "gff_path_1", "alias": [alias1, alias2, ...]}, { ... }, ...]
+        The output format is {"gff_path_1": [alias1, alias2, ...], ...}
+        """
+
+        gff_files = defaultdict(list)
+        id_filter = lambda alias: alias.lower() != 'id'
+
+        # Build dictionary of files and allowed aliases
+        for gff in self['gff_files']:
+            if not self.is_path_dict(gff): continue
+            path, aliases = gff['path'], gff.get('alias', ())
+            gff_files[path].extend(filter(id_filter, aliases))
+
+        # Remove duplicate aliases per file, keep order
+        for file, alias in gff_files.items():
+            gff_files[file] = sorted(set(alias), key=alias.index)
+
+        if not len(gff_files) and not self.in_pipeline:
+            print("No GFF files were specified in {selfname} so "
+                  "reads will be counted in non-genomic mode."
+                  .format(selfname=self.basename))
+
+        return dict(gff_files)
+
     # Override
     def append_to(self, key: str, val: Any):
         """Overrides method in the base class. This is necessary due to automatic
diff --git a/tiny/rna/counter/counter.py b/tiny/rna/counter/counter.py
index 1d4cbe41..6ee45ecc 100644
--- a/tiny/rna/counter/counter.py
+++ b/tiny/rna/counter/counter.py
@@ -3,17 +3,15 @@
 import multiprocessing as mp
 import traceback
 import argparse
-import shutil
 import sys
 import os
 
-from collections import defaultdict
-from typing import Tuple, List, Dict
-from pkg_resources import resource_filename
+from typing import List, Dict
 
-from tiny.rna.counter.validation import GFFValidator
+from tiny.rna.counter.validation import GFFValidator, SamSqValidator
 from tiny.rna.counter.features import Features, FeatureCounter
 from tiny.rna.counter.statistics import MergedStatsManager
+from tiny.rna.counter.hts_parsing import ReferenceTables, NonGenomicAnnotations, AnnotationParsing
 from tiny.rna.util import report_execution_time, from_here, ReadOnlyDict, get_timestamp, add_transparent_help
 from tiny.rna.configuration import CSVReader, PathsFile, get_templates
 
@@ -160,40 +158,35 @@ def load_config(features_csv: str, is_pipeline: bool) -> List[dict]:
     return rules
 
 
-def load_gff_files(paths: PathsFile, libraries: List[dict], rules: List[dict]) -> Dict[str, list]:
-    """Retrieves the appropriate file path and alias attributes for each GFF,
-    then validates
+def load_references(paths: PathsFile, libraries: List[dict], rules: List[dict], prefs) -> AnnotationParsing:
+    """Determines the reference source (GFF or SAM @SQ headers) and constructs the appropriate object
 
     Args:
-        paths: a loaded PathsFile with a populated gff_files parameter
+        paths: a PathsFile object that optionally contains a `gff_files` configuration
         libraries: libraries parsed from Samples Sheet, each as a dict with a 'File' key
         rules: selection rules parsed from Features Sheet
+        prefs: command line arguments to pass to the AnnotationParsing subclass
 
     Returns:
-        gff: a dict of GFF files with lists of alias attribute keys
+        references: an AnnotationParsing object, subclassed based on
+            the presence of GFF files in the Paths File
     """
 
-    gff_files = defaultdict(list)
-    ignore_alias = ["id"]
-
-    # Build dictionary of unique files and allowed aliases
-    for gff in paths['gff_files']:
-        if gff['path'] is not None:
-            gff_files[gff['path']].extend(
-                alias for alias in gff.get('alias', ())
-                if alias.lower() not in ignore_alias
-            )
-
-    # Remove duplicate aliases (per file), keep original order
-    for file, alias in gff_files.items():
-        gff_files[file] = sorted(set(alias), key=alias.index)
+    gff_files = paths.get_gff_config()
+    sam_files = [lib['File'] for lib in libraries]
 
     if gff_files:
-        # Prepare supporting file inputs for GFF validation
-        sam_files = [lib['File'] for lib in libraries]
         GFFValidator(gff_files, rules, alignments=sam_files).validate()
+        references = ReferenceTables(gff_files, **prefs)
+    else:
+        sq_validator = SamSqValidator(sam_files)
+
+        # Reuse sq_validator's parsing results to save time
+        sq_validator.validate()
+        sequences = sq_validator.reference_seqs
+        references = NonGenomicAnnotations(sequences, **prefs)
 
-    return gff_files
+    return references
 
 
 @report_execution_time("Counting and merging")
@@ -229,11 +222,11 @@ def main():
         paths = PathsFile(args['paths_file'], is_pipeline)
         libraries = load_samples(paths['samples_csv'], is_pipeline)
         selection = load_config(paths['features_csv'], is_pipeline)
-        gff_files = load_gff_files(paths, libraries, selection)
+        reference = load_references(paths, libraries, selection, args)
 
         # global for multiprocessing
         global counter
-        counter = FeatureCounter(gff_files, libraries, selection, **args)
+        counter = FeatureCounter(reference, selection, **args)
 
         # Assign and count features using multiprocessing and merge results
         merged_counts = map_and_reduce(libraries, paths, args)

From e8f417355207c795c887b6645beccde866fbfd23 Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Fri, 3 Feb 2023 17:16:44 -0800
Subject: [PATCH 12/21] Updating the FeatureCounter constructor to use the new
 AnnotationParsing approach

---
 tiny/rna/counter/features.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/tiny/rna/counter/features.py b/tiny/rna/counter/features.py
index 73cd540f..d85d02a3 100644
--- a/tiny/rna/counter/features.py
+++ b/tiny/rna/counter/features.py
@@ -4,7 +4,7 @@
 from collections import defaultdict
 from typing import List, Tuple, Set, Dict, Mapping
 
-from tiny.rna.counter.hts_parsing import ReferenceTables, NonGenomicAnnotations, SAM_reader
+from tiny.rna.counter.hts_parsing import SAM_reader, ReferenceTables, NonGenomicAnnotations
 from .statistics import LibraryStats
 from .matching import *
 
@@ -27,19 +27,19 @@ def __init__(_, features: HTSeq.GenomicArrayOfSets, aliases: dict, classes: dict
 
 class FeatureCounter:
 
-    def __init__(self, gff_file_set, sam_file_set, selection_rules, **prefs):
+    def __init__(self, references, selection_rules, **prefs):
         self.stats = LibraryStats(**prefs)
         self.sam_reader = SAM_reader(**prefs)
         self.selector = FeatureSelector(selection_rules, self.stats, **prefs)
 
-        if gff_file_set:
+        if isinstance(references, ReferenceTables):
             self.mode = "genomic"
-            annotations = ReferenceTables(gff_file_set, self.selector, **prefs)
-        else:
+        elif isinstance(references, NonGenomicAnnotations):
             self.mode = "non-genomic"
-            annotations = NonGenomicAnnotations(sam_file_set, self.selector, **prefs)
+        else:
+            raise TypeError("Expected ReferenceTables or NonGenomicAnnotations, got %s" % type(references))
 
-        Features(*annotations.get())
+        Features(*references.get(self.selector))
         self.prefs = prefs
 
     def count_reads(self, library: dict):

From 8e15f705db0f44deb9c655ab8907b91711e63b18 Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Fri, 3 Feb 2023 17:30:09 -0800
Subject: [PATCH 13/21] Misc. cleanup in configuration.py and resume.py: -
 Moving paths_config/paths_file bookkeeping out of ConfigBase. This is used by
 Configuration and Resume* classes, but not by PathsFile which also subclasses
 it. It is also better documented and consolidated in one function to make it
 clearer - Streamlining path object type checking with is_path_dict(),
 is_path_str(), and is_path() - The notice about "no GFF files provided" has
 been moved to get_gff_config() - Comment improvements

---
 tiny/rna/configuration.py | 54 +++++++++++++++++++--------------------
 tiny/rna/resume.py        |  8 +++---
 2 files changed, 32 insertions(+), 30 deletions(-)

diff --git a/tiny/rna/configuration.py b/tiny/rna/configuration.py
index 61cf919a..08eac45c 100644
--- a/tiny/rna/configuration.py
+++ b/tiny/rna/configuration.py
@@ -5,11 +5,10 @@
 import sys
 import csv
 import os
-import re
 
 from pkg_resources import resource_filename
 from collections import Counter, OrderedDict, defaultdict
-from typing import Union, Any, Optional, List
+from typing import Union, Any, List, Dict
 from glob import glob
 
 from tiny.rna.counter.validation import GFFValidator
@@ -57,7 +56,10 @@ def set(self, key: str, val: Union[str, list, dict, bool]) -> Union[str, list, d
         return val
 
     def set_if_not(self, key: str, val: Union[str, list, dict, bool]) -> Any:
-        """Apply the setting if it has not been previously set"""
+        """Apply the setting if it has not been previously set. This differs from
+        dict.setdefault() in that existing keys are overwritten if the associated
+        value is false in boolean context (e.g. None, False, empty container, etc.)"""
+
         if not self[key]:
             self[key] = val
             return val
@@ -125,13 +127,9 @@ def from_here(self, destination: Union[str,dict,None], origin: Union[str,dict,No
         if isinstance(origin, dict): origin = origin.get('path')
         if origin is None: origin = self.dir
 
-        if (
-                isinstance(destination, dict) and
-                isinstance(destination.get("path"), (str, bytes)) and
-                len(destination['path'])
-        ):
+        if self.is_path_dict(destination):
             return dict(destination, path=self.joinpath(origin, destination["path"]))
-        elif isinstance(destination, (str, bytes)) and bool(destination):
+        elif self.is_path_str(destination):
             return self.joinpath(origin, destination)
         else:
             return destination
@@ -193,11 +191,6 @@ def write_processed_config(self, filename: str = None) -> str:
         if filename is None: filename = self.get_outfile_path(self.inf)
 
         with open(filename, 'w') as outconf:
-            if 'paths_config' in self and not os.path.isabs(self['paths_config']):
-                # Processed config will be written to the Run Directory
-                # Ensure paths_config is an absolute path so it remains valid
-                self['paths_config'] = self.from_here(self['paths_config'])
-
             self.yaml.dump(self.config, outconf)
 
         return filename
@@ -224,7 +217,7 @@ def __init__(self, config_file: str, validate_inputs=False):
         super().__init__(config_file)
 
         self.paths = self.load_paths_config()
-        self.assimilate_paths_file()
+        self.absorb_paths_file()
 
         self.setup_pipeline()
         self.setup_file_groups()
@@ -235,18 +228,24 @@ def __init__(self, config_file: str, validate_inputs=False):
         if validate_inputs: self.validate_inputs()
 
     def load_paths_config(self):
-        """Constructs a sub-configuration object containing workflow file preferences
-        self['paths_config'] is the user-facing file path (just the path string)
-        self['paths_file'] is a CWL file object used as a workflow input."""
-        paths_file_path = self.from_here(self['paths_config'])
-        return PathsFile(paths_file_path)
+        """Returns a PathsFile object and updates keys related to the Paths File path"""
+
+        # paths_config: user-specified
+        #   Resolve the absolute path so that it remains valid when
+        #   the processed Run Config is copied to the Run Directory
+        self['paths_config'] = self.from_here(self['paths_config'])
+
+        # paths_file: automatically generated
+        #   CWL file dictionary is used as a workflow input
+        self['paths_file'] = self.cwl_file(self['paths_config'])
+
+        return PathsFile(self['paths_config'])
 
-    def assimilate_paths_file(self):
+    def absorb_paths_file(self):
         for key in [*PathsFile.single, *PathsFile.groups]:
             self[key] = self.paths.as_cwl_file_obj(key)
         for key in PathsFile.prefix:
             self[key] = self.paths[key]
-        self['paths_file'] = self.cwl_file(self.paths.inf)
 
     def process_samples_sheet(self):
         samples_sheet_path = self.paths['samples_csv']
@@ -460,6 +459,12 @@ class PathsFile(ConfigBase):
         - Lookups that return list values do not return the original object; don't
           append to them. Instead, use the append_to() helper function.
         - Chained assignments can produce unexpected results.
+
+    Args:
+        file: The path to the Paths File, as a string
+        in_pipeline: True only when utilized by a step in the workflow,
+            in which case input files are sourced from the working directory
+            regardless of the path indicated in the Paths File
     """
 
     # Parameter types
@@ -522,11 +527,6 @@ def validate_paths(self):
             "The following parameters are required in {selfname}: {params}" \
             .format(selfname=self.basename, params=', '.join(self.required))
 
-        if not any(gff.get('path') for gff in self['gff_files']):
-            print("No GFF files were specified in {selfname} so "
-                  "reads will be counted in non-genomic mode."
-                  .format(selfname=self.basename))
-
         # Some entries in Paths File are omitted from tiny-count's working directory during
         #  pipeline runs. There is no advantage to checking file existence here vs. in load_*
         if self.in_pipeline: return
diff --git a/tiny/rna/resume.py b/tiny/rna/resume.py
index e3cd7840..8adcf083 100644
--- a/tiny/rna/resume.py
+++ b/tiny/rna/resume.py
@@ -91,9 +91,11 @@ def _create_truncated_workflow(self):
             wf_steps[self.steps[0]]['in'][param] = new_input['var']
 
     def load_paths_config(self):
-        """Constructs a sub-configuration object containing workflow file preferences"""
-        paths_file_path = self.from_here(self['paths_config'])
-        return PathsFile(paths_file_path)
+        """Returns a PathsFile object and updates keys related to the Paths File path"""
+
+        self['paths_config'] = self.from_here(self['paths_config'])
+        self['paths_file'] = self.cwl_file(self['paths_config'])
+        return PathsFile(self['paths_config'])
 
     def assimilate_paths_file(self):
         """Updates the processed workflow with resume-safe Paths File parameters"""

From 70783b2b34b0e28b339c1dec242201e2ddf0345b Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Fri, 3 Feb 2023 17:30:54 -0800
Subject: [PATCH 14/21] Misc. cleanup in validation.py: empty results are more
 clearly indicated when printing reports

---
 tiny/rna/counter/validation.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/tiny/rna/counter/validation.py b/tiny/rna/counter/validation.py
index bf83f40a..714d6f99 100644
--- a/tiny/rna/counter/validation.py
+++ b/tiny/rna/counter/validation.py
@@ -61,14 +61,19 @@ def recursive_indent(self, mapping: dict, indent: str):
 
         lines = []
         for key, val in mapping.items():
-            if not val: continue
             key_header = f"{indent}{self.key_mapper.get(key, key)}: "
             if isinstance(val, dict):
                 lines.append(key_header)
-                lines.extend(self.recursive_indent(val, indent + '\t'))
+                if len(val):
+                    lines.extend(self.recursive_indent(val, indent + '\t'))
+                else:
+                    lines.append(indent + "\t" + "{ NONE }")
             elif isinstance(val, (list, set)):
                 lines.append(key_header)
-                lines.extend([indent + '\t' + line for line in map(str, val)])
+                if len(val):
+                    lines.extend([indent + '\t' + line for line in map(str, val)])
+                else:
+                    lines.append(indent + "\t" + "[ NONE ]")
             else:
                 lines.append(key_header + str(val))
         return lines

From ddf3bfeba096c3ad69e5fe9ec07c7b1ef7bba42e Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Fri, 3 Feb 2023 17:33:57 -0800
Subject: [PATCH 15/21] Updating unit tests

---
 tests/unit_tests_configuration.py | 58 ++++++++++++++++++++++++++-----
 tests/unit_tests_counter.py       | 34 ------------------
 tests/unit_tests_hts_parsing.py   | 46 ++++++++++++------------
 3 files changed, 73 insertions(+), 65 deletions(-)

diff --git a/tests/unit_tests_configuration.py b/tests/unit_tests_configuration.py
index 5eed377e..70c6fd37 100644
--- a/tests/unit_tests_configuration.py
+++ b/tests/unit_tests_configuration.py
@@ -286,18 +286,29 @@ def test_validate_required_parameters(self):
 
             config[key] = oldval
 
-    """Does PathsFile check that at least one GFF file has been provided?"""
+    """Does PathsFile notify the user of the change in counting style 
+    when no GFF files have been provided?"""
 
-    def test_validate_gff_files(self):
+    def test_no_gff_files(self):
         config = make_paths_file()
+        stdout = io.StringIO()
 
-        with self.assertRaisesRegex(AssertionError, r".*(At least one GFF).*"):
-            config['gff_files'] = [{'path': "", 'alias': []}]
-            config.validate_paths()
+        gff_configs = [
+            [{'path': "", 'alias': []}],
+            [{'irrelevant': "value"}],
+            None,
+            []
+        ]
 
-        with self.assertRaisesRegex(AssertionError, r".*(At least one GFF).*"):
-            config['gff_files'] = [{'irrelevant': "value"}]
-            config.validate_paths()
+        with contextlib.redirect_stdout(stdout):
+            for gff_config in gff_configs:
+                config['gff_files'] = gff_config
+                config.validate_paths()
+                config_return = config.get_gff_config()
+
+                self.assertRegex(stdout.getvalue(), r".*(No GFF files).*")
+                self.assertEqual(config_return, {})
+                stdout.truncate(0)
 
     """Does PathsFile check for missing files for single entry parameters?"""
 
@@ -350,6 +361,37 @@ def test_pipeline_mapping(self):
         self.assertEqual(config['empty_path'], "")
         self.assertEqual(config['none_path'], None)
 
+    """Does get_gff_config properly screen for "ID" alias attributes?"""
+
+    def test_get_gff_config_id_alias_attr(self):
+        config = make_paths_file()
+        config['gff_files'] = [{'path': "/irrelevant", 'alias': ['ID']}]
+
+        gff_files = config.get_gff_config()
+        expected = {"/irrelevant": []}
+
+        self.assertDictEqual(gff_files, expected)
+
+    """Does get_gff_config merge aliases if the same GFF file is listed more than once? 
+    Are duplicates also removed and original order preserved?"""
+
+    def test_get_gff_config_merge_alias_attr(self):
+        config = make_paths_file()
+        config['gff_files'] = [
+            {'path': "/same/path", 'alias': ['attr1', 'attr2']},
+            {'path': "/same/path", 'alias': ['attr1', 'attrN']},
+            {'path': "/different/path", 'alias': ['attr3']}
+        ]
+
+        gff_files = config.get_gff_config()
+
+        expected = {
+            "/same/path": ['attr1', 'attr2', 'attrN'],
+            "/different/path": ['attr3']
+        }
+
+        self.assertDictEqual(gff_files, expected)
+
 
 class ConfigurationTest(unittest.TestCase):
 
diff --git a/tests/unit_tests_counter.py b/tests/unit_tests_counter.py
index 0110ab50..20d9107c 100644
--- a/tests/unit_tests_counter.py
+++ b/tests/unit_tests_counter.py
@@ -231,40 +231,6 @@ def test_load_config_rna_to_cDNA(self):
 
         self.assertEqual(ruleset[0]['nt5end'], 'T')
 
-    """Does load_gff_files properly screen for "ID" alias attributes?"""
-
-    def test_load_gff_files_id_alias_attr(self):
-        config = helpers.make_paths_file()
-        config['gff_files'] = [{'path': "/irrelevant", 'alias': ['ID']}]
-
-        # Patch GFFValidator so that it isn't called (not relevant)
-        with patch('tiny.rna.counter.counter.GFFValidator'):
-            gff_files = counter.load_gff_files(config, [], {})
-
-        expected = {"/irrelevant": []}
-        self.assertDictEqual(gff_files, expected)
-
-    """Does load_gff_files merge aliases if the same GFF file is listed more than once? 
-    Are duplicates also removed and original order preserved?"""
-
-    def test_load_gff_files_merge_alias_attr(self):
-        config = helpers.make_paths_file()
-        config['gff_files'] = [
-            {'path': "/same/path", 'alias': ['attr1', 'attr2']},
-            {'path': "/same/path", 'alias': ['attr1', 'attrN']},
-            {'path': "/different/path", 'alias': ['attr3']}
-        ]
-
-        # Patch GFFValidator so that it isn't called (not relevant)
-        with patch('tiny.rna.counter.counter.GFFValidator'):
-            gff_files = counter.load_gff_files(config, [], {})
-
-        expected = {
-            "/same/path": ['attr1', 'attr2', 'attrN'],
-            "/different/path": ['attr3']
-        }
-
-        self.assertDictEqual(gff_files, expected)
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/unit_tests_hts_parsing.py b/tests/unit_tests_hts_parsing.py
index 9a19d6f8..f5aa9966 100644
--- a/tests/unit_tests_hts_parsing.py
+++ b/tests/unit_tests_hts_parsing.py
@@ -145,7 +145,7 @@ def test_ref_tables_single_feature(self):
         iv = HTSeq.GenomicInterval("I", 3746, 3909, "-")
         kwargs = {'all_features': True}
 
-        feats, alias, tags = ReferenceTables(feature_source, feature_selector, **kwargs).get()
+        feats, alias, tags = ReferenceTables(feature_source, **kwargs).get(feature_selector)
         steps = list(feats[iv].array[iv.start:iv.end].get_steps(values_only=True))
 
         self.assertEqual((type(feats), type(alias), type(tags)), (HTSeq.GenomicArrayOfSets, dict, defaultdict))
@@ -169,7 +169,7 @@ def test_ref_tables_single_feature_all_features_false(self):
         iv = HTSeq.GenomicInterval("I", 3746, 3909, "-")
         kwargs = {'all_features': True}
 
-        feats, alias, tags = ReferenceTables(feature_source, feature_selector, **kwargs).get()
+        feats, alias, tags = ReferenceTables(feature_source, **kwargs).get(feature_selector)
         steps = list(feats[iv].array[iv.start:iv.end].get_steps(values_only=True))
 
         self.assertEqual((type(feats), type(alias), type(tags)), (HTSeq.GenomicArrayOfSets, dict, defaultdict))
@@ -192,7 +192,7 @@ def test_ref_tables_missing_name_attribute_all_features_false(self):
         # ReferenceTables.get() skips the steps for recording the feature's alias.
         # Instead, a different exception is raised due to reference tables being empty
         with self.assertRaisesRegex(ValueError, expected_err):
-            ReferenceTables(feature_source, feature_selector, **kwargs).get()
+            ReferenceTables(feature_source, **kwargs).get(feature_selector)
 
     """Does ReferenceTables.get() raise ValueError when a feature lacks an ID attribute?"""
 
@@ -209,7 +209,7 @@ def test_ref_tables_missing_id_attribute(self):
 
         with patch('tiny.rna.counter.hts_parsing.HTSeq.utils.open', new=mock_reader):
             with self.assertRaisesRegex(ValueError, expected_err):
-                _ = ReferenceTables(feature_source, feature_selector, **kwargs).get()
+                _ = ReferenceTables(feature_source, **kwargs).get(feature_selector)
 
     """Does ReferenceTables.get() properly concatenate aliases if there is more than one alias for a feature?"""
     """Does ReferenceTables.get() properly concatenate aliases when Name Attribute refers to a list-type alias?"""
@@ -221,7 +221,7 @@ def test_ref_tables_alias_multisource_concat(self):
 
         # Notice: screening for "ID" name attribute happens earlier in counter.load_config()
         expected_alias = {"Gene:WBGene00023193": ("additional_class", "Gene:WBGene00023193", "unknown")}
-        _, alias, _ = ReferenceTables(feature_source, MockFeatureSelector([]), **kwargs).get()
+        _, alias, _ = ReferenceTables(feature_source, **kwargs).get(MockFeatureSelector([]))
 
         self.assertDictEqual(alias, expected_alias)
 
@@ -236,7 +236,7 @@ def test_ref_tables_alias_multisource_concat_all_features_false(self):
 
         with self.assertRaisesRegex(ValueError, expected_err):
             # No aliases saved due to all_features=False and the lack of identity matches
-            _, alias, _ = ReferenceTables(feature_source, MockFeatureSelector([]), **kwargs).get()
+            _, alias, _ = ReferenceTables(feature_source, **kwargs).get(MockFeatureSelector([]))
 
     """Does ReferenceTables.get() properly concatenate identity match tuples when multiple GFF files define
     matches for a feature?"""
@@ -262,7 +262,7 @@ def test_ref_tables_identity_matches_multisource_concat(self):
             set()
         ]
 
-        feats, _, _ = ReferenceTables(feature_source, feature_selector, **kwargs).get()
+        feats, _, _ = ReferenceTables(feature_source, **kwargs).get(feature_selector)
 
         actual_matches = list(feats.chrom_vectors['I']['.'].array.get_steps(values_only=True))
         self.assertListEqual(actual_matches, expected_matches)
@@ -274,7 +274,7 @@ def test_ref_tables_discontinuous_aliases(self):
         feature_source = {f"{resources}/discontinuous.gff3": ["Name"]}
         mock_selector = self.selector_with_template(helpers.rules_template)
 
-        _, alias, _ = ReferenceTables(feature_source, mock_selector, **kwargs).get()
+        _, alias, _ = ReferenceTables(feature_source, **kwargs).get(mock_selector)
 
         # Ancestor depth of 1, distinct aliases
         self.assertEqual(alias['Parent2'], ('Child2Name', 'Parent2Name'))
@@ -294,7 +294,7 @@ def test_ref_tables_discontinuous_no_match_all_features_false(self):
                        "This may be due to a lack of features matching 'Select for...with value...'"
 
         with self.assertRaisesRegex(ValueError, expected_err):
-            ReferenceTables(feature_source, mock_selector, **kwargs).get()
+            ReferenceTables(feature_source, **kwargs).get(mock_selector)
 
     """Does ReferenceTables.get() properly build a GenomicArrayOfSets for discontinuous features
         where no features match but all_features is True? If the feature didn't match we only want to
@@ -311,7 +311,7 @@ def test_ref_tables_discontinuous_features(self):
         # EVEN if all_features = True.
         expected = [set()]
 
-        feats, _, _ = ReferenceTables(feature_source, feature_selector, **kwargs).get()
+        feats, _, _ = ReferenceTables(feature_source, **kwargs).get(feature_selector)
         actual = list(feats.chrom_vectors["I"]["."].array.get_steps(values_only=True))
         self.assertListEqual(actual, expected)
 
@@ -331,8 +331,8 @@ def test_ref_tables_discontinuous_intervals(self):
         sib_2 = HTSeq.GenomicInterval('I', 110, 120, '-')
         sib_3 = HTSeq.GenomicInterval('I', 139, 150, '-')
 
-        RT_instance = ReferenceTables(feature_source, feature_selector, **kwargs)
-        _ = RT_instance.get()
+        RT_instance = ReferenceTables(feature_source, **kwargs)
+        _ = RT_instance.get(feature_selector)
 
         # Ancestor depth of 1
         self.assertEqual(RT_instance.intervals['GrandParent'], [grandparent_iv, parent_w_p_iv, child_w_gp_iv])
@@ -381,7 +381,7 @@ def test_ref_tables_discontinuous_identity_matches(self):
                     {(Sibling,     False, (rule3_sib['139:150'], rule2_sib['139:150']))},
                     set()]
 
-        feats, _, _ = ReferenceTables(feature_source, feature_selector, **rt_kwargs).get()
+        feats, _, _ = ReferenceTables(feature_source, **rt_kwargs).get(feature_selector)
         actual_steps = list(feats.chrom_vectors["I"]["."].array.get_steps(values_only=True))
         self.assertListEqual(actual_steps, expected)
 
@@ -401,8 +401,8 @@ def test_ref_tables_source_filter(self):
         exp_filtered =  {"GrandParent", "ParentWithGrandparent", "Parent2", "Child1", "Sibling"}
         exp_parents =   {'ParentWithGrandparent': 'GrandParent', 'Child1': 'ParentWithGrandparent', 'Child2': 'Parent2'}
 
-        rt = ReferenceTables(feature_source, feature_selector, **kwargs)
-        feats, alias, _ = rt.get()
+        rt = ReferenceTables(feature_source, **kwargs)
+        feats, alias, _ = rt.get(feature_selector)
 
         self.assertEqual(alias, exp_alias)
         self.assertEqual(rt.parents, exp_parents)
@@ -426,8 +426,8 @@ def test_ref_tables_type_filter(self):
         exp_filtered =  {"GrandParent", "ParentWithGrandparent", "Parent2", "Child2", "Sibling"}
         exp_parents =   {'ParentWithGrandparent': 'GrandParent', 'Child1': 'ParentWithGrandparent', 'Child2': 'Parent2'}
 
-        rt = ReferenceTables(feature_source, feature_selector, **kwargs)
-        feats, alias, _ = rt.get()
+        rt = ReferenceTables(feature_source, **kwargs)
+        feats, alias, _ = rt.get(feature_selector)
 
         self.assertEqual(alias, exp_alias)
         self.assertEqual(rt.intervals, exp_intervals)
@@ -443,8 +443,8 @@ def test_ref_tables_both_filter(self):
         feature_source = {f"{resources}/discontinuous.gff3": ["Name"]}
         feature_selector = self.selector_with_template([{'Filter_s': "SourceName", 'Filter_t': "gene"}])
 
-        rt = ReferenceTables(feature_source, feature_selector, **kwargs)
-        feats, alias, _ = rt.get()
+        rt = ReferenceTables(feature_source, **kwargs)
+        feats, alias, _ = rt.get(feature_selector)
 
         self.assertEqual(rt.filtered, {'Child1', 'Child2'})
         self.assertEqual(rt.parents, {'ParentWithGrandparent': 'GrandParent', 'Child1': 'ParentWithGrandparent', 'Child2': 'Parent2'})
@@ -473,7 +473,7 @@ def test_ref_tables_tagged_match_single(self):
             set()
         ]
 
-        feats, aliases, tags = ReferenceTables(feature_source, feature_selector, **kwargs).get()
+        feats, aliases, tags = ReferenceTables(feature_source, **kwargs).get(feature_selector)
 
         actual_feats = list(feats.chrom_vectors['I']['.'].array.get_steps(values_only=True))
         self.assertListEqual(actual_feats, expected_feats)
@@ -509,7 +509,7 @@ def test_ref_tables_tagged_match_merging(self):
             set()
         ]
 
-        feats, aliases, tags = ReferenceTables(feature_source, feature_selector).get()
+        feats, aliases, tags = ReferenceTables(feature_source).get(feature_selector)
 
         stepvec = list(feats.chrom_vectors['I']['.'].array.get_steps(values_only=True))
         self.assertListEqual(stepvec, expected_feats)
@@ -762,11 +762,11 @@ def test_gff_megazord(self):
         files = {gff.format(ftype): [] for gff in self.genomes.values() for ftype in ('gff3', 'gtf')}
 
         fs = FeatureSelector(rules, LibraryStats())
-        rt = ReferenceTables(files, fs)
+        rt = ReferenceTables(files)
 
         # The test is passed if this command
         # completes without throwing errors.
-        rt.get()
+        rt.get(fs)
 
 if __name__ == '__main__':
     unittest.main()

From cb1107696eecc30363df0539b4c2d160787e4454 Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Fri, 3 Feb 2023 17:34:30 -0800
Subject: [PATCH 16/21] Adding unit tests for SAM @SQ header validation

---
 tests/unit_tests_validation.py | 116 ++++++++++++++++++++++++++++++++-
 1 file changed, 114 insertions(+), 2 deletions(-)

diff --git a/tests/unit_tests_validation.py b/tests/unit_tests_validation.py
index a692c216..79226116 100644
--- a/tests/unit_tests_validation.py
+++ b/tests/unit_tests_validation.py
@@ -6,10 +6,10 @@
 from glob import glob
 from unittest.mock import patch, mock_open
 
-from rna.counter.validation import GFFValidator, ReportFormatter
+from rna.counter.validation import GFFValidator, ReportFormatter, SamSqValidator
 
 
-class ValidationTests(unittest.TestCase):
+class GFFValidatorTest(unittest.TestCase):
 
     """======== Helper functions =========================== """
 
@@ -295,6 +295,118 @@ def test_chrom_heuristics_runtime(self):
         self.assertLessEqual(end-start, 2)
 
 
+class SamSqValidatorTest(unittest.TestCase):
+    @classmethod
+    def setUpClass(self):
+        self.syntax_header = "Every SAM file must have complete @SQ headers with SN and LN\n" \
+                             "fields when counting in non-genomic mode.\n"
+
+        self.identifier_header = "Sequence identifiers must be unique and have consistent length definitions.\n"
+    
+    def make_sam_sq_header(self, chroms):
+        return '\n'.join(
+            [f"@SQ\tSN:{chrom[0]}\tLN:{chrom[1]}"
+             for chrom in chroms]
+        )
+
+    def make_parsed_sq_header(self, chroms):
+        return [{'SN': chrom[0], 'LN': chrom[1]} if chrom[0] and chrom[1] else
+                {'SN': chrom[0]} if not chrom[1] else
+                {'LN': chrom[1]}
+                for chrom in chroms]
+
+    """Does SamSqValidator correctly identify missing @SQ headers?"""
+
+    def test_missing_sq_headers(self):
+        mock_files = ['sam1', 'sam2']
+        mock_sam_headers = {
+            mock_files[0]: self.make_parsed_sq_header([('chr1', 100), ('chr2', 200)]),
+            mock_files[1]: []
+        }
+
+        expected = '\n'.join([
+            self.syntax_header,
+            "\t" + f"{SamSqValidator.targets['missing sq']}: ",
+            "\t\t" + mock_files[1]
+        ])
+
+        validator = SamSqValidator(mock_files)
+        validator.sq_headers = mock_sam_headers
+        validator.validate_sq_headers()
+
+        self.assertListEqual(validator.report.errors, [expected])
+        self.assertListEqual(validator.report.warnings, [])
+
+    """Does SamSqValidator correctly identify incomplete @SQ headers?"""
+
+    def test_incomplete_sq_headers(self):
+        mock_files = ['sam1', 'sam2', 'sam3']
+        mock_sam_headers = {
+            mock_files[0]: self.make_parsed_sq_header([('chr1', 100),  ('chr2', 200)]),
+            mock_files[1]: self.make_parsed_sq_header([('chr3', None), ('chr4', 400)]),
+            mock_files[2]: self.make_parsed_sq_header([('chr5', 500),  (None,   600)])
+        }
+
+        expected = '\n'.join([
+            self.syntax_header,
+            "\t" + f"{SamSqValidator.targets['incomplete sq']}: ",
+            "\t\t" + mock_files[1],
+            "\t\t" + mock_files[2],
+        ])
+
+        validator = SamSqValidator(mock_files)
+        validator.sq_headers = mock_sam_headers
+        validator.validate_sq_headers()
+
+        self.assertListEqual(validator.report.errors, [expected])
+        self.assertListEqual(validator.report.warnings, [])
+
+    """Does SamSqValidator correctly identify duplicate identifiers?"""
+
+    def test_duplicate_identifiers(self):
+        mock_files = ['sam1', 'sam2', 'sam3']
+        mock_sam_headers = {
+            mock_files[0]: self.make_parsed_sq_header([('chr1', 100), ('chr2', 200)]),
+            mock_files[1]: self.make_parsed_sq_header([('chr1', 100), ('chr2', 200)]),
+            mock_files[2]: self.make_parsed_sq_header([('chr1', 100), ('chr1', 100)])
+        }
+
+        expected = '\n'.join([
+            self.identifier_header,
+            "\t" + f"{SamSqValidator.targets['intra sq']}: ",
+            "\t\t" + mock_files[2],
+        ])
+
+        validator = SamSqValidator(mock_files)
+        validator.sq_headers = mock_sam_headers
+        validator.validate_sq_headers()
+
+        self.assertListEqual(validator.report.errors, [expected])
+        self.assertListEqual(validator.report.warnings, [])
+
+    """Does SamSqValidator correctly identify identifiers with inconsistent length definitions?"""
+
+    def test_inconsistent_identifier_length(self):
+        mock_files = ['sam1', 'sam2', 'sam3']
+        mock_sam_headers = {
+            mock_files[0]: self.make_parsed_sq_header([('chr1', 100), ('chr2', 200)]),
+            mock_files[1]: self.make_parsed_sq_header([('chr1', 100), ('chr2', 300)]),
+            mock_files[2]: self.make_parsed_sq_header([('chr1', 200), ('chr2', 100)])
+        }
+
+        expected = '\n'.join([
+            self.identifier_header,
+            "\t" + f"{SamSqValidator.targets['inter sq']}: ",
+            "\t\t" + 'chr1',
+            "\t\t" + 'chr2',
+        ])
+
+        validator = SamSqValidator(mock_files)
+        validator.sq_headers = mock_sam_headers
+        validator.validate_sq_headers()
+
+        self.assertListEqual(validator.report.errors, [expected])
+        self.assertListEqual(validator.report.warnings, [])
 
 if __name__ == '__main__':
     unittest.main()

From a0df42a0ff36058806bf684c03323efb84990410 Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Fri, 3 Feb 2023 18:04:20 -0800
Subject: [PATCH 17/21] Bugfix: empty classifier fields are parsed from
 Features Sheet as an empty string. NonGenomicAnnotations has to emulate this.

---
 tiny/rna/counter/hts_parsing.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py
index 3c5b2ea4..39d1c508 100644
--- a/tiny/rna/counter/hts_parsing.py
+++ b/tiny/rna/counter/hts_parsing.py
@@ -796,7 +796,7 @@ def add_reference_seq(self, seq_id, seq_len, unbuilt_match_tuples):
 
         # Features are classified in Reference Tables (Stage 1 selection)
         # For compatibility, use the seq_id with an empty classifier (index 1)
-        tagged_id = (seq_id,)
+        tagged_id = (seq_id, '')
         self.tags[seq_id] = {tagged_id}
 
         for strand in ('+', '-'):

From 9a4e00e5c62a34441988eaa5a670acefd8afec79 Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Fri, 3 Feb 2023 19:19:48 -0800
Subject: [PATCH 18/21] Renaming classes so they follow a better pattern:

AnnotationParsing -> ReferenceBase
ReferenceTables -> ReferenceFeatures
NonGenomicAnnotations -> ReferenceSeqs

Also made some corrections to comments:
- Uses of ReferenceTables that also apply to ReferenceSeqs have been changed to "reference parsers" for the most part
- A few spelling mistakes
---
 tests/unit_tests_hts_parsing.py | 72 ++++++++++++++++-----------------
 tests/unit_tests_stepvector.py  |  2 +-
 tiny/rna/counter/counter.py     | 16 ++++----
 tiny/rna/counter/features.py    | 12 +++---
 tiny/rna/counter/hts_parsing.py | 15 ++++---
 tiny/rna/counter/validation.py  |  8 ++--
 6 files changed, 64 insertions(+), 61 deletions(-)

diff --git a/tests/unit_tests_hts_parsing.py b/tests/unit_tests_hts_parsing.py
index e6c54213..74de20d7 100644
--- a/tests/unit_tests_hts_parsing.py
+++ b/tests/unit_tests_hts_parsing.py
@@ -130,7 +130,7 @@ def test_gff_attr_list_vals(self):
         self.assertTrue(all([len(val) == 1 for key, val in attr.items() if key != "Class"]))
         self.assertEqual(len(attr['Class']), 2)
 
-    """Does ReferenceTables.get() return the expected features, aliases, and classes for a single record GFF?"""
+    """Does ReferenceFeatures.get() return the expected features, aliases, and classes for a single record GFF?"""
 
     def test_ref_tables_single_feature(self):
         feature_source = {self.short_gff_file: ["sequence_name"]}
@@ -145,7 +145,7 @@ def test_ref_tables_single_feature(self):
         iv = HTSeq.GenomicInterval("I", 3746, 3909, "-")
         kwargs = {'all_features': True}
 
-        feats, alias, tags = ReferenceTables(feature_source, **kwargs).get(feature_selector)
+        feats, alias, tags = ReferenceFeatures(feature_source, **kwargs).get(feature_selector)
         steps = list(feats[iv].array[iv.start:iv.end].get_steps(values_only=True))
 
         self.assertEqual((type(feats), type(alias), type(tags)), (HTSeq.GenomicArrayOfSets, dict, defaultdict))
@@ -169,7 +169,7 @@ def test_ref_tables_single_feature_all_features_false(self):
         iv = HTSeq.GenomicInterval("I", 3746, 3909, "-")
         kwargs = {'all_features': True}
 
-        feats, alias, tags = ReferenceTables(feature_source, **kwargs).get(feature_selector)
+        feats, alias, tags = ReferenceFeatures(feature_source, **kwargs).get(feature_selector)
         steps = list(feats[iv].array[iv.start:iv.end].get_steps(values_only=True))
 
         self.assertEqual((type(feats), type(alias), type(tags)), (HTSeq.GenomicArrayOfSets, dict, defaultdict))
@@ -189,12 +189,12 @@ def test_ref_tables_missing_name_attribute_all_features_false(self):
                        "This may be due to a lack of features matching 'Select for...with value...'"
 
         # Since all_features is False and there are no identity matches, the main loop in
-        # ReferenceTables.get() skips the steps for recording the feature's alias.
+        # ReferenceFeatures.get() skips the steps for recording the feature's alias.
         # Instead, a different exception is raised due to reference tables being empty
         with self.assertRaisesRegex(ValueError, expected_err):
-            ReferenceTables(feature_source, **kwargs).get(feature_selector)
+            ReferenceFeatures(feature_source, **kwargs).get(feature_selector)
 
-    """Does ReferenceTables.get() raise ValueError when a feature lacks an ID attribute?"""
+    """Does ReferenceFeatures.get() raise ValueError when a feature lacks an ID attribute?"""
 
     def test_ref_tables_missing_id_attribute(self):
         feature_source = {self.short_gff_file: ["ID"]}
@@ -209,10 +209,10 @@ def test_ref_tables_missing_id_attribute(self):
 
         with patch('tiny.rna.counter.hts_parsing.HTSeq.utils.open', new=mock_reader):
             with self.assertRaisesRegex(ValueError, expected_err):
-                _ = ReferenceTables(feature_source, **kwargs).get(feature_selector)
+                _ = ReferenceFeatures(feature_source, **kwargs).get(feature_selector)
 
-    """Does ReferenceTables.get() properly concatenate aliases if there is more than one alias for a feature?"""
-    """Does ReferenceTables.get() properly concatenate aliases when Name Attribute refers to a list-type alias?"""
+    """Does ReferenceFeatures.get() properly concatenate aliases if there is more than one alias for a feature?"""
+    """Does ReferenceFeatures.get() properly concatenate aliases when Name Attribute refers to a list-type alias?"""
     # 2 for 1!
 
     def test_ref_tables_alias_multisource_concat(self):
@@ -221,7 +221,7 @@ def test_ref_tables_alias_multisource_concat(self):
 
         # Notice: screening for "ID" name attribute happens earlier in counter.load_config()
         expected_alias = {"Gene:WBGene00023193": ("additional_class", "Gene:WBGene00023193", "unknown")}
-        _, alias, _ = ReferenceTables(feature_source, **kwargs).get(MockFeatureSelector([]))
+        _, alias, _ = ReferenceFeatures(feature_source, **kwargs).get(MockFeatureSelector([]))
 
         self.assertDictEqual(alias, expected_alias)
 
@@ -236,9 +236,9 @@ def test_ref_tables_alias_multisource_concat_all_features_false(self):
 
         with self.assertRaisesRegex(ValueError, expected_err):
             # No aliases saved due to all_features=False and the lack of identity matches
-            _, alias, _ = ReferenceTables(feature_source, **kwargs).get(MockFeatureSelector([]))
+            _, alias, _ = ReferenceFeatures(feature_source, **kwargs).get(MockFeatureSelector([]))
 
-    """Does ReferenceTables.get() properly concatenate identity match tuples when multiple GFF files define
+    """Does ReferenceFeatures.get() properly concatenate identity match tuples when multiple GFF files define
     matches for a feature?"""
 
     def test_ref_tables_identity_matches_multisource_concat(self):
@@ -262,19 +262,19 @@ def test_ref_tables_identity_matches_multisource_concat(self):
             set()
         ]
 
-        feats, _, _ = ReferenceTables(feature_source, **kwargs).get(feature_selector)
+        feats, _, _ = ReferenceFeatures(feature_source, **kwargs).get(feature_selector)
 
         actual_matches = list(feats.chrom_vectors['I']['.'].array.get_steps(values_only=True))
         self.assertListEqual(actual_matches, expected_matches)
 
-    """Does ReferenceTables.get() properly handle aliases for discontinuous features?"""
+    """Does ReferenceFeatures.get() properly handle aliases for discontinuous features?"""
 
     def test_ref_tables_discontinuous_aliases(self):
         kwargs = {'all_features': True}
         feature_source = {f"{resources}/discontinuous.gff3": ["Name"]}
         mock_selector = self.selector_with_template(helpers.rules_template)
 
-        _, alias, _ = ReferenceTables(feature_source, **kwargs).get(mock_selector)
+        _, alias, _ = ReferenceFeatures(feature_source, **kwargs).get(mock_selector)
 
         # Ancestor depth of 1, distinct aliases
         self.assertEqual(alias['Parent2'], ('Child2Name', 'Parent2Name'))
@@ -294,9 +294,9 @@ def test_ref_tables_discontinuous_no_match_all_features_false(self):
                        "This may be due to a lack of features matching 'Select for...with value...'"
 
         with self.assertRaisesRegex(ValueError, expected_err):
-            ReferenceTables(feature_source, **kwargs).get(mock_selector)
+            ReferenceFeatures(feature_source, **kwargs).get(mock_selector)
 
-    """Does ReferenceTables.get() properly build a GenomicArrayOfSets for discontinuous features
+    """Does ReferenceFeatures.get() properly build a GenomicArrayOfSets for discontinuous features
         where no features match but all_features is True? If the feature didn't match we only want to
         display it in the Feature ID column of counts table with 0 counts. We DON'T want it to pop
         up as a candidate in Stage 2 selection."""
@@ -311,11 +311,11 @@ def test_ref_tables_discontinuous_features(self):
         # EVEN if all_features = True.
         expected = [set()]
 
-        feats, _, _ = ReferenceTables(feature_source, **kwargs).get(feature_selector)
+        feats, _, _ = ReferenceFeatures(feature_source, **kwargs).get(feature_selector)
         actual = list(feats.chrom_vectors["I"]["."].array.get_steps(values_only=True))
         self.assertListEqual(actual, expected)
 
-    """Does ReferenceTables.get() properly handle intervals for discontinous features?"""
+    """Does ReferenceFeatures.get() properly handle intervals for discontinous features?"""
 
     def test_ref_tables_discontinuous_intervals(self):
         kwargs = {'all_features': True}
@@ -331,7 +331,7 @@ def test_ref_tables_discontinuous_intervals(self):
         sib_2 = HTSeq.GenomicInterval('I', 110, 120, '-')
         sib_3 = HTSeq.GenomicInterval('I', 139, 150, '-')
 
-        RT_instance = ReferenceTables(feature_source, **kwargs)
+        RT_instance = ReferenceFeatures(feature_source, **kwargs)
         _ = RT_instance.get(feature_selector)
 
         # Ancestor depth of 1
@@ -341,7 +341,7 @@ def test_ref_tables_discontinuous_intervals(self):
         # Siblings
         self.assertEqual(RT_instance.intervals['Sibling'], [sib_1, sib_2, sib_3])
 
-    """Does ReferenceTables.get() properly merge identity matches of discontinuous features with the root feature?
+    """Does ReferenceFeatures.get() properly merge identity matches of discontinuous features with the root feature?
     Identity match tuples now also contain the corresponding rule's IntervalSelector, so extra bookkeeping must be
     performed for intervals in this test."""
 
@@ -381,11 +381,11 @@ def test_ref_tables_discontinuous_identity_matches(self):
                     {(Sibling,     False, (rule3_sib['139:150'], rule2_sib['139:150']))},
                     set()]
 
-        feats, _, _ = ReferenceTables(feature_source, **rt_kwargs).get(feature_selector)
+        feats, _, _ = ReferenceFeatures(feature_source, **rt_kwargs).get(feature_selector)
         actual_steps = list(feats.chrom_vectors["I"]["."].array.get_steps(values_only=True))
         self.assertListEqual(actual_steps, expected)
 
-    """Does ReferenceTables.get() properly handle source filters for discontinuous features?"""
+    """Does ReferenceFeatures.get() properly handle source filters for discontinuous features?"""
 
     def test_ref_tables_source_filter(self):
 
@@ -401,7 +401,7 @@ def test_ref_tables_source_filter(self):
         exp_filtered =  {"GrandParent", "ParentWithGrandparent", "Parent2", "Child1", "Sibling"}
         exp_parents =   {'ParentWithGrandparent': 'GrandParent', 'Child1': 'ParentWithGrandparent', 'Child2': 'Parent2'}
 
-        rt = ReferenceTables(feature_source, **kwargs)
+        rt = ReferenceFeatures(feature_source, **kwargs)
         feats, alias, _ = rt.get(feature_selector)
 
         self.assertEqual(alias, exp_alias)
@@ -410,7 +410,7 @@ def test_ref_tables_source_filter(self):
         self.assertEqual(list(feats.chrom_vectors['I']['.'].array.get_steps(values_only=True)), exp_feats)
         self.assertDictEqual(rt.intervals, exp_intervals)
 
-    """Does ReferenceTables.get() properly handle type filters for discontinuous features?"""
+    """Does ReferenceFeatures.get() properly handle type filters for discontinuous features?"""
 
     def test_ref_tables_type_filter(self):
 
@@ -426,7 +426,7 @@ def test_ref_tables_type_filter(self):
         exp_filtered =  {"GrandParent", "ParentWithGrandparent", "Parent2", "Child2", "Sibling"}
         exp_parents =   {'ParentWithGrandparent': 'GrandParent', 'Child1': 'ParentWithGrandparent', 'Child2': 'Parent2'}
 
-        rt = ReferenceTables(feature_source, **kwargs)
+        rt = ReferenceFeatures(feature_source, **kwargs)
         feats, alias, _ = rt.get(feature_selector)
 
         self.assertEqual(alias, exp_alias)
@@ -435,7 +435,7 @@ def test_ref_tables_type_filter(self):
         self.assertEqual(rt.filtered, exp_filtered)
         self.assertEqual(list(feats.chrom_vectors['I']['.'].array.get_steps(values_only=True)), exp_feats)
 
-    """Does ReferenceTables.get() properly handle both source and type filters for discontinuous features?"""
+    """Does ReferenceFeatures.get() properly handle both source and type filters for discontinuous features?"""
 
     def test_ref_tables_both_filter(self):
 
@@ -443,7 +443,7 @@ def test_ref_tables_both_filter(self):
         feature_source = {f"{resources}/discontinuous.gff3": ["Name"]}
         feature_selector = self.selector_with_template([{'Filter_s': "SourceName", 'Filter_t': "gene"}])
 
-        rt = ReferenceTables(feature_source, **kwargs)
+        rt = ReferenceFeatures(feature_source, **kwargs)
         feats, alias, _ = rt.get(feature_selector)
 
         self.assertEqual(rt.filtered, {'Child1', 'Child2'})
@@ -451,7 +451,7 @@ def test_ref_tables_both_filter(self):
         self.assertEqual(list(alias.keys()), ['GrandParent', 'Parent2', 'Sibling'])
         self.assertEqual(len(list(feats.chrom_vectors['I']['.'].array.get_steps(values_only=True))), 9)
 
-    """Does ReferenceTables.get() maintain correct records for a single feature matching tagged rules?"""
+    """Does ReferenceFeatures.get() maintain correct records for a single feature matching tagged rules?"""
 
     def test_ref_tables_tagged_match_single(self):
         kwargs = {'all_features': False}
@@ -473,14 +473,14 @@ def test_ref_tables_tagged_match_single(self):
             set()
         ]
 
-        feats, aliases, tags = ReferenceTables(feature_source, **kwargs).get(feature_selector)
+        feats, aliases, tags = ReferenceFeatures(feature_source, **kwargs).get(feature_selector)
 
         actual_feats = list(feats.chrom_vectors['I']['.'].array.get_steps(values_only=True))
         self.assertListEqual(actual_feats, expected_feats)
         self.assertDictEqual(aliases, expected_aliases)
         self.assertDictEqual(tags, expected_tags)
 
-    """Does ReferenceTables.get() correctly merge records for discontinuous features matching multiple tagged rules?"""
+    """Does ReferenceFeatures.get() correctly merge records for discontinuous features matching multiple tagged rules?"""
 
     def test_ref_tables_tagged_match_merging(self):
         feature_source = {f"{resources}/discontinuous.gff3": ['Name']}
@@ -509,7 +509,7 @@ def test_ref_tables_tagged_match_merging(self):
             set()
         ]
 
-        feats, aliases, tags = ReferenceTables(feature_source).get(feature_selector)
+        feats, aliases, tags = ReferenceFeatures(feature_source).get(feature_selector)
 
         stepvec = list(feats.chrom_vectors['I']['.'].array.get_steps(values_only=True))
         self.assertListEqual(stepvec, expected_feats)
@@ -717,7 +717,7 @@ def test_CaseInsensitiveAttrs_contains_ident_wildcard(self):
 
 @unittest.skip("Long-running test, execute manually")
 class GenomeParsingTests(unittest.TestCase):
-    """Runs full-scale, unmodified GFF3/GTF genomes for select species through the ReferenceTables class"""
+    """Runs full-scale, unmodified GFF3/GTF genomes for select species through the ReferenceFeatures class"""
 
     @classmethod
     def setUpClass(self):
@@ -753,17 +753,17 @@ def setUpClass(self):
                         for chunk in r.iter_content(chunk_size=16384):
                             f.write(chunk)
 
-    """Does ReferenceTables.get() process all genomes without throwing any errors?"""
+    """Does ReferenceFeatures.get() process all genomes without throwing any errors?"""
     def test_gff_megazord(self):
         print("Running GFF Megazord test. This will take a long time...")
 
-        # Single rule with all wildcard selectors, but only Identity is actually relevant within ReferenceTables
+        # Single rule with all wildcard selectors, but only Identity is actually relevant within ReferenceFeatures
         rules = [{'Identity': ('', ''), 'Class': '', 'Filter_s': '', 'Filter_t': '', 'Hierarchy': 0,
                   'Overlap': 'partial', 'Strand': '', 'nt5end': '', 'Length': ''}]
         files = {gff.format(ftype): [] for gff in self.genomes.values() for ftype in ('gff3', 'gtf')}
 
         fs = FeatureSelector(rules, LibraryStats())
-        rt = ReferenceTables(files)
+        rt = ReferenceFeatures(files)
 
         # The test is passed if this command
         # completes without throwing errors.
diff --git a/tests/unit_tests_stepvector.py b/tests/unit_tests_stepvector.py
index c72facb5..0487868f 100644
--- a/tests/unit_tests_stepvector.py
+++ b/tests/unit_tests_stepvector.py
@@ -10,7 +10,7 @@ class StepVectorTests(unittest.TestCase):
 
     def test_genomicarray_with_cython_stepvec(self):
         # Patch the StepVector reference in the HTSeq module and use a GenomicArray
-        # instead of a GenomicArrayOfSets, just as we do in ReferenceTables
+        # instead of a GenomicArrayOfSets, just as we do in reference parsers
         setattr(HTSeq.StepVector, 'StepVector', StepVector)
         gas = HTSeq.GenomicArray('auto', stranded=False)
 
diff --git a/tiny/rna/counter/counter.py b/tiny/rna/counter/counter.py
index 6ee45ecc..6f402842 100644
--- a/tiny/rna/counter/counter.py
+++ b/tiny/rna/counter/counter.py
@@ -11,7 +11,7 @@
 from tiny.rna.counter.validation import GFFValidator, SamSqValidator
 from tiny.rna.counter.features import Features, FeatureCounter
 from tiny.rna.counter.statistics import MergedStatsManager
-from tiny.rna.counter.hts_parsing import ReferenceTables, NonGenomicAnnotations, AnnotationParsing
+from tiny.rna.counter.hts_parsing import ReferenceFeatures, ReferenceSeqs, ReferenceBase
 from tiny.rna.util import report_execution_time, from_here, ReadOnlyDict, get_timestamp, add_transparent_help
 from tiny.rna.configuration import CSVReader, PathsFile, get_templates
 
@@ -132,7 +132,7 @@ def get_library_filename(csv_row_file: str, samples_csv: str) -> str:
 
 
 def load_config(features_csv: str, is_pipeline: bool) -> List[dict]:
-    """Parses the Features Sheet to provide inputs to FeatureSelector and ReferenceTables
+    """Parses the Features Sheet to provide inputs to FeatureSelector and reference parsers
 
     Args:
         features_csv: a csv file which defines feature sources and selection rules
@@ -150,7 +150,7 @@ def load_config(features_csv: str, is_pipeline: bool) -> List[dict]:
         rule['nt5end'] = rule['nt5end'].upper().translate({ord('U'): 'T'})  # Convert RNA base to cDNA base
         rule['Identity'] = (rule.pop('Key'), rule.pop('Value'))             # Create identity tuple
         rule['Hierarchy'] = int(rule['Hierarchy'])                          # Convert hierarchy to number
-        rule['Overlap'] = rule['Overlap'].lower()                           # Built later in ReferenceTables
+        rule['Overlap'] = rule['Overlap'].lower()                           # Built later in reference parsers
 
         # Duplicate rule entries are not allowed
         if rule not in rules: rules.append(rule)
@@ -158,17 +158,17 @@ def load_config(features_csv: str, is_pipeline: bool) -> List[dict]:
     return rules
 
 
-def load_references(paths: PathsFile, libraries: List[dict], rules: List[dict], prefs) -> AnnotationParsing:
+def load_references(paths: PathsFile, libraries: List[dict], rules: List[dict], prefs) -> ReferenceBase:
     """Determines the reference source (GFF or SAM @SQ headers) and constructs the appropriate object
 
     Args:
         paths: a PathsFile object that optionally contains a `gff_files` configuration
         libraries: libraries parsed from Samples Sheet, each as a dict with a 'File' key
         rules: selection rules parsed from Features Sheet
-        prefs: command line arguments to pass to the AnnotationParsing subclass
+        prefs: command line arguments to pass to the ReferenceBase subclass
 
     Returns:
-        references: an AnnotationParsing object, subclassed based on
+        references: a ReferenceBase object, subclassed based on
             the presence of GFF files in the Paths File
     """
 
@@ -177,14 +177,14 @@ def load_references(paths: PathsFile, libraries: List[dict], rules: List[dict],
 
     if gff_files:
         GFFValidator(gff_files, rules, alignments=sam_files).validate()
-        references = ReferenceTables(gff_files, **prefs)
+        references = ReferenceFeatures(gff_files, **prefs)
     else:
         sq_validator = SamSqValidator(sam_files)
 
         # Reuse sq_validator's parsing results to save time
         sq_validator.validate()
         sequences = sq_validator.reference_seqs
-        references = NonGenomicAnnotations(sequences, **prefs)
+        references = ReferenceSeqs(sequences, **prefs)
 
     return references
 
diff --git a/tiny/rna/counter/features.py b/tiny/rna/counter/features.py
index d85d02a3..42b0d1d4 100644
--- a/tiny/rna/counter/features.py
+++ b/tiny/rna/counter/features.py
@@ -4,7 +4,7 @@
 from collections import defaultdict
 from typing import List, Tuple, Set, Dict, Mapping
 
-from tiny.rna.counter.hts_parsing import SAM_reader, ReferenceTables, NonGenomicAnnotations
+from tiny.rna.counter.hts_parsing import SAM_reader, ReferenceFeatures, ReferenceSeqs
 from .statistics import LibraryStats
 from .matching import *
 
@@ -32,12 +32,12 @@ def __init__(self, references, selection_rules, **prefs):
         self.sam_reader = SAM_reader(**prefs)
         self.selector = FeatureSelector(selection_rules, self.stats, **prefs)
 
-        if isinstance(references, ReferenceTables):
+        if isinstance(references, ReferenceFeatures):
             self.mode = "genomic"
-        elif isinstance(references, NonGenomicAnnotations):
+        elif isinstance(references, ReferenceSeqs):
             self.mode = "non-genomic"
         else:
-            raise TypeError("Expected ReferenceTables or NonGenomicAnnotations, got %s" % type(references))
+            raise TypeError("Expected ReferenceFeatures or ReferenceSeqs, got %s" % type(references))
 
         Features(*references.get(self.selector))
         self.prefs = prefs
@@ -84,7 +84,7 @@ class FeatureSelector:
     Two sources of data serve as targets for selection: feature attributes (sourced from
     input GFF files), and sequence attributes (sourced from input SAM files).
     All candidate features are assumed to have matched at least one Identity selector,
-    as determined by hts_parsing.ReferenceTables.get_matches_and_classes()
+    as determined by hts_parsing.ReferenceFeatures.get_matches_and_classes()
 
     The first round of selection was performed during GFF parsing.
 
@@ -205,7 +205,7 @@ def build_interval_selectors(iv: 'HTSeq.GenomicInterval', match_tuples: List[unb
 
         Unlike build_selectors() and build_inverted_identities(), this function
         is not called at construction time. Instead, it is called when finalizing
-        match-tuples in ReferenceTables. This is because interval selectors are
+        match-tuples in reference parsers. This is because interval selectors are
         created for each feature (requiring start/stop/strand to be known) for
         each of the feature's identity matches (each match-tuple).
 
diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py
index 39d1c508..797f7f69 100644
--- a/tiny/rna/counter/hts_parsing.py
+++ b/tiny/rna/counter/hts_parsing.py
@@ -236,7 +236,7 @@ def infer_strandedness(sam_file: str, intervals: dict) -> str:
 
     Args:
         sam_file: the path of the SAM file to evaluate
-        intervals: the intervals instance attribute of ReferenceTables, populated by .get()
+        intervals: the intervals instance attribute of ReferenceFeatures, populated by .get()
     """
 
     unstranded = HTSeq.GenomicArrayOfSets("auto", stranded=False)
@@ -419,7 +419,9 @@ def not_implemented(self):
         raise NotImplementedError(f"CaseInsensitiveAttrs does not support {stack()[1].function}")
 
 
-class AnnotationParsing(ABC):
+class ReferenceBase(ABC):
+    """The base class for reference parsers"""
+
     def __init__(self, **prefs):
         self.stepvector = prefs.get('stepvector', 'HTSeq')
         self.feats = self._init_genomic_array()
@@ -501,7 +503,8 @@ def parse_gff(file, row_fn: Callable, alias_keys=None):
 TagTable = DefaultDict[str, Set[Tuple[str, str]]]
 GenomicArray = HTSeq.GenomicArrayOfSets
 
-class ReferenceTables(AnnotationParsing):
+
+class ReferenceFeatures(ReferenceBase):
     """A GFF parser which builds feature, alias, and class reference tables
 
     Discontinuous features are supported, and comma-separated attribute values (GFF3 column 9)
@@ -514,7 +517,7 @@ class ReferenceTables(AnnotationParsing):
     Children of the root ancestor are otherwise not stored in the reference tables.
 
     Match-tuples are created for each Features Sheet rule which matches a feature's attributes.
-    They are structured as (rank, rule, overlap). "Rank" is the heirarchy value of the matching
+    They are structured as (rank, rule, overlap). "Rank" is the hierarchy value of the matching
     rule, "rule" is the index of that rule in FeatureSelector's rules table, and "overlap" is the
     IntervalSelector per the rule's overlap requirements.
 
@@ -582,7 +585,7 @@ def get_root_feature(self, lineage: list) -> str:
         """Returns the highest feature ID in the ancestral tree which passed stage 1 selection.
         The original feature ID is returned if there are no valid ancestors."""
 
-        # Descend tree until the descendent is found in the matches table
+        # Descend tree until the descendant is found in the matches table
         # This is because ancestor feature(s) may have been filtered
         for ancestor in lineage[::-1]:
             if self.was_matched(ancestor):
@@ -761,7 +764,7 @@ def column_filter_match(row, rule):
         return row.source in rule['Filter_s'] and row.type in rule['Filter_t']
 
 
-class NonGenomicAnnotations(AnnotationParsing):
+class ReferenceSeqs(ReferenceBase):
 
     def __init__(self, reference_seqs, **prefs):
         super().__init__(**prefs)
diff --git a/tiny/rna/counter/validation.py b/tiny/rna/counter/validation.py
index 714d6f99..349fb615 100644
--- a/tiny/rna/counter/validation.py
+++ b/tiny/rna/counter/validation.py
@@ -6,7 +6,7 @@
 from collections import Counter, defaultdict
 from typing import List, Dict
 
-from tiny.rna.counter.hts_parsing import parse_gff, ReferenceTables, SAM_reader
+from tiny.rna.counter.hts_parsing import parse_gff, ReferenceFeatures, SAM_reader
 from tiny.rna.counter.features import FeatureSelector
 from tiny.rna.util import sorted_natural, gzip_open, report_execution_time
 
@@ -92,7 +92,7 @@ class GFFValidator:
     }
 
     def __init__(self, gff_files, rules, ebwt=None, genomes=None, alignments=None):
-        self.ReferenceTables = ReferenceTables(gff_files)
+        self.ReferenceFeatures = ReferenceFeatures(gff_files)
         self.column_filters = self.build_column_filters(rules)
         self.report = ReportFormatter(self.targets)
         self.chrom_set = set()
@@ -120,13 +120,13 @@ def parse_and_validate_gffs(self, file_set):
     def validate_gff_row(self, row, issues, file):
         # Skip definitions of whole chromosomes and obey source/type filters
         if row.type.lower() == "chromosome": return
-        if not self.ReferenceTables.column_filter_match(row, self.column_filters): return
+        if not self.ReferenceFeatures.column_filter_match(row, self.column_filters): return
 
         if row.iv.strand not in ('+', '-'):
             issues['strand'][file] += 1
 
         try:
-            self.ReferenceTables.get_feature_id(row)
+            self.ReferenceFeatures.get_feature_id(row)
         except:
             issues['ID attribute'][file] += 1
 

From 3247ea63814dd29aeae16bc49d7d0cedfbf339ad Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Fri, 3 Feb 2023 19:23:06 -0800
Subject: [PATCH 19/21] This commit only changes the order in which methods are
 defined in ReferenceFeatures. No code changes and no functional changes.

The previous order was mostly arbitrary and it has been bothering me for a while. The new order mostly prioritizes core functions/concepts and the rest is roughly by calling order. This should make it a little easier to browse.
---
 tiny/rna/counter/hts_parsing.py | 122 ++++++++++++++++----------------
 1 file changed, 61 insertions(+), 61 deletions(-)

diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py
index 797f7f69..b8e4ba73 100644
--- a/tiny/rna/counter/hts_parsing.py
+++ b/tiny/rna/counter/hts_parsing.py
@@ -581,31 +581,37 @@ def parse_row(self, row, alias_keys=None):
         # Add alias to root ancestor if it is unique
         self.add_alias(root_id, alias_keys, row.attr)
 
-    def get_root_feature(self, lineage: list) -> str:
-        """Returns the highest feature ID in the ancestral tree which passed stage 1 selection.
-        The original feature ID is returned if there are no valid ancestors."""
+    @staticmethod
+    def get_feature_id(row):
+        id_collection = row.attr.get('ID') \
+                        or row.attr.get('gene_id') \
+                        or row.attr.get('Parent')
 
-        # Descend tree until the descendant is found in the matches table
-        # This is because ancestor feature(s) may have been filtered
-        for ancestor in lineage[::-1]:
-            if self.was_matched(ancestor):
-                return ancestor
-        else:
-            return lineage[0]  # Default: the original feature_id
+        if id_collection is None:
+            raise ValueError(f"Feature {row.name} does not have an ID attribute.")
+        if len(id_collection) == 0:
+            raise ValueError("A feature's ID attribute cannot be empty. This value is required.")
+        if len(id_collection) > 1:
+            return ','.join(id_collection)
 
-    def get_feature_ancestors(self, feature_id: str, row_attrs: CaseInsensitiveAttrs):
-        if "Parent" not in row_attrs:
-            return [feature_id]
+        return id_collection[0]
 
-        parent_id = self.get_row_parent(feature_id, row_attrs)
-        lineage = [feature_id, parent_id]
+    def get_matches(self, row: HTSeq.GenomicFeature) -> DefaultDict:
+        """Performs Stage 1 selection and returns match tuples under their associated classifier"""
 
-        # Climb ancestral tree until the root parent is found
-        while parent_id in self.parents:
-            parent_id = self.parents[parent_id]
-            lineage.append(parent_id)
+        identity_matches = defaultdict(set)
+        for ident, rule_indexes in self.selector.inv_ident.items():
+            if row.attr.contains_ident(ident):
+                for index in rule_indexes:
+                    rule = self.selector.rules_table[index]
+                    if self.column_filter_match(row, rule):
+                        # Unclassified matches are pooled under '' empty string
+                        identity_matches[rule['Class']].add(
+                            (index, rule['Hierarchy'], rule['Overlap'])
+                        )
 
-        return lineage
+        # -> identity_matches: {classifier: {(rule, rank, overlap), ...}, ...}
+        return identity_matches
 
     def add_feature(self, feature_id: str, row, matches: defaultdict) -> str:
         """Adds the feature to the intervals table under its root ID, and to the matches table
@@ -630,29 +636,38 @@ def add_feature(self, feature_id: str, row, matches: defaultdict) -> str:
 
         return root_id
 
-    def add_alias(self, root_id: str, alias_keys: List[str], row_attrs: CaseInsensitiveAttrs) -> None:
-        """Add feature's aliases to the root ancestor's alias set"""
+    def get_feature_ancestors(self, feature_id: str, row_attrs: CaseInsensitiveAttrs):
+        if "Parent" not in row_attrs:
+            return [feature_id]
 
-        for alias_key in alias_keys:
-            for row_val in row_attrs.get(alias_key, ()):
-                self.alias[root_id].add(row_val)
+        parent_id = self.get_row_parent(feature_id, row_attrs)
+        lineage = [feature_id, parent_id]
 
-    def get_matches(self, row: HTSeq.GenomicFeature) -> DefaultDict:
-        """Performs Stage 1 selection and returns match tuples under their associated classifier"""
+        # Climb ancestral tree until the root parent is found
+        while parent_id in self.parents:
+            parent_id = self.parents[parent_id]
+            lineage.append(parent_id)
 
-        identity_matches = defaultdict(set)
-        for ident, rule_indexes in self.selector.inv_ident.items():
-            if row.attr.contains_ident(ident):
-                for index in rule_indexes:
-                    rule = self.selector.rules_table[index]
-                    if self.column_filter_match(row, rule):
-                        # Unclassified matches are pooled under '' empty string
-                        identity_matches[rule['Class']].add(
-                            (index, rule['Hierarchy'], rule['Overlap'])
-                        )
+        return lineage
 
-        # -> identity_matches: {class: (rule, rank, overlap), ...}
-        return identity_matches
+    def get_root_feature(self, lineage: list) -> str:
+        """Returns the highest feature ID in the ancestral tree which passed stage 1 selection.
+        The original feature ID is returned if there are no valid ancestors."""
+
+        # Descend tree until the descendant is found in the matches table
+        # This is because ancestor feature(s) may have been filtered
+        for ancestor in lineage[::-1]:
+            if self.was_matched(ancestor):
+                return ancestor
+        else:
+            return lineage[0]  # Default: the original feature_id
+
+    def was_matched(self, untagged_id):
+        """Checks if the feature ID previously matched on identity, regardless of whether
+        the matching rule was tagged or untagged."""
+
+        # any() will short circuit on first match when provided a generator function
+        return any(tagged_id in self.matches for tagged_id in self.tags.get(untagged_id, ()))
 
     def get_row_parent(self, feature_id: str, row_attrs: CaseInsensitiveAttrs) -> str:
         """Get the current feature's parent while cooperating with filtered features"""
@@ -687,6 +702,13 @@ def exclude_row(self, row):
             self.parents[feature_id] = self.get_row_parent(feature_id, row.attr)
         self.chrom_vector_setdefault(row.iv.chrom)
 
+    def add_alias(self, root_id: str, alias_keys: List[str], row_attrs: CaseInsensitiveAttrs) -> None:
+        """Add feature's aliases to the root ancestor's alias set"""
+
+        for alias_key in alias_keys:
+            for row_val in row_attrs.get(alias_key, ()):
+                self.alias[root_id].add(row_val)
+
     def _finalize_aliases(self):
         self.alias = {feat: tuple(sorted(aliases, key=str.lower)) for feat, aliases in self.alias.items()}
 
@@ -734,28 +756,6 @@ def _merge_adjacent_subintervals(unmerged_sub_ivs: List[HTSeq.GenomicInterval])
 
         return merged_ivs
 
-    def was_matched(self, untagged_id):
-        """Checks if the feature ID previously matched on identity, regardless of whether
-        the matching rule was tagged or untagged."""
-
-        # any() will short circuit on first match when provided a generator function
-        return any(tagged_id in self.matches for tagged_id in self.tags.get(untagged_id, ()))
-
-    @staticmethod
-    def get_feature_id(row):
-        id_collection = row.attr.get('ID') \
-                        or row.attr.get('gene_id') \
-                        or row.attr.get('Parent')
-
-        if id_collection is None:
-            raise ValueError(f"Feature {row.name} does not have an ID attribute.")
-        if len(id_collection) == 0:
-            raise ValueError("A feature's ID attribute cannot be empty. This value is required.")
-        if len(id_collection) > 1:
-            return ','.join(id_collection)
-
-        return id_collection[0]
-
     @staticmethod
     def column_filter_match(row, rule):
         """Checks if the GFF row passes the inclusive filter(s)

From ff79c98ef4496a60434cd27ec2c707388f829ca9 Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Fri, 3 Feb 2023 19:23:35 -0800
Subject: [PATCH 20/21] Updating comments in the Paths File to indicate that
 gff_files is optional

---
 START_HERE/paths.yml                  | 2 +-
 tests/testdata/config_files/paths.yml | 2 +-
 tiny/templates/paths.yml              | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/START_HERE/paths.yml b/START_HERE/paths.yml
index 1bda5099..782f2fb3 100644
--- a/START_HERE/paths.yml
+++ b/START_HERE/paths.yml
@@ -7,7 +7,7 @@
 #   1. Fill out the Samples Sheet with files to process + naming scheme. [samples.csv]
 #   2. Fill out the Features Sheet with selection rules [features.csv]
 #   3. Set samples_csv and features_csv (below) to point to these files
-#   4. Add annotation files and per-file alias preferences to gff_files
+#   4. Add annotation files and per-file alias preferences to gff_files (optional)
 #
 ######-------------------------------------------------------------------------------######
 
diff --git a/tests/testdata/config_files/paths.yml b/tests/testdata/config_files/paths.yml
index 125e6a76..92cc0da4 100644
--- a/tests/testdata/config_files/paths.yml
+++ b/tests/testdata/config_files/paths.yml
@@ -7,7 +7,7 @@
 #   1. Fill out the Samples Sheet with files to process + naming scheme. [samples.csv]
 #   2. Fill out the Features Sheet with selection rules [features.csv]
 #   3. Set samples_csv and features_csv (below) to point to these files
-#   4. Add annotation files and per-file alias preferences to gff_files
+#   4. Add annotation files and per-file alias preferences to gff_files (optional)
 #
 ######-------------------------------------------------------------------------------######
 
diff --git a/tiny/templates/paths.yml b/tiny/templates/paths.yml
index 3f7a332d..2b70aba2 100644
--- a/tiny/templates/paths.yml
+++ b/tiny/templates/paths.yml
@@ -7,7 +7,7 @@
 #   1. Fill out the Samples Sheet with files to process + naming scheme. [samples.csv]
 #   2. Fill out the Features Sheet with selection rules [features.csv]
 #   3. Set samples_csv and features_csv (below) to point to these files
-#   4. Add annotation files and per-file alias preferences to gff_files
+#   4. Add annotation files and per-file alias preferences to gff_files (optional)
 #
 ######-------------------------------------------------------------------------------######
 

From ba9762872eb3904ce9c37eafa8183a2583f2e412 Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Mon, 6 Feb 2023 16:08:25 -0800
Subject: [PATCH 21/21] Adding a timeout to the sequence-based counting notice
 that is issued when users don't have GFF files listed.

Also correcting instances of "non-genomic" to "sequence-based"
---
 tests/unit_tests_validation.py  |  2 +-
 tiny/rna/configuration.py       | 15 ++++++++++++---
 tiny/rna/counter/features.py    |  4 ++--
 tiny/rna/counter/hts_parsing.py |  3 +--
 tiny/rna/counter/validation.py  |  5 ++---
 5 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/tests/unit_tests_validation.py b/tests/unit_tests_validation.py
index 79226116..7df9cac5 100644
--- a/tests/unit_tests_validation.py
+++ b/tests/unit_tests_validation.py
@@ -299,7 +299,7 @@ class SamSqValidatorTest(unittest.TestCase):
     @classmethod
     def setUpClass(self):
         self.syntax_header = "Every SAM file must have complete @SQ headers with SN and LN\n" \
-                             "fields when counting in non-genomic mode.\n"
+                             "fields when performing sequence-based counting.\n"
 
         self.identifier_header = "Sequence identifiers must be unique and have consistent length definitions.\n"
     
diff --git a/tiny/rna/configuration.py b/tiny/rna/configuration.py
index 5d8f73c4..e7f6a2d8 100644
--- a/tiny/rna/configuration.py
+++ b/tiny/rna/configuration.py
@@ -2,6 +2,7 @@
 import argparse
 import shutil
 import errno
+import time
 import sys
 import csv
 import os
@@ -581,12 +582,20 @@ def get_gff_config(self) -> Dict[str, list]:
             gff_files[file] = sorted(set(alias), key=alias.index)
 
         if not len(gff_files) and not self.in_pipeline:
-            print("No GFF files were specified in {selfname} so "
-                  "reads will be counted in non-genomic mode."
-                  .format(selfname=self.basename))
+            self.print_sequence_counting_notice()
 
         return dict(gff_files)
 
+    def print_sequence_counting_notice(self):
+        print("No GFF files were specified in {selfname}.\n"
+              "Reads will be quantified by sequence "
+              "rather than by feature."
+              .format(selfname=self.basename))
+
+        for s in reversed(range(6)):
+            print(f"Proceeding in {s}s", end='\r')
+            time.sleep(1)
+
     # Override
     def append_to(self, key: str, val: Any):
         """Overrides method in the base class. This is necessary due to automatic
diff --git a/tiny/rna/counter/features.py b/tiny/rna/counter/features.py
index 42b0d1d4..14aad110 100644
--- a/tiny/rna/counter/features.py
+++ b/tiny/rna/counter/features.py
@@ -33,9 +33,9 @@ def __init__(self, references, selection_rules, **prefs):
         self.selector = FeatureSelector(selection_rules, self.stats, **prefs)
 
         if isinstance(references, ReferenceFeatures):
-            self.mode = "genomic"
+            self.mode = "by feature"
         elif isinstance(references, ReferenceSeqs):
-            self.mode = "non-genomic"
+            self.mode = "by sequence"
         else:
             raise TypeError("Expected ReferenceFeatures or ReferenceSeqs, got %s" % type(references))
 
diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py
index b8e4ba73..fdd6b6c5 100644
--- a/tiny/rna/counter/hts_parsing.py
+++ b/tiny/rna/counter/hts_parsing.py
@@ -772,7 +772,6 @@ def __init__(self, reference_seqs, **prefs):
         self.alias = {}
         self.tags = {}
 
-    @report_execution_time("Non-genomic annotations parsing")
     def get(self, selector):
         self.selector = selector
         match_tuples = self.get_matches()
@@ -785,7 +784,7 @@ def get(self, selector):
         return self.feats, aliases, self.tags
 
     def get_matches(self):
-        """Stage 1 selection is skipped in non-genomic counting.
+        """Stage 1 selection is skipped in sequence-based counting.
         Simply build match_tuples for all rules. These will be used
         uniformly in each reference sequence's feature_record_tuple"""
 
diff --git a/tiny/rna/counter/validation.py b/tiny/rna/counter/validation.py
index 349fb615..1605b62b 100644
--- a/tiny/rna/counter/validation.py
+++ b/tiny/rna/counter/validation.py
@@ -295,7 +295,7 @@ def build_column_filters(rules):
 
 
 class SamSqValidator:
-    """Validates @SQ headers for tiny-count's non-genomic counting mode"""
+    """Validates @SQ headers for tiny-count's sequence-based counting mode"""
 
     targets = {
         "inter sq": "Sequence identifiers with inconsistent lengths",
@@ -310,7 +310,6 @@ def __init__(self, sam_files):
         self.reference_seqs = {}
         self.sq_headers = {}
 
-    @report_execution_time("Non-genomic annotations validation")
     def validate(self):
         print("Validating sequence identifiers in SAM files...")
         self.read_sq_headers()
@@ -391,7 +390,7 @@ def generate_header_syntax_report(self, missing, incomplete):
             report['incomplete sq'] = sorted_natural(incomplete)
 
         header = "Every SAM file must have complete @SQ headers with SN and LN\n" \
-                 "fields when counting in non-genomic mode.\n"
+                 "fields when performing sequence-based counting.\n"
         self.report.add_error_section(header, report)
 
     def generate_identifier_report(self, duplicate, ambiguous):