MontgomeryLab · taimontgomery · May 26, 2023 · May 25, 2023 · May 25, 2023 · May 25, 2023
diff --git a/README.md b/README.md
@@ -253,16 +253,20 @@ Diagnostic information can optionally be produced. If enabled, tiny-count will p
 
 The alignment tables (**... alignment_table.csv**) include the following information per-alignment:
 
-| Column            | Description                                                                                                                                                        |
-|-------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| Sequence          | The read sequence, reverse complemented if antisense                                                                                                               |
-| Normalized Count  | The reads available for assignment (the sequence's original read count normalized by genomic hits)                                                                 |
-| Chrom             | The alignment's RNAME field                                                                                                                                        |
-| Strand            | The alignment's strand                                                                                                                                             |
-| Start             | The alignment's start coordinate                                                                                                                                   |
-| End               | The alignment's end coordinate                                                                                                                                     |
-| Candidates        | The number of features overlapping the alignment by at last one nucleotide                                                                                         |
-| Assigned Features | Feature IDs of all features assigned to the alignment. `NONE` if no features were assigned. If the assigning rule has a classifier, it is included in parentheses. |
+| Column           | Description                                                                                                                                                                                                                                                                           |
+|------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| Sequence         | The read sequence, reverse complemented if antisense                                                                                                                                                                                                                                  |
+| Raw Count        | The sequence's original read count                                                                                                                                                                                                                                                    |
+| Normalized Count | The reads available for assignment to features (the sequence's original read count optionally normalized by genomic hits)                                                                                                                                                             |
+| Genomic Hits     | The number of alignments produced for the sequence                                                                                                                                                                                                                                    |
+| Chrom            | The alignment's RNAME field                                                                                                                                                                                                                                                           |
+| Strand           | The alignment's strand                                                                                                                                                                                                                                                                |
+| Start            | The alignment's start coordinate                                                                                                                                                                                                                                                      |
+| End              | The alignment's end coordinate                                                                                                                                                                                                                                                        |
+| Mismatches       | The alignment's reported mismatches between the query sequence and the reference                                                                                                                                                                                                      |
+| Candidates       | The number of features overlapping the alignment by at last one nucleotide                                                                                                                                                                                                            |
+| Feature Hits     | The feature ID and assigning rule's classifier for all features assigned to the alignment, formatted as `(feature_id, classifier); (...)`. If the match was made by an unclassified rule then classifier is left empty. If no features were assigned the cell is left blank.          |
+| Feature Aliases  | The user-defined aliases for all features assigned to the alignment, formatted as `(alias1, alias2, ...); (...)` where the index of each alias grouping matches the index of the corresponding feature in the Feature Hits column. If a feature has no aliases then `()` is reported. |
 
 
 The unassigned counts table (**assignment_diags.csv**) includes the following, with a column per library:

diff --git a/tiny/rna/counter/features.py b/tiny/rna/counter/features.py
@@ -31,7 +31,6 @@ def __init__(_, features: HTSeq.GenomicArrayOfSets, aliases: dict, classes: dict
 class FeatureCounter:
 
     def __init__(self, references, selection_rules, **prefs):
-        self.stats = LibraryStats(**prefs)
         self.alignment_reader = AlignmentReader(**prefs)
         self.selector = FeatureSelector(selection_rules, **prefs)
 
@@ -43,6 +42,7 @@ def __init__(self, references, selection_rules, **prefs):
             raise TypeError("Expected ReferenceFeatures or ReferenceSeqs, got %s" % type(references))
 
         Features(*references.get(self.selector))
+        self.stats = LibraryStats(Features, **prefs)
         self.prefs = prefs
 
     def count_reads(self, library: dict):

diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py
@@ -25,9 +25,9 @@ class LibraryStats:
                           'Reads Assigned to Single Feature', 'Sequences Assigned to Single Feature',
                           'Reads Assigned to Multiple Features', 'Sequences Assigned to Multiple Features']
 
-    def __init__(self, **prefs):
+    def __init__(self, features, **prefs):
         self.library = {'Name': 'Unassigned', 'File': 'Unassigned', 'Norm': '1'}
-        self.diags = Diagnostics() if prefs.get('report_diags') else None
+        self.diags = Diagnostics(features) if prefs.get('report_diags') else None
         self.norm_gh = prefs.get('normalize_by_genomic_hits', True)
         self.norm_fh = prefs.get('normalize_by_feature_hits', True)
 
@@ -53,6 +53,7 @@ def count_bundle(self, aln_bundle: iter, read_counts: int) -> dict:
         return {
             'read_count': read_counts,
             'corr_count': corr_counts,
+            'loci_count': loci_counts,
             'assigned_ftags': set(),
             'assigned_reads': 0.0,
             'unassigned_reads': 0.0,
@@ -614,15 +615,17 @@ class Diagnostics:
     summary_categories = ['Eliminated counts', 'No feature counts',
                           'Uncounted alignments (+)', 'Uncounted alignments (-)']
 
-    alignment_columns =  ["Sequence", "Normalized Count", "Chrom", "Strand",
-                          "Start", "End", "Candidates", "Assigned Features"]
+    alignment_columns =  ["Sequence", "Raw Count", "Normalized Count", "Genomic Hits",
+                          "Chrom", "Strand", "Start", "End", "Mismatches",
+                          "Candidates", "Feature Hits", "Feature Aliases"]
 
     complement = bytes.maketrans(b'ACGTacgt', b'TGCAtgca')
     map_strand = {True: '+', False: '-', None: '.'}
 
-    def __init__(self):
+    def __init__(self, Features_obj):
         self.assignment_diags = {stat: 0 for stat in Diagnostics.summary_categories}
         self.selection_diags = defaultdict(Counter)
+        self.aliases = Features_obj.aliases
         self.alignments = []
 
     def record_assignments(self, assignments, alignment, bundle, n_candidates):
@@ -640,16 +643,20 @@ def record_alignment_details(self, assignments, aln, bundle, n_candidates):
         strand = self.map_strand[aln['Strand']]
 
         # Perform reverse complement for anti-sense reads
-        read = aln['Seq'] if strand == '+' \
+        seq = aln['Seq'] if strand == '+' \
             else aln['Seq'][::-1].translate(self.complement)
 
-        # Indicate classifier in parentheses if present. Report NONE if no assignments
-        feats = ';'.join(f"{feat_id}({tag})" if tag else feat_id
-                         for feat_id, tag in assignments) \
-                or "NONE"
+        # For easy parsing, report as: (id, classifier); ...
+        feature_hits = '; '.join(f"({fid}, {tag})" for fid, tag in assignments)
+
+        # For easy parsing, report as: (alias1, alias2, ...); ...
+        feat_aliases = [', '.join(self.aliases.get(fid, '')) for fid, _ in assignments]
+        feat_aliases = '; '.join(f"({aliases})" for aliases in feat_aliases)
+
+        counts = (bundle['read_count'], bundle['corr_count'], bundle['loci_count'])
+        pos = (aln['Chrom'], strand, aln['Start'], aln['End'], aln['NM'])
+        row = (seq, *counts, *pos, n_candidates, feature_hits, feat_aliases)
 
-        # sequence, cor_counts, chrom, strand, start, end, candidates, feat1;feat2;feat3
-        row = (read, bundle['corr_count'], aln['Chrom'], strand, aln['Start'], aln['End'], n_candidates, feats)
         self.alignments.append(row)
 
     def record_summary_diagnostics(self, assignments, aln, bundle, n_candidates):