From b05250f940ef2b499d3ce83088bffaf11afb1d4f Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Thu, 25 May 2023 13:19:09 -0700
Subject: [PATCH 1/3] Updating diagnostic alignment tables. - Additional
 columns: raw count, genomic hits, mismatches, and feature aliases - Renaming
 assigned features to feature hits - Feature hits column is left empy if no
 features were assigned (rather than NONE) - Feature hits and feature aliases
 are reported with formatting that should be easier to parse if the user wants
 to analyze the alignment table with their own scripts

---
 tiny/rna/counter/features.py   |  2 +-
 tiny/rna/counter/statistics.py | 31 +++++++++++++++++++------------
 2 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/tiny/rna/counter/features.py b/tiny/rna/counter/features.py
index 8eca57dc..bd32f353 100644
--- a/tiny/rna/counter/features.py
+++ b/tiny/rna/counter/features.py
@@ -31,7 +31,6 @@ def __init__(_, features: HTSeq.GenomicArrayOfSets, aliases: dict, classes: dict
 class FeatureCounter:
 
     def __init__(self, references, selection_rules, **prefs):
-        self.stats = LibraryStats(**prefs)
         self.alignment_reader = AlignmentReader(**prefs)
         self.selector = FeatureSelector(selection_rules, **prefs)
 
@@ -43,6 +42,7 @@ def __init__(self, references, selection_rules, **prefs):
             raise TypeError("Expected ReferenceFeatures or ReferenceSeqs, got %s" % type(references))
 
         Features(*references.get(self.selector))
+        self.stats = LibraryStats(Features, **prefs)
         self.prefs = prefs
 
     def count_reads(self, library: dict):
diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py
index a1ec3902..0825b0f2 100644
--- a/tiny/rna/counter/statistics.py
+++ b/tiny/rna/counter/statistics.py
@@ -25,9 +25,9 @@ class LibraryStats:
                           'Reads Assigned to Single Feature', 'Sequences Assigned to Single Feature',
                           'Reads Assigned to Multiple Features', 'Sequences Assigned to Multiple Features']
 
-    def __init__(self, **prefs):
+    def __init__(self, features, **prefs):
         self.library = {'Name': 'Unassigned', 'File': 'Unassigned', 'Norm': '1'}
-        self.diags = Diagnostics() if prefs.get('report_diags') else None
+        self.diags = Diagnostics(features) if prefs.get('report_diags') else None
         self.norm_gh = prefs.get('normalize_by_genomic_hits', True)
         self.norm_fh = prefs.get('normalize_by_feature_hits', True)
 
@@ -53,6 +53,7 @@ def count_bundle(self, aln_bundle: iter, read_counts: int) -> dict:
         return {
             'read_count': read_counts,
             'corr_count': corr_counts,
+            'loci_count': loci_counts,
             'assigned_ftags': set(),
             'assigned_reads': 0.0,
             'unassigned_reads': 0.0,
@@ -614,15 +615,17 @@ class Diagnostics:
     summary_categories = ['Eliminated counts', 'No feature counts',
                           'Uncounted alignments (+)', 'Uncounted alignments (-)']
 
-    alignment_columns =  ["Sequence", "Normalized Count", "Chrom", "Strand",
-                          "Start", "End", "Candidates", "Assigned Features"]
+    alignment_columns =  ["Sequence", "Raw Count", "Normalized Count", "Genomic Hits",
+                          "Chrom", "Strand", "Start", "End", "Mismatches",
+                          "Candidates", "Feature Hits", "Feature Aliases"]
 
     complement = bytes.maketrans(b'ACGTacgt', b'TGCAtgca')
     map_strand = {True: '+', False: '-', None: '.'}
 
-    def __init__(self):
+    def __init__(self, Features_obj):
         self.assignment_diags = {stat: 0 for stat in Diagnostics.summary_categories}
         self.selection_diags = defaultdict(Counter)
+        self.aliases = Features_obj.aliases
         self.alignments = []
 
     def record_assignments(self, assignments, alignment, bundle, n_candidates):
@@ -640,16 +643,20 @@ def record_alignment_details(self, assignments, aln, bundle, n_candidates):
         strand = self.map_strand[aln['Strand']]
 
         # Perform reverse complement for anti-sense reads
-        read = aln['Seq'] if strand == '+' \
+        seq = aln['Seq'] if strand == '+' \
             else aln['Seq'][::-1].translate(self.complement)
 
-        # Indicate classifier in parentheses if present. Report NONE if no assignments
-        feats = ';'.join(f"{feat_id}({tag})" if tag else feat_id
-                         for feat_id, tag in assignments) \
-                or "NONE"
+        # For easy parsing, report as: (id, classifier); ...
+        feature_hits = '; '.join(f"({fid}, {tag})" for fid, tag in assignments)
+
+        # For easy parsing, report as: (alias1, alias2, ...); ...
+        feat_aliases = [', '.join(self.aliases.get(fid, '')) for fid, _ in assignments]
+        feat_aliases = '; '.join(f"({aliases})" for aliases in feat_aliases)
+
+        counts = (bundle['corr_count'], bundle['read_count'], bundle['loci_count'])
+        pos = (aln['Chrom'], strand, aln['Start'], aln['End'], aln['NM'])
+        row = (seq, *counts, *pos, n_candidates, feature_hits, feat_aliases)
 
-        # sequence, cor_counts, chrom, strand, start, end, candidates, feat1;feat2;feat3
-        row = (read, bundle['corr_count'], aln['Chrom'], strand, aln['Start'], aln['End'], n_candidates, feats)
         self.alignments.append(row)
 
     def record_summary_diagnostics(self, assignments, aln, bundle, n_candidates):

From a16630bfb3cbe6c2ca403a04aec9ed7241943628 Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Thu, 25 May 2023 13:48:37 -0700
Subject: [PATCH 2/3] Updating diagnostics documentation

---
 README.md | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 9bfdf6d0..94669a78 100644
--- a/README.md
+++ b/README.md
@@ -253,16 +253,20 @@ Diagnostic information can optionally be produced. If enabled, tiny-count will p
 
 The alignment tables (**... alignment_table.csv**) include the following information per-alignment:
 
-| Column            | Description                                                                                                                                                        |
-|-------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| Sequence          | The read sequence, reverse complemented if antisense                                                                                                               |
-| Normalized Count  | The reads available for assignment (the sequence's original read count normalized by genomic hits)                                                                 |
-| Chrom             | The alignment's RNAME field                                                                                                                                        |
-| Strand            | The alignment's strand                                                                                                                                             |
-| Start             | The alignment's start coordinate                                                                                                                                   |
-| End               | The alignment's end coordinate                                                                                                                                     |
-| Candidates        | The number of features overlapping the alignment by at last one nucleotide                                                                                         |
-| Assigned Features | Feature IDs of all features assigned to the alignment. `NONE` if no features were assigned. If the assigning rule has a classifier, it is included in parentheses. |
+| Column           | Description                                                                                                                                                                                                                                                                           |
+|------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| Sequence         | The read sequence, reverse complemented if antisense                                                                                                                                                                                                                                  |
+| Raw Count        | The sequence's original read count                                                                                                                                                                                                                                                    |
+| Normalized Count | The reads available for assignment to features (the sequence's original read count optionally normalized by genomic hits)                                                                                                                                                             |
+| Genomic Hits     | The number of alignments produced for the sequence                                                                                                                                                                                                                                    |
+| Chrom            | The alignment's RNAME field                                                                                                                                                                                                                                                           |
+| Strand           | The alignment's strand                                                                                                                                                                                                                                                                |
+| Start            | The alignment's start coordinate                                                                                                                                                                                                                                                      |
+| End              | The alignment's end coordinate                                                                                                                                                                                                                                                        |
+| Mismatches       | The alignment's reported mismatches between the query sequence and the reference                                                                                                                                                                                                      |
+| Candidates       | The number of features overlapping the alignment by at last one nucleotide                                                                                                                                                                                                            |
+| Feature Hits     | The feature ID and assigning rule's classifier for all features assigned to the alignment, formatted as `(feature_id, classifier); (...)`. If the match was made by an unclassified rule then classifier is left empty. If no features were assigned the cell is left blank.          |
+| Feature Aliases  | The user-defined aliases for all features assigned to the alignment, formatted as `(alias1, alias2, ...); (...)` where the index of each alias grouping matches the index of the corresponding feature in the Feature Hits column. If a feature has no aliases then `()` is reported. |
 
 
 The unassigned counts table (**assignment_diags.csv**) includes the following, with a column per library:

From a6585d8ff29a9b05e2b2d7bcbebddff5eb6ee669 Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Thu, 25 May 2023 13:52:19 -0700
Subject: [PATCH 3/3] Column order correction in alignment tables

---
 tiny/rna/counter/statistics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py
index 0825b0f2..aaa342dd 100644
--- a/tiny/rna/counter/statistics.py
+++ b/tiny/rna/counter/statistics.py
@@ -653,7 +653,7 @@ def record_alignment_details(self, assignments, aln, bundle, n_candidates):
         feat_aliases = [', '.join(self.aliases.get(fid, '')) for fid, _ in assignments]
         feat_aliases = '; '.join(f"({aliases})" for aliases in feat_aliases)
 
-        counts = (bundle['corr_count'], bundle['read_count'], bundle['loci_count'])
+        counts = (bundle['read_count'], bundle['corr_count'], bundle['loci_count'])
         pos = (aln['Chrom'], strand, aln['Start'], aln['End'], aln['NM'])
         row = (seq, *counts, *pos, n_candidates, feature_hits, feat_aliases)