From 03d6cecd57ff5629860b69d4d99bf7654393edae Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Tue, 4 Apr 2023 18:20:18 -0700
Subject: [PATCH 1/3] Major updates to diagnostics classes. -
 Classifier-related bugs have been fixed - Alignment tables are now produced
 as CSVs rather than TSVs - Alignment tables now include the alignment's
 chromosome as well as the number of feature candidates that were available
 for Stage 2 selection

---
 tiny/rna/counter/statistics.py | 70 ++++++++++++++++++++--------------
 1 file changed, 42 insertions(+), 28 deletions(-)

diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py
index a3ebe0f9..59090232 100644
--- a/tiny/rna/counter/statistics.py
+++ b/tiny/rna/counter/statistics.py
@@ -83,8 +83,7 @@ def count_bundle_assignments(self, bundle: dict, aln: dict, assignments: dict, n
                     self.rule_counts[rule] += rcorr_count
 
         if self.diags is not None:
-            self.diags.record_diagnostics(assignments.keys(), n_candidates, aln, bundle)
-            self.diags.record_alignment_details(aln, bundle, assignments.keys())
+            self.diags.record_assignments(assignments.keys(), aln, bundle, n_candidates)
 
     def finalize_bundle(self, bundle: dict) -> None:
         """Called at the conclusion of processing each multiple-alignment bundle"""
@@ -464,75 +463,90 @@ def get_mapped_reads(other: LibraryStats):
 
 class Diagnostics:
 
-    aln_diag_categories = ['Eliminated counts', 'No feature counts',
-                           'Uncounted alignments (+)', 'Uncounted alignments (-)']
+    summary_categories = ['Eliminated counts', 'No feature counts',
+                          'Uncounted alignments (+)', 'Uncounted alignments (-)']
+
+    alignment_columns =  ["Sequence", "Normalized Count", "Chrom", "Strand",
+                          "Start", "End", "Candidates", "Assigned Features"]
 
     complement = bytes.maketrans(b'ACGTacgt', b'TGCAtgca')
+    map_strand = {True: '+', False: '-', None: '.'}
 
     def __init__(self, out_prefix: str):
         self.prefix = out_prefix
-        self.alignment_diags = {stat: 0 for stat in Diagnostics.aln_diag_categories}
+        self.assignment_diags = {stat: 0 for stat in Diagnostics.summary_categories}
         self.selection_diags = defaultdict(Counter)
         self.alignments = []
 
-    def record_alignment_details(self, aln, bundle, assignments):
+    def record_assignments(self, assignments, alignment, bundle, n_candidates):
+        self.record_alignment_details(assignments, alignment, bundle, n_candidates)
+        self.record_summary_diagnostics(assignments, alignment, bundle, n_candidates)
+
+    def record_alignment_details(self, assignments, aln, bundle, n_candidates):
         """Record detailed alignment info if user elects to save diagnostics info with the run
 
         This is called once per locus per read (every alignment) when the user elects to save
         diagnostics. The recorded information is later written to {library['Name']}_aln_table.txt
         after the entire SAM file has been processed."""
 
+        # Map internal strand representation to +/-/.
+        strand = self.map_strand[aln['Strand']]
+
         # Perform reverse complement for anti-sense reads
-        read = aln['Seq'] \
-            if aln['Strand'] is True \
+        read = aln['Seq'] if strand == '+' \
             else aln['Seq'][::-1].translate(self.complement)
 
-        # sequence, cor_counts, strand, start, end, feat1;feat2;feat3
-        self.alignments.append((read, bundle['corr_count'], aln['Strand'], aln['Start'], aln['End'],
-                                ';'.join(assignments)))
+        # Indicate classifier in parentheses if present. Report NONE if no assignments
+        feats = ';'.join(f"{feat_id}({tag})" if tag else feat_id
+                         for feat_id, tag in assignments) \
+                or "NONE"
+
+        # sequence, cor_counts, chrom, strand, start, end, candidates, feat1;feat2;feat3
+        row = (read, bundle['corr_count'], aln['Chrom'], strand, aln['Start'], aln['End'], n_candidates, feats)
+        self.alignments.append(row)
 
-    def record_diagnostics(self, assignments, n_candidates, aln, bundle):
+    def record_summary_diagnostics(self, assignments, aln, bundle, n_candidates):
         """Records basic diagnostic info"""
 
         if len(assignments) == 0:
             if aln['Strand'] is True:
-                self.alignment_diags['Uncounted alignments (+)'] += 1
+                self.assignment_diags['Uncounted alignments (+)'] += 1
             else:
-                self.alignment_diags['Uncounted alignments (-)'] += 1
+                self.assignment_diags['Uncounted alignments (-)'] += 1
             if n_candidates == 0:
-                self.alignment_diags['No feature counts'] += bundle['corr_count']
+                self.assignment_diags['No feature counts'] += bundle['corr_count']
             else:
-                self.alignment_diags['Eliminated counts'] += bundle['corr_count']
+                self.assignment_diags['Eliminated counts'] += bundle['corr_count']
 
 
 class MergedDiags(MergedStat):
     def __init__(self):
-        self.aln_diags = pd.DataFrame(columns=Diagnostics.aln_diag_categories)
+        self.assignment_diags = pd.DataFrame(index=Diagnostics.summary_categories)
         self.selection_diags = {}
         self.alignment_tables = {}
 
     def add_library(self, other: LibraryStats):
         other_lib = other.library['Name']
         self.alignment_tables[other_lib] = other.diags.alignments
-        self.selection_diags[other_lib] = other.diags.selection_diags
-        self.aln_diags.loc[other_lib] = other.diags.alignment_diags
+        # self.selection_diags[other_lib] = other.diags.selection_diags  Not currently collected
+        self.assignment_diags[other_lib] = self.assignment_diags.index.map(other.diags.assignment_diags)
 
     def write_output_logfile(self):
-        self.write_alignment_diags()
+        self.write_assignment_diags()
         # self.write_selection_diags()  Not currently collected
         self.write_alignment_tables()
 
-    def write_alignment_diags(self):
-        MergedStat.df_to_csv(self.aln_diags, "Sample", self.prefix, "alignment_diags", sort_axis="index")
+    def write_assignment_diags(self):
+        MergedStat.df_to_csv(self.assignment_diags, "Assignment Diags", self.prefix, "assignment_diags")
 
     def write_alignment_tables(self):
+        header = Diagnostics.alignment_columns
         for library_name, table in self.alignment_tables.items():
-            outfile = make_filename([self.prefix, library_name, 'aln_table'], ext='.txt')
-            with open(outfile, 'w') as imf:
-                imf.writelines(
-                    # sequence, cor_counts, strand, start, end, feat1a/feat1b;feat2;...
-                    map(lambda rec: "%s\t%f\t%c\t%d\t%d\t%s\n" % rec, table)
-                )
+            outfile = make_filename([self.prefix, library_name, 'alignment_table'], ext='.csv')
+            with open(outfile, 'w') as ao:
+                csv_writer = csv.writer(ao)
+                csv_writer.writerow(header)
+                csv_writer.writerows(table)
 
     def write_selection_diags(self):
         out = []

From 15f9fe6cd9eef4e2009581376f73b43f56e368b0 Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Tue, 4 Apr 2023 18:21:04 -0700
Subject: [PATCH 2/3] CWL updates for the new diagnostic output filenames

---
 tiny/cwl/tools/tiny-count.cwl     | 8 ++++----
 tiny/cwl/workflows/tinyrna_wf.cwl | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/tiny/cwl/tools/tiny-count.cwl b/tiny/cwl/tools/tiny-count.cwl
index 0541e695..7f9521a9 100644
--- a/tiny/cwl/tools/tiny-count.cwl
+++ b/tiny/cwl/tools/tiny-count.cwl
@@ -125,15 +125,15 @@ outputs:
     outputBinding:
       glob: "*_decollapsed.sam"
 
-  intermed_out_files:
+  alignment_tables:
     type: File[]?
     outputBinding:
-      glob: "*_aln_table.txt"
+      glob: "*_alignment_table.csv"
 
-  alignment_diags:
+  assignment_diags:
     type: File?
     outputBinding:
-      glob: $(inputs.out_prefix)_alignment_diags.csv
+      glob: $(inputs.out_prefix)_assignment_diags.csv
 
   selection_diags:
     type: File?
diff --git a/tiny/cwl/workflows/tinyrna_wf.cwl b/tiny/cwl/workflows/tinyrna_wf.cwl
index ebe73510..8a3f9f27 100644
--- a/tiny/cwl/workflows/tinyrna_wf.cwl
+++ b/tiny/cwl/workflows/tinyrna_wf.cwl
@@ -224,8 +224,8 @@ steps:
       fastp_logs: preprocessing/json_report_file
       collapsed_fa: preprocessing/uniq_seqs
     out: [ feature_counts, rule_counts, norm_counts, mapped_nt_len_dist, assigned_nt_len_dist,
-           alignment_stats, summary_stats, decollapsed_sams, intermed_out_files,
-           alignment_diags, selection_diags ]
+           alignment_stats, summary_stats, decollapsed_sams, alignment_tables,
+           assignment_diags, selection_diags ]
 
   tiny-deseq:
     run: ../tools/tiny-deseq.cwl
@@ -321,7 +321,7 @@ steps:
         source: [ tiny-count/feature_counts, tiny-count/rule_counts, tiny-count/norm_counts,
                   tiny-count/mapped_nt_len_dist, tiny-count/assigned_nt_len_dist,
                   tiny-count/alignment_stats, tiny-count/summary_stats, tiny-count/decollapsed_sams,
-                  tiny-count/alignment_diags, tiny-count/selection_diags, tiny-count/intermed_out_files,
+                  tiny-count/assignment_diags, tiny-count/selection_diags, tiny-count/alignment_tables,
                   features_csv ]
       dir_name: dir_name_tiny-count
     out: [ subdir ]

From 73b7f47ab9bd5720d6913dd779c8b51fd267a3eb Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Tue, 4 Apr 2023 18:22:06 -0700
Subject: [PATCH 3/3] Documentation updates for diagnostic outputs

---
 README.md                                     | 21 +++++++++++++------
 START_HERE/run_config.yml                     |  2 +-
 .../config_files/run_config_template.yml      |  2 +-
 tiny/templates/run_config_template.yml        |  2 +-
 4 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 7e375f3b..887e5370 100644
--- a/README.md
+++ b/README.md
@@ -239,14 +239,23 @@ Two tables are produced:
 
 #### Diagnostics
 
-Diagnostic information will include intermediate alignment files for each library and an additional stats table with information about counts that were not assigned to a feature. Intermediate alignment files include the following information about each alignment, regardless of feature assignment status:
+Diagnostic information can optionally be produced. If enabled, tiny-count will provide a table for each library which contains feature assignment info on a per-alignment basis, along with a single table that summarizes unassigned counts.
 
-- The alignment's SEQ field, reverse complemented for the - strand
-- The sequence's count normalized by its multi-alignment locus count
-- Feature IDs of all features assigned to the alignment
-- Strand, start, and end position
+The alignment tables (**... alignment_table.csv**) include the following information per-alignment:
 
-The unassigned counts table includes the following, with a column per library:
+| Column            | Description                                                                                                                                                        |
+|-------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| Sequence          | The read sequence, reverse complemented if antisense                                                                                                               |
+| Normalized Count  | The reads available for assignment (the sequence's original read count normalized by genomic hits)                                                                 |
+| Chrom             | The alignment's RNAME field                                                                                                                                        |
+| Strand            | The alignment's strand                                                                                                                                             |
+| Start             | The alignment's start coordinate                                                                                                                                   |
+| End               | The alignment's end coordinate                                                                                                                                     |
+| Candidates        | The number of features overlapping the alignment by at last one nucleotide                                                                                         |
+| Assigned Features | Feature IDs of all features assigned to the alignment. `NONE` if no features were assigned. If the assigning rule has a classifier, it is included in parentheses. |
+
+
+The unassigned counts table (**assignment_diags.csv**) includes the following, with a column per library:
 
 | Stat                              | Description                                                                                                                                   |
 |-----------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------|
diff --git a/START_HERE/run_config.yml b/START_HERE/run_config.yml
index e5fd45d2..85e6c23a 100644
--- a/START_HERE/run_config.yml
+++ b/START_HERE/run_config.yml
@@ -235,7 +235,7 @@ counter_decollapse: False
 ##-- Select the StepVector implementation that is used. Options: HTSeq or Cython --##
 counter_stepvector: 'Cython'
 
-##-- If True: produce diagnostic logs to indicate what was eliminated and why --##
+##-- If True: produce alignment tables and assignment diagnostics for uncounted reads --##
 counter_diags: False
 
 
diff --git a/tests/testdata/config_files/run_config_template.yml b/tests/testdata/config_files/run_config_template.yml
index cc303f0f..56d86210 100644
--- a/tests/testdata/config_files/run_config_template.yml
+++ b/tests/testdata/config_files/run_config_template.yml
@@ -235,7 +235,7 @@ counter_decollapse: False
 ##-- Select the StepVector implementation that is used. Options: HTSeq or Cython --##
 counter_stepvector: 'Cython'
 
-##-- If True: produce diagnostic logs to indicate what was eliminated and why --##
+##-- If True: produce alignment tables and assignment diagnostics for uncounted reads --##
 counter_diags: False
 
 
diff --git a/tiny/templates/run_config_template.yml b/tiny/templates/run_config_template.yml
index 00328a4b..90bc5a28 100644
--- a/tiny/templates/run_config_template.yml
+++ b/tiny/templates/run_config_template.yml
@@ -235,7 +235,7 @@ counter_decollapse: False
 ##-- Select the StepVector implementation that is used. Options: HTSeq or Cython --##
 counter_stepvector: 'Cython'
 
-##-- If True: produce diagnostic logs to indicate what was eliminated and why --##
+##-- If True: produce alignment tables and assignment diagnostics for uncounted reads --##
 counter_diags: False