From b05250f940ef2b499d3ce83088bffaf11afb1d4f Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Thu, 25 May 2023 13:19:09 -0700 Subject: [PATCH 1/3] Updating diagnostic alignment tables. - Additional columns: raw count, genomic hits, mismatches, and feature aliases - Renaming assigned features to feature hits - Feature hits column is left empy if no features were assigned (rather than NONE) - Feature hits and feature aliases are reported with formatting that should be easier to parse if the user wants to analyze the alignment table with their own scripts --- tiny/rna/counter/features.py | 2 +- tiny/rna/counter/statistics.py | 31 +++++++++++++++++++------------ 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/tiny/rna/counter/features.py b/tiny/rna/counter/features.py index 8eca57dc..bd32f353 100644 --- a/tiny/rna/counter/features.py +++ b/tiny/rna/counter/features.py @@ -31,7 +31,6 @@ def __init__(_, features: HTSeq.GenomicArrayOfSets, aliases: dict, classes: dict class FeatureCounter: def __init__(self, references, selection_rules, **prefs): - self.stats = LibraryStats(**prefs) self.alignment_reader = AlignmentReader(**prefs) self.selector = FeatureSelector(selection_rules, **prefs) @@ -43,6 +42,7 @@ def __init__(self, references, selection_rules, **prefs): raise TypeError("Expected ReferenceFeatures or ReferenceSeqs, got %s" % type(references)) Features(*references.get(self.selector)) + self.stats = LibraryStats(Features, **prefs) self.prefs = prefs def count_reads(self, library: dict): diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py index a1ec3902..0825b0f2 100644 --- a/tiny/rna/counter/statistics.py +++ b/tiny/rna/counter/statistics.py @@ -25,9 +25,9 @@ class LibraryStats: 'Reads Assigned to Single Feature', 'Sequences Assigned to Single Feature', 'Reads Assigned to Multiple Features', 'Sequences Assigned to Multiple Features'] - def __init__(self, **prefs): + def __init__(self, features, **prefs): self.library = {'Name': 'Unassigned', 'File': 'Unassigned', 'Norm': '1'} - self.diags = Diagnostics() if prefs.get('report_diags') else None + self.diags = Diagnostics(features) if prefs.get('report_diags') else None self.norm_gh = prefs.get('normalize_by_genomic_hits', True) self.norm_fh = prefs.get('normalize_by_feature_hits', True) @@ -53,6 +53,7 @@ def count_bundle(self, aln_bundle: iter, read_counts: int) -> dict: return { 'read_count': read_counts, 'corr_count': corr_counts, + 'loci_count': loci_counts, 'assigned_ftags': set(), 'assigned_reads': 0.0, 'unassigned_reads': 0.0, @@ -614,15 +615,17 @@ class Diagnostics: summary_categories = ['Eliminated counts', 'No feature counts', 'Uncounted alignments (+)', 'Uncounted alignments (-)'] - alignment_columns = ["Sequence", "Normalized Count", "Chrom", "Strand", - "Start", "End", "Candidates", "Assigned Features"] + alignment_columns = ["Sequence", "Raw Count", "Normalized Count", "Genomic Hits", + "Chrom", "Strand", "Start", "End", "Mismatches", + "Candidates", "Feature Hits", "Feature Aliases"] complement = bytes.maketrans(b'ACGTacgt', b'TGCAtgca') map_strand = {True: '+', False: '-', None: '.'} - def __init__(self): + def __init__(self, Features_obj): self.assignment_diags = {stat: 0 for stat in Diagnostics.summary_categories} self.selection_diags = defaultdict(Counter) + self.aliases = Features_obj.aliases self.alignments = [] def record_assignments(self, assignments, alignment, bundle, n_candidates): @@ -640,16 +643,20 @@ def record_alignment_details(self, assignments, aln, bundle, n_candidates): strand = self.map_strand[aln['Strand']] # Perform reverse complement for anti-sense reads - read = aln['Seq'] if strand == '+' \ + seq = aln['Seq'] if strand == '+' \ else aln['Seq'][::-1].translate(self.complement) - # Indicate classifier in parentheses if present. Report NONE if no assignments - feats = ';'.join(f"{feat_id}({tag})" if tag else feat_id - for feat_id, tag in assignments) \ - or "NONE" + # For easy parsing, report as: (id, classifier); ... + feature_hits = '; '.join(f"({fid}, {tag})" for fid, tag in assignments) + + # For easy parsing, report as: (alias1, alias2, ...); ... + feat_aliases = [', '.join(self.aliases.get(fid, '')) for fid, _ in assignments] + feat_aliases = '; '.join(f"({aliases})" for aliases in feat_aliases) + + counts = (bundle['corr_count'], bundle['read_count'], bundle['loci_count']) + pos = (aln['Chrom'], strand, aln['Start'], aln['End'], aln['NM']) + row = (seq, *counts, *pos, n_candidates, feature_hits, feat_aliases) - # sequence, cor_counts, chrom, strand, start, end, candidates, feat1;feat2;feat3 - row = (read, bundle['corr_count'], aln['Chrom'], strand, aln['Start'], aln['End'], n_candidates, feats) self.alignments.append(row) def record_summary_diagnostics(self, assignments, aln, bundle, n_candidates): From a16630bfb3cbe6c2ca403a04aec9ed7241943628 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Thu, 25 May 2023 13:48:37 -0700 Subject: [PATCH 2/3] Updating diagnostics documentation --- README.md | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 9bfdf6d0..94669a78 100644 --- a/README.md +++ b/README.md @@ -253,16 +253,20 @@ Diagnostic information can optionally be produced. If enabled, tiny-count will p The alignment tables (**... alignment_table.csv**) include the following information per-alignment: -| Column | Description | -|-------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Sequence | The read sequence, reverse complemented if antisense | -| Normalized Count | The reads available for assignment (the sequence's original read count normalized by genomic hits) | -| Chrom | The alignment's RNAME field | -| Strand | The alignment's strand | -| Start | The alignment's start coordinate | -| End | The alignment's end coordinate | -| Candidates | The number of features overlapping the alignment by at last one nucleotide | -| Assigned Features | Feature IDs of all features assigned to the alignment. `NONE` if no features were assigned. If the assigning rule has a classifier, it is included in parentheses. | +| Column | Description | +|------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Sequence | The read sequence, reverse complemented if antisense | +| Raw Count | The sequence's original read count | +| Normalized Count | The reads available for assignment to features (the sequence's original read count optionally normalized by genomic hits) | +| Genomic Hits | The number of alignments produced for the sequence | +| Chrom | The alignment's RNAME field | +| Strand | The alignment's strand | +| Start | The alignment's start coordinate | +| End | The alignment's end coordinate | +| Mismatches | The alignment's reported mismatches between the query sequence and the reference | +| Candidates | The number of features overlapping the alignment by at last one nucleotide | +| Feature Hits | The feature ID and assigning rule's classifier for all features assigned to the alignment, formatted as `(feature_id, classifier); (...)`. If the match was made by an unclassified rule then classifier is left empty. If no features were assigned the cell is left blank. | +| Feature Aliases | The user-defined aliases for all features assigned to the alignment, formatted as `(alias1, alias2, ...); (...)` where the index of each alias grouping matches the index of the corresponding feature in the Feature Hits column. If a feature has no aliases then `()` is reported. | The unassigned counts table (**assignment_diags.csv**) includes the following, with a column per library: From a6585d8ff29a9b05e2b2d7bcbebddff5eb6ee669 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Thu, 25 May 2023 13:52:19 -0700 Subject: [PATCH 3/3] Column order correction in alignment tables --- tiny/rna/counter/statistics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py index 0825b0f2..aaa342dd 100644 --- a/tiny/rna/counter/statistics.py +++ b/tiny/rna/counter/statistics.py @@ -653,7 +653,7 @@ def record_alignment_details(self, assignments, aln, bundle, n_candidates): feat_aliases = [', '.join(self.aliases.get(fid, '')) for fid, _ in assignments] feat_aliases = '; '.join(f"({aliases})" for aliases in feat_aliases) - counts = (bundle['corr_count'], bundle['read_count'], bundle['loci_count']) + counts = (bundle['read_count'], bundle['corr_count'], bundle['loci_count']) pos = (aln['Chrom'], strand, aln['Start'], aln['End'], aln['NM']) row = (seq, *counts, *pos, n_candidates, feature_hits, feat_aliases)