Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 14 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -253,16 +253,20 @@ Diagnostic information can optionally be produced. If enabled, tiny-count will p

The alignment tables (**... alignment_table.csv**) include the following information per-alignment:

| Column | Description |
|-------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Sequence | The read sequence, reverse complemented if antisense |
| Normalized Count | The reads available for assignment (the sequence's original read count normalized by genomic hits) |
| Chrom | The alignment's RNAME field |
| Strand | The alignment's strand |
| Start | The alignment's start coordinate |
| End | The alignment's end coordinate |
| Candidates | The number of features overlapping the alignment by at least one nucleotide |
| Assigned Features | Feature IDs of all features assigned to the alignment. `NONE` if no features were assigned. If the assigning rule has a classifier, it is included in parentheses. |
| Column | Description |
|------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Sequence | The read sequence, reverse complemented if antisense |
| Raw Count | The sequence's original read count |
| Normalized Count | The reads available for assignment to features (the sequence's original read count optionally normalized by genomic hits) |
| Genomic Hits | The number of alignments produced for the sequence |
| Chrom | The alignment's RNAME field |
| Strand | The alignment's strand |
| Start | The alignment's start coordinate |
| End | The alignment's end coordinate |
| Mismatches | The alignment's reported mismatches between the query sequence and the reference |
| Candidates | The number of features overlapping the alignment by at least one nucleotide |
| Feature Hits | The feature ID and assigning rule's classifier for all features assigned to the alignment, formatted as `(feature_id, classifier); (...)`. If the match was made by an unclassified rule then classifier is left empty. If no features were assigned the cell is left blank. |
| Feature Aliases | The user-defined aliases for all features assigned to the alignment, formatted as `(alias1, alias2, ...); (...)` where the index of each alias grouping matches the index of the corresponding feature in the Feature Hits column. If a feature has no aliases then `()` is reported. |


The unassigned counts table (**assignment_diags.csv**) includes the following, with a column per library:
Expand Down
2 changes: 1 addition & 1 deletion tiny/rna/counter/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ def __init__(_, features: HTSeq.GenomicArrayOfSets, aliases: dict, classes: dict
class FeatureCounter:

def __init__(self, references, selection_rules, **prefs):
self.stats = LibraryStats(**prefs)
self.alignment_reader = AlignmentReader(**prefs)
self.selector = FeatureSelector(selection_rules, **prefs)

Expand All @@ -43,6 +42,7 @@ def __init__(self, references, selection_rules, **prefs):
raise TypeError("Expected ReferenceFeatures or ReferenceSeqs, got %s" % type(references))

Features(*references.get(self.selector))
self.stats = LibraryStats(Features, **prefs)
self.prefs = prefs

def count_reads(self, library: dict):
Expand Down
31 changes: 19 additions & 12 deletions tiny/rna/counter/statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,9 @@ class LibraryStats:
'Reads Assigned to Single Feature', 'Sequences Assigned to Single Feature',
'Reads Assigned to Multiple Features', 'Sequences Assigned to Multiple Features']

def __init__(self, **prefs):
def __init__(self, features, **prefs):
self.library = {'Name': 'Unassigned', 'File': 'Unassigned', 'Norm': '1'}
self.diags = Diagnostics() if prefs.get('report_diags') else None
self.diags = Diagnostics(features) if prefs.get('report_diags') else None
self.norm_gh = prefs.get('normalize_by_genomic_hits', True)
self.norm_fh = prefs.get('normalize_by_feature_hits', True)

Expand All @@ -53,6 +53,7 @@ def count_bundle(self, aln_bundle: iter, read_counts: int) -> dict:
return {
'read_count': read_counts,
'corr_count': corr_counts,
'loci_count': loci_counts,
'assigned_ftags': set(),
'assigned_reads': 0.0,
'unassigned_reads': 0.0,
Expand Down Expand Up @@ -614,15 +615,17 @@ class Diagnostics:
summary_categories = ['Eliminated counts', 'No feature counts',
'Uncounted alignments (+)', 'Uncounted alignments (-)']

alignment_columns = ["Sequence", "Normalized Count", "Chrom", "Strand",
"Start", "End", "Candidates", "Assigned Features"]
alignment_columns = ["Sequence", "Raw Count", "Normalized Count", "Genomic Hits",
"Chrom", "Strand", "Start", "End", "Mismatches",
"Candidates", "Feature Hits", "Feature Aliases"]

complement = bytes.maketrans(b'ACGTacgt', b'TGCAtgca')
map_strand = {True: '+', False: '-', None: '.'}

def __init__(self):
def __init__(self, Features_obj):
self.assignment_diags = {stat: 0 for stat in Diagnostics.summary_categories}
self.selection_diags = defaultdict(Counter)
self.aliases = Features_obj.aliases
self.alignments = []

def record_assignments(self, assignments, alignment, bundle, n_candidates):
Expand All @@ -640,16 +643,20 @@ def record_alignment_details(self, assignments, aln, bundle, n_candidates):
strand = self.map_strand[aln['Strand']]

# Perform reverse complement for anti-sense reads
read = aln['Seq'] if strand == '+' \
seq = aln['Seq'] if strand == '+' \
else aln['Seq'][::-1].translate(self.complement)

# Indicate classifier in parentheses if present. Report NONE if no assignments
feats = ';'.join(f"{feat_id}({tag})" if tag else feat_id
for feat_id, tag in assignments) \
or "NONE"
# For easy parsing, report as: (id, classifier); ...
feature_hits = '; '.join(f"({fid}, {tag})" for fid, tag in assignments)

# For easy parsing, report as: (alias1, alias2, ...); ...
feat_aliases = [', '.join(self.aliases.get(fid, '')) for fid, _ in assignments]
feat_aliases = '; '.join(f"({aliases})" for aliases in feat_aliases)

counts = (bundle['read_count'], bundle['corr_count'], bundle['loci_count'])
pos = (aln['Chrom'], strand, aln['Start'], aln['End'], aln['NM'])
row = (seq, *counts, *pos, n_candidates, feature_hits, feat_aliases)

# sequence, cor_counts, chrom, strand, start, end, candidates, feat1;feat2;feat3
row = (read, bundle['corr_count'], aln['Chrom'], strand, aln['Start'], aln['End'], n_candidates, feats)
self.alignments.append(row)

def record_summary_diagnostics(self, assignments, aln, bundle, n_candidates):
Expand Down