From 03d6cecd57ff5629860b69d4d99bf7654393edae Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Tue, 4 Apr 2023 18:20:18 -0700 Subject: [PATCH 1/3] Major updates to diagnostics classes. - Classifier-related bugs have been fixed - Alignment tables are now produced as CSVs rather than TSVs - Alignment tables now include the alignment's chromosome as well as the number of feature candidates that were available for Stage 2 selection --- tiny/rna/counter/statistics.py | 70 ++++++++++++++++++++-------------- 1 file changed, 42 insertions(+), 28 deletions(-) diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py index a3ebe0f9..59090232 100644 --- a/tiny/rna/counter/statistics.py +++ b/tiny/rna/counter/statistics.py @@ -83,8 +83,7 @@ def count_bundle_assignments(self, bundle: dict, aln: dict, assignments: dict, n self.rule_counts[rule] += rcorr_count if self.diags is not None: - self.diags.record_diagnostics(assignments.keys(), n_candidates, aln, bundle) - self.diags.record_alignment_details(aln, bundle, assignments.keys()) + self.diags.record_assignments(assignments.keys(), aln, bundle, n_candidates) def finalize_bundle(self, bundle: dict) -> None: """Called at the conclusion of processing each multiple-alignment bundle""" @@ -464,75 +463,90 @@ def get_mapped_reads(other: LibraryStats): class Diagnostics: - aln_diag_categories = ['Eliminated counts', 'No feature counts', - 'Uncounted alignments (+)', 'Uncounted alignments (-)'] + summary_categories = ['Eliminated counts', 'No feature counts', + 'Uncounted alignments (+)', 'Uncounted alignments (-)'] + + alignment_columns = ["Sequence", "Normalized Count", "Chrom", "Strand", + "Start", "End", "Candidates", "Assigned Features"] complement = bytes.maketrans(b'ACGTacgt', b'TGCAtgca') + map_strand = {True: '+', False: '-', None: '.'} def __init__(self, out_prefix: str): self.prefix = out_prefix - self.alignment_diags = {stat: 0 for stat in Diagnostics.aln_diag_categories} + self.assignment_diags = {stat: 0 for stat in Diagnostics.summary_categories} self.selection_diags = defaultdict(Counter) self.alignments = [] - def record_alignment_details(self, aln, bundle, assignments): + def record_assignments(self, assignments, alignment, bundle, n_candidates): + self.record_alignment_details(assignments, alignment, bundle, n_candidates) + self.record_summary_diagnostics(assignments, alignment, bundle, n_candidates) + + def record_alignment_details(self, assignments, aln, bundle, n_candidates): """Record detailed alignment info if user elects to save diagnostics info with the run This is called once per locus per read (every alignment) when the user elects to save diagnostics. The recorded information is later written to {library['Name']}_aln_table.txt after the entire SAM file has been processed.""" + # Map internal strand representation to +/-/. + strand = self.map_strand[aln['Strand']] + # Perform reverse complement for anti-sense reads - read = aln['Seq'] \ - if aln['Strand'] is True \ + read = aln['Seq'] if strand == '+' \ else aln['Seq'][::-1].translate(self.complement) - # sequence, cor_counts, strand, start, end, feat1;feat2;feat3 - self.alignments.append((read, bundle['corr_count'], aln['Strand'], aln['Start'], aln['End'], - ';'.join(assignments))) + # Indicate classifier in parentheses if present. Report NONE if no assignments + feats = ';'.join(f"{feat_id}({tag})" if tag else feat_id + for feat_id, tag in assignments) \ + or "NONE" + + # sequence, cor_counts, chrom, strand, start, end, candidates, feat1;feat2;feat3 + row = (read, bundle['corr_count'], aln['Chrom'], strand, aln['Start'], aln['End'], n_candidates, feats) + self.alignments.append(row) - def record_diagnostics(self, assignments, n_candidates, aln, bundle): + def record_summary_diagnostics(self, assignments, aln, bundle, n_candidates): """Records basic diagnostic info""" if len(assignments) == 0: if aln['Strand'] is True: - self.alignment_diags['Uncounted alignments (+)'] += 1 + self.assignment_diags['Uncounted alignments (+)'] += 1 else: - self.alignment_diags['Uncounted alignments (-)'] += 1 + self.assignment_diags['Uncounted alignments (-)'] += 1 if n_candidates == 0: - self.alignment_diags['No feature counts'] += bundle['corr_count'] + self.assignment_diags['No feature counts'] += bundle['corr_count'] else: - self.alignment_diags['Eliminated counts'] += bundle['corr_count'] + self.assignment_diags['Eliminated counts'] += bundle['corr_count'] class MergedDiags(MergedStat): def __init__(self): - self.aln_diags = pd.DataFrame(columns=Diagnostics.aln_diag_categories) + self.assignment_diags = pd.DataFrame(index=Diagnostics.summary_categories) self.selection_diags = {} self.alignment_tables = {} def add_library(self, other: LibraryStats): other_lib = other.library['Name'] self.alignment_tables[other_lib] = other.diags.alignments - self.selection_diags[other_lib] = other.diags.selection_diags - self.aln_diags.loc[other_lib] = other.diags.alignment_diags + # self.selection_diags[other_lib] = other.diags.selection_diags Not currently collected + self.assignment_diags[other_lib] = self.assignment_diags.index.map(other.diags.assignment_diags) def write_output_logfile(self): - self.write_alignment_diags() + self.write_assignment_diags() # self.write_selection_diags() Not currently collected self.write_alignment_tables() - def write_alignment_diags(self): - MergedStat.df_to_csv(self.aln_diags, "Sample", self.prefix, "alignment_diags", sort_axis="index") + def write_assignment_diags(self): + MergedStat.df_to_csv(self.assignment_diags, "Assignment Diags", self.prefix, "assignment_diags") def write_alignment_tables(self): + header = Diagnostics.alignment_columns for library_name, table in self.alignment_tables.items(): - outfile = make_filename([self.prefix, library_name, 'aln_table'], ext='.txt') - with open(outfile, 'w') as imf: - imf.writelines( - # sequence, cor_counts, strand, start, end, feat1a/feat1b;feat2;... - map(lambda rec: "%s\t%f\t%c\t%d\t%d\t%s\n" % rec, table) - ) + outfile = make_filename([self.prefix, library_name, 'alignment_table'], ext='.csv') + with open(outfile, 'w') as ao: + csv_writer = csv.writer(ao) + csv_writer.writerow(header) + csv_writer.writerows(table) def write_selection_diags(self): out = [] From 15f9fe6cd9eef4e2009581376f73b43f56e368b0 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Tue, 4 Apr 2023 18:21:04 -0700 Subject: [PATCH 2/3] CWL updates for the new diagnostic output filenames --- tiny/cwl/tools/tiny-count.cwl | 8 ++++---- tiny/cwl/workflows/tinyrna_wf.cwl | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tiny/cwl/tools/tiny-count.cwl b/tiny/cwl/tools/tiny-count.cwl index 0541e695..7f9521a9 100644 --- a/tiny/cwl/tools/tiny-count.cwl +++ b/tiny/cwl/tools/tiny-count.cwl @@ -125,15 +125,15 @@ outputs: outputBinding: glob: "*_decollapsed.sam" - intermed_out_files: + alignment_tables: type: File[]? outputBinding: - glob: "*_aln_table.txt" + glob: "*_alignment_table.csv" - alignment_diags: + assignment_diags: type: File? outputBinding: - glob: $(inputs.out_prefix)_alignment_diags.csv + glob: $(inputs.out_prefix)_assignment_diags.csv selection_diags: type: File? diff --git a/tiny/cwl/workflows/tinyrna_wf.cwl b/tiny/cwl/workflows/tinyrna_wf.cwl index ebe73510..8a3f9f27 100644 --- a/tiny/cwl/workflows/tinyrna_wf.cwl +++ b/tiny/cwl/workflows/tinyrna_wf.cwl @@ -224,8 +224,8 @@ steps: fastp_logs: preprocessing/json_report_file collapsed_fa: preprocessing/uniq_seqs out: [ feature_counts, rule_counts, norm_counts, mapped_nt_len_dist, assigned_nt_len_dist, - alignment_stats, summary_stats, decollapsed_sams, intermed_out_files, - alignment_diags, selection_diags ] + alignment_stats, summary_stats, decollapsed_sams, alignment_tables, + assignment_diags, selection_diags ] tiny-deseq: run: ../tools/tiny-deseq.cwl @@ -321,7 +321,7 @@ steps: source: [ tiny-count/feature_counts, tiny-count/rule_counts, tiny-count/norm_counts, tiny-count/mapped_nt_len_dist, tiny-count/assigned_nt_len_dist, tiny-count/alignment_stats, tiny-count/summary_stats, tiny-count/decollapsed_sams, - tiny-count/alignment_diags, tiny-count/selection_diags, tiny-count/intermed_out_files, + tiny-count/assignment_diags, tiny-count/selection_diags, tiny-count/alignment_tables, features_csv ] dir_name: dir_name_tiny-count out: [ subdir ] From 73b7f47ab9bd5720d6913dd779c8b51fd267a3eb Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Tue, 4 Apr 2023 18:22:06 -0700 Subject: [PATCH 3/3] Documentation updates for diagnostic outputs --- README.md | 21 +++++++++++++------ START_HERE/run_config.yml | 2 +- .../config_files/run_config_template.yml | 2 +- tiny/templates/run_config_template.yml | 2 +- 4 files changed, 18 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 7e375f3b..887e5370 100644 --- a/README.md +++ b/README.md @@ -239,14 +239,23 @@ Two tables are produced: #### Diagnostics -Diagnostic information will include intermediate alignment files for each library and an additional stats table with information about counts that were not assigned to a feature. Intermediate alignment files include the following information about each alignment, regardless of feature assignment status: +Diagnostic information can optionally be produced. If enabled, tiny-count will provide a table for each library which contains feature assignment info on a per-alignment basis, along with a single table that summarizes unassigned counts. -- The alignment's SEQ field, reverse complemented for the - strand -- The sequence's count normalized by its multi-alignment locus count -- Feature IDs of all features assigned to the alignment -- Strand, start, and end position +The alignment tables (**... alignment_table.csv**) include the following information per-alignment: -The unassigned counts table includes the following, with a column per library: +| Column | Description | +|-------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Sequence | The read sequence, reverse complemented if antisense | +| Normalized Count | The reads available for assignment (the sequence's original read count normalized by genomic hits) | +| Chrom | The alignment's RNAME field | +| Strand | The alignment's strand | +| Start | The alignment's start coordinate | +| End | The alignment's end coordinate | +| Candidates | The number of features overlapping the alignment by at last one nucleotide | +| Assigned Features | Feature IDs of all features assigned to the alignment. `NONE` if no features were assigned. If the assigning rule has a classifier, it is included in parentheses. | + + +The unassigned counts table (**assignment_diags.csv**) includes the following, with a column per library: | Stat | Description | |-----------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------| diff --git a/START_HERE/run_config.yml b/START_HERE/run_config.yml index e5fd45d2..85e6c23a 100644 --- a/START_HERE/run_config.yml +++ b/START_HERE/run_config.yml @@ -235,7 +235,7 @@ counter_decollapse: False ##-- Select the StepVector implementation that is used. Options: HTSeq or Cython --## counter_stepvector: 'Cython' -##-- If True: produce diagnostic logs to indicate what was eliminated and why --## +##-- If True: produce alignment tables and assignment diagnostics for uncounted reads --## counter_diags: False diff --git a/tests/testdata/config_files/run_config_template.yml b/tests/testdata/config_files/run_config_template.yml index cc303f0f..56d86210 100644 --- a/tests/testdata/config_files/run_config_template.yml +++ b/tests/testdata/config_files/run_config_template.yml @@ -235,7 +235,7 @@ counter_decollapse: False ##-- Select the StepVector implementation that is used. Options: HTSeq or Cython --## counter_stepvector: 'Cython' -##-- If True: produce diagnostic logs to indicate what was eliminated and why --## +##-- If True: produce alignment tables and assignment diagnostics for uncounted reads --## counter_diags: False diff --git a/tiny/templates/run_config_template.yml b/tiny/templates/run_config_template.yml index 00328a4b..90bc5a28 100644 --- a/tiny/templates/run_config_template.yml +++ b/tiny/templates/run_config_template.yml @@ -235,7 +235,7 @@ counter_decollapse: False ##-- Select the StepVector implementation that is used. Options: HTSeq or Cython --## counter_stepvector: 'Cython' -##-- If True: produce diagnostic logs to indicate what was eliminated and why --## +##-- If True: produce alignment tables and assignment diagnostics for uncounted reads --## counter_diags: False