From 41b819fe65e9bc47a3d1eda3fea835144ec2f1cb Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Wed, 5 Apr 2023 19:12:17 -0700 Subject: [PATCH 01/34] Preliminary refactor and addition of normalization options to accommodate normalization by genomic hits --- tiny/rna/counter/statistics.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py index a3ebe0f9..61c4352b 100644 --- a/tiny/rna/counter/statistics.py +++ b/tiny/rna/counter/statistics.py @@ -27,7 +27,8 @@ def __init__(self, out_prefix: str = None, report_diags: bool = False, **prefs): self.library = {'Name': 'Unassigned', 'File': 'Unassigned', 'Norm': '1'} self.out_prefix = out_prefix self.diags = Diagnostics(out_prefix) if report_diags else None - self.norm = prefs.get('normalize_by_hits', True) + self.norm_gh = prefs.get('normalize_by_genomic_hits', True) + self.norm_fh = prefs.get('normalize_by_feature_hits', True) self.feat_counts = Counter() self.rule_counts = Counter() @@ -44,7 +45,7 @@ def count_bundle(self, aln_bundle: iter, read_counts: int) -> dict: bundle_read = aln_bundle[0] loci_counts = len(aln_bundle) - corr_counts = read_counts / loci_counts + corr_counts = read_counts / loci_counts if self.norm_gh else read_counts nt5, seqlen = bundle_read['nt5end'], bundle_read['Length'] # Fill in 5p nt/length matrix @@ -72,7 +73,7 @@ def count_bundle_assignments(self, bundle: dict, aln: dict, assignments: dict, n if asgn_count == 0: self.library_stats['Total Unassigned Reads'] += corr_count else: - fcorr_count = corr_count / asgn_count if self.norm else corr_count + fcorr_count = corr_count / asgn_count if self.norm_fh else corr_count bundle['assigned_reads'] += fcorr_count * asgn_count bundle['assigned_ftags'] |= assignments.keys() From ba2fb5eb8784c14abc9e43b5489d5098e6b6866e Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Thu, 6 Apr 2023 08:45:12 -0700 Subject: [PATCH 02/34] 
Adding command line option to tiny-count, CWL, and the Run Config for disabling normalization by genomic hits. I renamed "normalize hits" to "normalize by feature hits" to disambiguate and make consistent with the new option --- START_HERE/run_config.yml | 8 +++++--- tiny/cwl/tools/tiny-count.cwl | 9 +++++++-- tiny/cwl/workflows/tinyrna_wf.cwl | 10 +++++++--- tiny/rna/counter/counter.py | 13 ++++++++----- 4 files changed, 27 insertions(+), 13 deletions(-) diff --git a/START_HERE/run_config.yml b/START_HERE/run_config.yml index e5fd45d2..d8a56c1b 100644 --- a/START_HERE/run_config.yml +++ b/START_HERE/run_config.yml @@ -225,9 +225,11 @@ shared_memory: False ##-- If True: show all parsed features in the counts csv, regardless of count/identity --## counter_all_features: False -##-- If True: counts will be normalized by genomic hits AND selected feature count --## -##-- If False: counts will only be normalized by genomic hits --## -counter_normalize_by_hits: True +##-- If True: counts are normalized by genomic hits (number of multi-alignments) --## +counter_normalize_by_genomic_hits: False + +##-- If True: counts are normalized by feature hits (selected feature count per-locus) --## +counter_normalize_by_feature_hits: True ##-- If True: a decollapsed copy of each SAM file will be produced (useful for IGV) --## counter_decollapse: False diff --git a/tiny/cwl/tools/tiny-count.cwl b/tiny/cwl/tools/tiny-count.cwl index 0541e695..77a15074 100644 --- a/tiny/cwl/tools/tiny-count.cwl +++ b/tiny/cwl/tools/tiny-count.cwl @@ -30,10 +30,15 @@ inputs: # Optional inputs - normalize_by_hits: + normalize_by_feature_hits: type: string? inputBinding: - prefix: --normalize-by-hits + prefix: --normalize-by-feature-hits + + normalize_by_genomic_hits: + type: string? + inputBinding: + prefix: --normalize-by-genomic-hits decollapse: type: boolean? 
diff --git a/tiny/cwl/workflows/tinyrna_wf.cwl b/tiny/cwl/workflows/tinyrna_wf.cwl index ebe73510..2032f021 100644 --- a/tiny/cwl/workflows/tinyrna_wf.cwl +++ b/tiny/cwl/workflows/tinyrna_wf.cwl @@ -87,7 +87,8 @@ inputs: counter_decollapse: boolean? counter_stepvector: string? counter_all_features: boolean? - counter_normalize_by_hits: boolean? + counter_normalize_by_feature_hits: boolean? + counter_normalize_by_genomic_hits: boolean? # tiny-deseq inputs run_deseq: boolean @@ -214,8 +215,11 @@ steps: gff_files: gff_files out_prefix: run_name all_features: counter_all_features - normalize_by_hits: - source: counter_normalize_by_hits + normalize_by_feature_hits: + source: counter_normalize_by_feature_hits + valueFrom: $(String(self)) # convert boolean -> string + normalize_by_genomic_hits: + source: counter_normalize_by_genomic_hits valueFrom: $(String(self)) # convert boolean -> string decollapse: counter_decollapse stepvector: counter_stepvector diff --git a/tiny/rna/counter/counter.py b/tiny/rna/counter/counter.py index 9c4a37b3..4b3b0648 100644 --- a/tiny/rna/counter/counter.py +++ b/tiny/rna/counter/counter.py @@ -53,7 +53,9 @@ def get_args(): optional_args.add_argument('-o', '--out-prefix', metavar='PREFIX', default='tiny-count_{timestamp}', help='The output prefix to use for file names. All occurrences of the ' 'substring {timestamp} will be replaced with the current date and time.') - optional_args.add_argument('-nh', '--normalize-by-hits', metavar='T/F', default='T', + optional_args.add_argument('-ng', '--normalize-by-genomic-hits', metavar='T/F', default='T', + help='If T/true, normalize counts by genomic hits.') + optional_args.add_argument('-nh', '--normalize-by-feature-hits', metavar='T/F', default='T', help='If T/true, normalize counts by (selected) overlapping feature counts.') optional_args.add_argument('-dc', '--decollapse', action='store_true', help='Create a decollapsed copy of all SAM files listed in your Samples Sheet. 
' @@ -80,7 +82,8 @@ def get_args(): else: args_dict = vars(args) args_dict['out_prefix'] = args.out_prefix.replace('{timestamp}', get_timestamp()) - args_dict['normalize_by_hits'] = args.normalize_by_hits.lower() in ['t', 'true'] + for tf in ('normalize_by_feature_hits', 'normalize_by_genomic_hits'): + args_dict[tf] = args_dict[tf].lower() in ['t', 'true'] return ReadOnlyDict(args_dict) @@ -221,11 +224,11 @@ def map_and_reduce(libraries, paths, prefs): with mp.Pool(len(libraries)) as pool: async_results = pool.imap_unordered(counter.count_reads, libraries) - for stats_result in async_results: - summary.add_library(stats_result) + for result in async_results: + summary.add_library_stats(result) else: # Only one library, multiprocessing not beneficial for task - summary.add_library(counter.count_reads(libraries[0])) + summary.add_library_stats(counter.count_reads(libraries[0])) return summary From 2c9c17aa7f10fa9f350dfa98d6f49f13ef02107a Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Thu, 6 Apr 2023 08:51:35 -0700 Subject: [PATCH 03/34] Updating the MergedStat abstract base class to include a finalize() abstract method. The goal is to separate the finalization steps from write_output_logfile() leaving minimal calls to whichever writing function is used. This is more in line with the function's name and allows the stats objects to be validated before writing output files --- tiny/rna/counter/statistics.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py index 61c4352b..40e31b44 100644 --- a/tiny/rna/counter/statistics.py +++ b/tiny/rna/counter/statistics.py @@ -114,10 +114,13 @@ class MergedStat(ABC): prefix = None @abstractmethod - def add_library(self, other: LibraryStats): pass + def add_library(self, other: LibraryStats): ... @abstractmethod - def write_output_logfile(self): pass + def finalize(self): ...
+ + @abstractmethod + def write_output_logfile(self): ... @staticmethod def add_warning(msg): @@ -160,6 +163,15 @@ def __init__(self, Features_obj, features_csv, prefs): self.merged_stats.append(MergedDiags()) def write_report_files(self) -> None: + try: + for in_process in self.merged_stats: + in_process.finalize() + + # Validate stats now that all libraries have been processed + StatisticsValidator(self.merged_stats).validate() + finally: + self.print_warnings() + for output_stat in self.merged_stats: output_stat.write_output_logfile() self.print_warnings() From 37574d5b5afdaf23b3cd7170696b9cf01c0b6864 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Thu, 6 Apr 2023 08:59:14 -0700 Subject: [PATCH 04/34] Refactoring FeatureCounts to use the new ABC methods --- tiny/rna/counter/statistics.py | 48 ++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py index 40e31b44..fad4bbbc 100644 --- a/tiny/rna/counter/statistics.py +++ b/tiny/rna/counter/statistics.py @@ -200,33 +200,28 @@ class FeatureCounts(MergedStat): def __init__(self, Features_obj): self.feat_counts_df = pd.DataFrame(index=set.union(*Features_obj.classes.values())) self.aliases = Features_obj.aliases + self.finalized = False self.norm_prefs = {} def add_library(self, other: LibraryStats) -> None: + assert not self.finalized, "Cannot add libraries after FeatureCounts object has been finalized." 
self.feat_counts_df[other.library["Name"]] = self.feat_counts_df.index.map(other.feat_counts) self.norm_prefs[other.library["Name"]] = other.library['Norm'] def write_output_logfile(self) -> None: - fc = self.write_feature_counts() - self.write_norm_counts(fc) + self.write_feature_counts() + self.write_norm_counts() - def write_feature_counts(self) -> pd.DataFrame: - """Writes selected features and their associated counts to {prefix}_out_feature_counts.csv + def finalize(self): + """Called once all libraries have been counted and added to the FeatureCounts object. - If a features.csv rule defined an 'Alias by...' other than "ID", then the associated features will - be aliased to the value associated with this key and displayed in the Feature Name column, and - the feature's true "ID" will be indicated in the Feature ID column. If multiple aliases exist for - a feature then they will be joined by ", " in the Feature Name column. A Feature Class column also - follows. + This method will: + - Name the multi-index columns: "Feature ID" and "Classifier" + - Add a Feature Name column containing the values associated with user-defined alias tags + - Sort columns by name and round decimal values to 2 places. - For example, if the rule contained an 'Alias by...' which aliases gene1 to abc123,def456,123.456 - and is both ALG and CSR class, then the Feature ID, Feature Name, and Feature Class column of the - output file for this feature will read: - gene1, "abc123,def456,123.456", "ALG,CSR" - - Subsequent columns represent the counts from each library processed by Counter. Column titles are - formatted for each library as {group}_rep_{replicate} - """ + Aliases are concatenated by ", " if multiple exist for a feature. 
Alias values are not + added for the "ID" tag as the Feature ID column already contains this information.""" # Sort columns by title and round all counts to 2 decimal places summary = self.sort_cols_and_round(self.feat_counts_df) @@ -235,14 +230,27 @@ def write_feature_counts(self) -> pd.DataFrame: # Sort by index, make index its own column, and rename it to Feature ID summary = summary.sort_index().reset_index().rename(columns={"level_0": "Feature ID", "level_1": "Classifier"}) - summary.to_csv(self.prefix + '_feature_counts.csv', index=False) - return summary + self.feat_counts_df = summary + self.finalized = True + + def write_feature_counts(self): + """Writes selected features and their associated counts to {prefix}_feature_counts.csv + Should be called after finalize()""" + + assert self.finalized, "FeatureCounts object must be finalized before writing output." + self.feat_counts_df.to_csv(self.prefix + '_feature_counts.csv', index=False) + + def write_norm_counts(self): + """Writes normalized feature counts to {prefix}_norm_counts.csv + Normalization method is determined by Samples Sheet's Normalization column. + Should be called after finalize()""" - def write_norm_counts(self, counts: pd.DataFrame): + assert self.finalized, "FeatureCounts object must be finalized before writing output." if all([norm in ['1', "", None] for norm in self.norm_prefs.values()]): return + counts = self.feat_counts_df.copy() for library, norm in self.norm_prefs.items(): if norm in ['1', "", None]: continue From 9f33018cba9bfc0311cb08af18bd9a7850e9a06d Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Thu, 6 Apr 2023 09:04:29 -0700 Subject: [PATCH 05/34] Refactoring RuleCounts to use the new ABC methods. Also cleaning up the code used for reading Features Sheet rules. It will be useful for statistics.py to have a rudimentary copy of the rules table to work with for validation. 
The get_inverted_classifiers() function returns a dictionary of {Classifier: [rule, indexes]} so that we can look up rules by classifier. This will be used to validate feature counts per classifier to make sure the feature counts table agrees with the rule counts table --- tiny/rna/counter/statistics.py | 57 +++++++++++++++++++++++++++------- 1 file changed, 45 insertions(+), 12 deletions(-) diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py index fad4bbbc..2ca8a694 100644 --- a/tiny/rna/counter/statistics.py +++ b/tiny/rna/counter/statistics.py @@ -272,31 +272,64 @@ def write_norm_counts(self): class RuleCounts(MergedStat): def __init__(self, features_csv): + self.rules = self.read_features_sheet(features_csv) self.rule_counts_df = pd.DataFrame() - self.features_csv = features_csv + self.finalized = False def add_library(self, other: LibraryStats): + assert not self.finalized, "Cannot add libraries after RuleCounts object has been finalized." counts = pd.Series(other.rule_counts, name=other.library['Name']) self.rule_counts_df = self.rule_counts_df.join(counts, how='outer') - def write_output_logfile(self): - # Reread the Features Sheet since FeatureSelector.rules_table is processed and less readable - with open(self.features_csv, 'r', encoding='utf-8-sig') as f: - # Convert each CSV row to a string with values labeled by column headers - rules = ['; '.join([ - ': '.join( - [col.replace("...", ""), val]) - for col, val in row.items()]) - for row in csv.DictReader(f)] - + def finalize(self): + rules = self.get_rule_strings() self.rule_counts_df = self.sort_cols_and_round(self.rule_counts_df) - order = ["Rule String"] + self.rule_counts_df.columns.to_list() + # Add Rule String column to the left of the counts + order = ["Rule String"] + self.rule_counts_df.columns.to_list() self.rule_counts_df = self.rule_counts_df.join(pd.Series(rules, name="Rule String"), how="outer")[order].fillna(0) + + # Add Mapped Reads row below the counts 
self.rule_counts_df.loc["Mapped Reads"] = SummaryStats.pipeline_stats_df.loc["Mapped Reads"] + self.finalized = True + def write_output_logfile(self): + assert self.finalized, "RuleCounts object must be finalized before writing output." self.df_to_csv(self.rule_counts_df, "Rule Index", self.prefix, 'counts_by_rule', sort_axis=None) + def read_features_sheet(self, features_csv): + """Reads the Features Sheet and returns a list of rules in + the same order as the FeatureSelector.rules_table. We don't use the + CSVReader class here because it returns rows with shortened column names, + and we want to preserve the full column names for the Rule String column + of the output table.""" + + with open(features_csv, 'r', encoding='utf-8-sig') as f: + return [row for row in csv.DictReader(f)] + + def get_inverted_classifiers(self): + """Returns a dictionary of classifiers and the indices of the rules + where they are defined. Note the use of the user-facing column name + rather than the internal shortname for "Classify as..." since the + CSVReader class wasn't used.""" + + inverted_classifiers = defaultdict(list) + for i, rule in enumerate(self.rules): + inverted_classifiers[rule['Classify as...']].append(i) + + return dict(inverted_classifiers) + + def get_rule_strings(self): + """Returns a list of formatted strings representing + each rule in the Features Sheet.""" + + rows = [] + for rule in self.rules: + row = [': '.join([col.replace('...', ''), defn]) + for col, defn in rule.items()] + rows.append('; '.join(row)) + return rows + class NtLenMatrices(MergedStat): def __init__(self): From fc967d2f50cfd8ee6a1b71635071dcf7e88ebb63 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Thu, 6 Apr 2023 09:10:43 -0700 Subject: [PATCH 06/34] Refactoring NtLenMatrices to use the new ABC methods. 
--- tiny/rna/counter/statistics.py | 42 ++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py index 2ca8a694..f0920ff5 100644 --- a/tiny/rna/counter/statistics.py +++ b/tiny/rna/counter/statistics.py @@ -334,30 +334,33 @@ def get_rule_strings(self): class NtLenMatrices(MergedStat): def __init__(self): self.nt_len_matrices = {} + self.finalized = False def add_library(self, other: LibraryStats): + assert not self.finalized, "Cannot add libraries after NtLenMatrices object has been finalized." + name = other.library['Name'] - self.nt_len_matrices[name] = [ + self.nt_len_matrices[name] = ( other.mapped_nt_len, other.assigned_nt_len - ] + ) - def write_output_logfile(self): - """Writes each library's 5' end nucleotide / length matrices to their own files.""" + def finalize(self): + self.nt_len_matrices = dict(sorted(self.nt_len_matrices.items())) + for lib_name, (mapped, assigned) in self.nt_len_matrices.items(): + mapped_nt_len_df = self.finalize_mapped(mapped) + assigned_nt_len_df = self.finalize_assigned(assigned) + self.nt_len_matrices[lib_name] = (mapped_nt_len_df, assigned_nt_len_df) - for lib_name, matrices in self.nt_len_matrices.items(): - sanitized_lib_name = lib_name.replace('/', '_') - self.write_mapped_len_dist(matrices[0], sanitized_lib_name) - self.write_assigned_len_dist(matrices[1], sanitized_lib_name) + self.finalized = True - def write_mapped_len_dist(self, matrix, sanitized_lib_name): + def finalize_mapped(self, mapped: DefaultDict[str, Counter]) -> pd.DataFrame: # Whole number counts because number of reads mapped is always integer - mapped_nt_len_df = pd.DataFrame(matrix).sort_index().fillna(0) - mapped_nt_len_df.to_csv(f'{self.prefix}_{sanitized_lib_name}_mapped_nt_len_dist.csv') + return pd.DataFrame(mapped).sort_index().fillna(0) - def write_assigned_len_dist(self, matrix, sanitized_lib_name): + def finalize_assigned(self, assigned: 
DefaultDict[str, Counter]) -> pd.DataFrame: # Fractional counts due to (loci count) and/or (assigned feature count) normalization - assigned_nt_len_df = pd.DataFrame(matrix).sort_index().round(decimals=2) + assigned_nt_len_df = pd.DataFrame(assigned).sort_index().round(decimals=2) # Drop non-nucleotide columns if they don't contain counts assigned_nt_len_df.drop([ @@ -366,8 +369,17 @@ def write_assigned_len_dist(self, matrix, sanitized_lib_name): ], axis='columns', inplace=True) # Assign default count of 0 for remaining cases - assigned_nt_len_df.fillna(0, inplace=True) - assigned_nt_len_df.to_csv(f'{self.prefix}_{sanitized_lib_name}_assigned_nt_len_dist.csv') + return assigned_nt_len_df.fillna(0) + + def write_output_logfile(self): + """Writes each library's 5' end nucleotide / length matrices to separate files.""" + + assert self.finalized, "NtLenMatrices object must be finalized before writing output." + + for lib_name, (mapped, assigned) in self.nt_len_matrices.items(): + sanitized_lib_name = lib_name.replace('/', '_') + mapped.to_csv(f'{self.prefix}_{sanitized_lib_name}_mapped_nt_len_dist.csv') + assigned.to_csv(f'{self.prefix}_{sanitized_lib_name}_assigned_nt_len_dist.csv') class AlignmentStats(MergedStat): From 8d095cb7b1a8af1e20acd654fb801bdcb2c85d45 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Thu, 6 Apr 2023 09:11:37 -0700 Subject: [PATCH 07/34] Refactoring AlignmentStats to use the new ABC methods. 
--- tiny/rna/counter/statistics.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py index f0920ff5..f4b6c8eb 100644 --- a/tiny/rna/counter/statistics.py +++ b/tiny/rna/counter/statistics.py @@ -385,12 +385,18 @@ def write_output_logfile(self): class AlignmentStats(MergedStat): def __init__(self): self.alignment_stats_df = pd.DataFrame(index=LibraryStats.summary_categories) + self.finalized = False def add_library(self, other: LibraryStats): + assert not self.finalized, "Cannot add libraries after AlignmentStats object has been finalized." library, stats = other.library['Name'], other.library_stats self.alignment_stats_df[library] = self.alignment_stats_df.index.map(stats) + def finalize(self): + self.finalized = True # Nothing else to do + def write_output_logfile(self): + assert self.finalized, "AlignmentStats object must be finalized before writing output." self.df_to_csv(self.alignment_stats_df, "Alignment Statistics", self.prefix, 'alignment_stats') From 38db0d267d362d46b9ef834392c54c6ca814bb63 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Thu, 6 Apr 2023 09:14:11 -0700 Subject: [PATCH 08/34] Refactoring SummaryStats to use the new ABC methods. Also changing method order so that get_mapped_seqs/reads is listed near the other stats calculations. 
--- tiny/rna/counter/statistics.py | 39 ++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py index f4b6c8eb..e31348b7 100644 --- a/tiny/rna/counter/statistics.py +++ b/tiny/rna/counter/statistics.py @@ -411,10 +411,13 @@ class SummaryStats(MergedStat): def __init__(self): self.missing_fastp_outputs = [] self.missing_collapser_outputs = [] + self.finalized = False def add_library(self, other: LibraryStats): """Add incoming summary stats as new column in the master table""" + assert not self.finalized, "Cannot add libraries after SummaryStats object has been finalized." + other.library['basename'] = self.get_library_basename(other) other_summary = { "Mapped Sequences": self.get_mapped_seqs(other), @@ -435,7 +438,22 @@ def add_library(self, other: LibraryStats): self.pipeline_stats_df[other.library["Name"]] = self.pipeline_stats_df.index.map(other_summary) - def write_output_logfile(self): + @staticmethod + def get_mapped_seqs(other: LibraryStats): + return sum([other.library_stats[stat] for stat in [ + "Total Assigned Sequences", + "Total Unassigned Sequences" + ]]) + + @staticmethod + def get_mapped_reads(other: LibraryStats): + return sum([other.library_stats[stat] for stat in [ + 'Reads Assigned to Multiple Features', + 'Reads Assigned to Single Feature', + 'Total Unassigned Reads' + ]]) + + def finalize(self): if len(self.missing_fastp_outputs): missing = '\n\t'.join(self.missing_fastp_outputs) self.add_warning("Total Reads and Retained Reads could not be determined for the following libraries " @@ -448,7 +466,11 @@ def write_output_logfile(self): # Only display conditional categories if they were collected for at least one library empty_rows = self.pipeline_stats_df.loc[self.conditional_categories].isna().all(axis='columns') self.pipeline_stats_df.drop(empty_rows.index[empty_rows], inplace=True) + self.pipeline_stats_df = 
self.sort_cols_and_round(self.pipeline_stats_df) + self.finalized = True + def write_output_logfile(self): + assert self.finalized, "SummaryStats object must be finalized before writing output." self.df_to_csv(self.pipeline_stats_df, "Summary Statistics", self.prefix, "summary_stats") def library_has_collapser_outputs(self, other: LibraryStats) -> bool: @@ -518,21 +540,6 @@ def get_library_basename(other: LibraryStats) -> str: lib_basename = sam_basename.replace("_aligned_seqs", "") return lib_basename - @staticmethod - def get_mapped_seqs(other: LibraryStats): - return sum([other.library_stats[stat] for stat in [ - "Total Assigned Sequences", - "Total Unassigned Sequences" - ]]) - - @staticmethod - def get_mapped_reads(other: LibraryStats): - return sum([other.library_stats[stat] for stat in [ - 'Reads Assigned to Multiple Features', - 'Reads Assigned to Single Feature', - 'Total Unassigned Reads' - ]]) - class Diagnostics: From e776d16bfa9cdcfc2f230548b449e4d2d5343af1 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Thu, 6 Apr 2023 12:33:25 -0700 Subject: [PATCH 09/34] Making corrections to the NT/length stats to make them compatible with disabled normalization. Also introducing some strategic rounding to prevent floating point error accumulation. It turns out this can be quite significant given enough alignments, and it grows unpredictably. 
It costs a small amount of runtime but allows stats to be validated and internally consistent --- tiny/rna/counter/statistics.py | 68 +++++++++++++++++++++++++--------- 1 file changed, 51 insertions(+), 17 deletions(-) diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py index e31348b7..1ba97f1e 100644 --- a/tiny/rna/counter/statistics.py +++ b/tiny/rna/counter/statistics.py @@ -45,18 +45,17 @@ def count_bundle(self, aln_bundle: iter, read_counts: int) -> dict: bundle_read = aln_bundle[0] loci_counts = len(aln_bundle) - corr_counts = read_counts / loci_counts if self.norm_gh else read_counts + corr_counts = round(read_counts / loci_counts, 2) if self.norm_gh else read_counts nt5, seqlen = bundle_read['nt5end'], bundle_read['Length'] - # Fill in 5p nt/length matrix - self.mapped_nt_len[nt5][seqlen] += read_counts - return { 'read_count': read_counts, 'corr_count': corr_counts, 'assigned_ftags': set(), - 'assigned_reads': 0, - 'nt5_counts': self.assigned_nt_len[nt5], + 'assigned_reads': 0.0, + 'unassigned_reads': 0.0, + 'nt5_assigned': self.assigned_nt_len[nt5], + 'nt5_mapped': self.mapped_nt_len[nt5], 'seq_length': seqlen, 'mapping_stat': "Assigned Single-Mapping Reads" @@ -71,9 +70,9 @@ def count_bundle_assignments(self, bundle: dict, aln: dict, assignments: dict, n corr_count = bundle['corr_count'] if asgn_count == 0: - self.library_stats['Total Unassigned Reads'] += corr_count + bundle['unassigned_reads'] += corr_count else: - fcorr_count = corr_count / asgn_count if self.norm_fh else corr_count + fcorr_count = round(corr_count / asgn_count, 2) if self.norm_fh else corr_count bundle['assigned_reads'] += fcorr_count * asgn_count bundle['assigned_ftags'] |= assignments.keys() @@ -96,11 +95,16 @@ def finalize_bundle(self, bundle: dict) -> None: self.library_stats['Total Unassigned Sequences'] += 1 else: self.library_stats['Total Assigned Sequences'] += 1 - assigned_reads = bundle['assigned_reads'] + assigned_reads = 
round(bundle['assigned_reads'], 2) + unassigned_reads = round(bundle['unassigned_reads'], 2) + self.library_stats['Total Unassigned Reads'] += unassigned_reads self.library_stats['Total Assigned Reads'] += assigned_reads self.library_stats[bundle['mapping_stat']] += assigned_reads - bundle['nt5_counts'][bundle['seq_length']] += assigned_reads + + seqlen = bundle['seq_length'] + bundle['nt5_assigned'][seqlen] += assigned_reads + bundle['nt5_mapped'][seqlen] += assigned_reads + unassigned_reads if assigned_feat_count == 1: self.library_stats['Reads Assigned to Single Feature'] += assigned_reads @@ -156,7 +160,7 @@ def __init__(self, Features_obj, features_csv, prefs): self.merged_stats = [ FeatureCounts(Features_obj), RuleCounts(features_csv), AlignmentStats(), SummaryStats(), - NtLenMatrices() + NtLenMatrices(prefs) ] if prefs.get('report_diags', False): @@ -332,8 +336,10 @@ def get_rule_strings(self): class NtLenMatrices(MergedStat): - def __init__(self): + def __init__(self, prefs): self.nt_len_matrices = {} + self.norm_gh = prefs.get('normalize_by_genomic_hits', True) + self.norm_fh = prefs.get('normalize_by_feature_hits', True) self.finalized = False def add_library(self, other: LibraryStats): @@ -355,12 +361,11 @@ def finalize(self): self.finalized = True def finalize_mapped(self, mapped: DefaultDict[str, Counter]) -> pd.DataFrame: - # Whole number counts because number of reads mapped is always integer - return pd.DataFrame(mapped).sort_index().fillna(0) + return pd.DataFrame(mapped).sort_index().fillna(0).round(decimals=2) def finalize_assigned(self, assigned: DefaultDict[str, Counter]) -> pd.DataFrame: # Fractional counts due to (loci count) and/or (assigned feature count) normalization - assigned_nt_len_df = pd.DataFrame(assigned).sort_index().round(decimals=2) + assigned_nt_len_df = pd.DataFrame(assigned, dtype='float64').sort_index().round(decimals=2) # Drop non-nucleotide columns if they don't contain counts assigned_nt_len_df.drop([ @@ -378,8 +383,37 @@ 
def write_output_logfile(self): for lib_name, (mapped, assigned) in self.nt_len_matrices.items(): sanitized_lib_name = lib_name.replace('/', '_') - mapped.to_csv(f'{self.prefix}_{sanitized_lib_name}_mapped_nt_len_dist.csv') - assigned.to_csv(f'{self.prefix}_{sanitized_lib_name}_assigned_nt_len_dist.csv') + self._write_mapped(mapped, sanitized_lib_name) + self._write_assigned(assigned, sanitized_lib_name) + + def _write_mapped(self, mapped: pd.DataFrame, lib_name: str): + """Writes the mapped matrix to a file. + + Conceptually, mapped reads should always be a whole number. If either + normalization step is disabled, these counts will be fractional. However, + even when both normalization steps are enabled, we still get fractional + counts that are very close to a whole number due to the way these counts + are tallied. They are fractional up to this point for the validation step. + + Floating point error is at the root of this issue. If in LibraryStats we + instead increment these counts by the raw read count, then the matrix's + sum will disagree with the Mapped Reads summary stat because the constituent + stats of Mapped Reads are based on rounded values. Rounding maintains the + finite representation of these values. 
If we didn't round them and allowed + floating point error to accumulate over potentially millions+ of alignments, + then the disagreement between all stats would grow unpredictably and there + wouldn't be a meaningful way to validate them.""" + + if self.norm_gh ^ self.norm_fh: + mapped.to_csv(f'{self.prefix}_{lib_name}_mapped_nt_len_dist.csv') + else: + mapped_whole_numbers = mapped.astype('int64') + mapped_whole_numbers.to_csv(f'{self.prefix}_{lib_name}_mapped_nt_len_dist.csv') + + def _write_assigned(self, assigned: pd.DataFrame, lib_name: str): + """Writes the assigned matrix to a file (no further work required).""" + + assigned.to_csv(f'{self.prefix}_{lib_name}_assigned_nt_len_dist.csv') class AlignmentStats(MergedStat): From fe61785280efcc0bce7f213e05319fdac1e9c9c7 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Fri, 7 Apr 2023 18:36:29 -0700 Subject: [PATCH 10/34] Introducing the StatisticsValidator class. It performs a thorough evaluation of the internal consistency of all reported stats, between output files and in some cases within. I've made an effort to write this class in a way that it won't bring the rest of tiny-count down if it encounters an error. Validation is performed in a floating-point-error-conscious way. Issues are reported but not treated as errors. 
--- tiny/rna/counter/counter.py | 4 +- tiny/rna/counter/statistics.py | 156 +++++++++++++++++++++++++++++++-- 2 files changed, 153 insertions(+), 7 deletions(-) diff --git a/tiny/rna/counter/counter.py b/tiny/rna/counter/counter.py index 4b3b0648..d8d4b62c 100644 --- a/tiny/rna/counter/counter.py +++ b/tiny/rna/counter/counter.py @@ -63,6 +63,8 @@ def get_args(): optional_args.add_argument('-sv', '--stepvector', choices=['Cython', 'HTSeq'], default='Cython', help='Select which StepVector implementation is used to find ' 'features overlapping an interval.') + optional_args.add_argument('-vs', '--verify-stats', metavar='T/F', default='T', + help='If T/true, verify that all reported stats are internally consistent.') optional_args.add_argument('-a', '--all-features', action='store_true', help=argparse.SUPPRESS) #help='Represent all features in output counts table, ' # 'even if they did not match in Stage 1 selection.') @@ -82,7 +84,7 @@ def get_args(): else: args_dict = vars(args) args_dict['out_prefix'] = args.out_prefix.replace('{timestamp}', get_timestamp()) - for tf in ('normalize_by_feature_hits', 'normalize_by_genomic_hits'): + for tf in ('normalize_by_feature_hits', 'normalize_by_genomic_hits', 'verify_stats'): args_dict[tf] = args_dict[tf].lower() in ['t', 'true'] return ReadOnlyDict(args_dict) diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py index 1ba97f1e..daf9500a 100644 --- a/tiny/rna/counter/statistics.py +++ b/tiny/rna/counter/statistics.py @@ -1,4 +1,5 @@ import pandas as pd +import traceback import gzip import json import mmap @@ -8,11 +9,11 @@ import re from abc import abstractmethod, ABC -from typing import Tuple, Optional, Union +from typing import Tuple, Optional, Union, List, DefaultDict from collections import Counter, defaultdict from glob import glob -from ..util import make_filename +from ..util import make_filename, report_execution_time class LibraryStats: @@ -156,6 +157,8 @@ class MergedStatsManager: def 
__init__(self, Features_obj, features_csv, prefs): MergedStat.prefix = prefs.get('out_prefix', '') + self.verify_stats = prefs.get('verify_stats') + self.prefs = prefs self.merged_stats = [ FeatureCounts(Features_obj), RuleCounts(features_csv), @@ -166,19 +169,20 @@ def __init__(self, Features_obj, features_csv, prefs): if prefs.get('report_diags', False): self.merged_stats.append(MergedDiags()) + @report_execution_time('Validating and writing report files') def write_report_files(self) -> None: try: for in_process in self.merged_stats: in_process.finalize() - # Validate stats now that all libraries have been processed - StatisticsValidator(self.merged_stats).validate() + if self.verify_stats: + # Validate stats now that all libraries have been processed + StatisticsValidator(self.merged_stats).validate() finally: self.print_warnings() for output_stat in self.merged_stats: output_stat.write_output_logfile() - self.print_warnings() def add_library(self, other: LibraryStats) -> None: for merger in self.merged_stats: @@ -427,7 +431,8 @@ def add_library(self, other: LibraryStats): self.alignment_stats_df[library] = self.alignment_stats_df.index.map(stats) def finalize(self): - self.finalized = True # Nothing else to do + self.alignment_stats_df = self.sort_cols_and_round(self.alignment_stats_df) + self.finalized = True def write_output_logfile(self): assert self.finalized, "AlignmentStats object must be finalized before writing output." 
@@ -659,3 +664,142 @@ def write_selection_diags(self): selection_summary_filename = make_filename([self.prefix, 'selection_diags'], ext='.txt') with open(selection_summary_filename, 'w') as f: f.write('\n'.join(out)) + + +class StatisticsValidator: + + filename_suffix = 'stats_check.csv' + + def __init__(self, merged_stats: List[MergedStat]): + self.stats = {type(stat).__name__: stat for stat in merged_stats} + self.prefix = MergedStat.prefix + + def validate(self): + """Check the counts reported in all MergedStat objects for internal consistency. + This step should be allowed to fail without preventing outputs from being written.""" + + try: + self.validate_stats(self.stats['AlignmentStats'], self.stats['SummaryStats']) + self.validate_counts(self.stats['FeatureCounts'], self.stats['RuleCounts'], self.stats['SummaryStats']) + self.validate_nt_len_matrices(self.stats['NtLenMatrices'], self.stats['SummaryStats']) + except Exception as e: + print(traceback.format_exc(), file=sys.stderr) + MergedStat.add_warning("Error validating statistics: " + str(e)) + + def validate_stats(self, aln_stats, sum_stats): + # SummaryStats categories + MS, MR, AR = "Mapped Sequences", "Mapped Reads", "Assigned Reads" + + # AlignmentStats categories + TAR, TUR = "Total Assigned Reads", "Total Unassigned Reads" + TAS, TUS = "Total Assigned Sequences", "Total Unassigned Sequences" + ASMR, AMMR = "Assigned Single-Mapping Reads", "Assigned Multi-Mapping Reads" + RASF, RAMF = "Reads Assigned to Single Feature", "Reads Assigned to Multiple Features" + SASF, SAMF = "Sequences Assigned to Single Feature", "Sequences Assigned to Multiple Features" + + a_df = aln_stats.alignment_stats_df + s_df = sum_stats.pipeline_stats_df + table = pd.concat([s_df.loc[[MR, MS], :], a_df]) + + res = pd.DataFrame(columns=a_df.columns) + res.loc["TAS - (SASF + SAMF)"] = table.loc[TAS] - (table.loc[SASF] + table.loc[SAMF]) + res.loc["TAR - (ASMR + AMMR)"] = table.loc[TAR] - (table.loc[ASMR] + table.loc[AMMR]) + 
res.loc["TAR - (RASF + RAMF)"] = table.loc[TAR] - (table.loc[RASF] + table.loc[RAMF]) + res.loc["MS - (TUS + TAS)"] = table.loc[MS] - (table.loc[TUS] + table.loc[TAS]) + res.loc["MS - (TUS + SASF + SAMF)"] = table.loc[MS] - (table.loc[TUS] + table.loc[SASF] + table.loc[SAMF]) + res.loc["MR - (TUR + TAR)"] = table.loc[MR] - (table.loc[TUR] + table.loc[TAR]) + res.loc["MR - (TUR + RASF + RAMF)"] = table.loc[MR] - (table.loc[TUR] + table.loc[RASF] + table.loc[RAMF]) + res.loc["MR - (TUR + ASMR + AMMR)"] = table.loc[MR] - (table.loc[TUR] + table.loc[ASMR] + table.loc[AMMR]) + res.loc["Sum"] = res.sum(axis=0) + res = res.round(decimals=2) + + if not self.approx_equal(a_df.loc[TAR], s_df.loc[AR]): + MergedStat.add_warning("Alignment stats and summary stats disagree on the number of assigned reads.") + MergedStat.add_warning(self.indent_table(a_df.loc[TAR], s_df.loc[AR], index=("Alignment Stats", "Summary Stats"))) + + if not self.approx_equal(res.loc["Sum"].sum(), 0): + MergedStat.add_warning("Unexpected disagreement between alignment and summary statistics.") + MergedStat.add_warning(f"See the checksum table ({self.filename_suffix})") + MergedStat.df_to_csv(res, "Checksum", self.prefix, self.filename_suffix) + + def validate_counts(self, feat_counts, rule_counts, sum_stats): + f_df = feat_counts.feat_counts_df + r_df = rule_counts.rule_counts_df + s_df = sum_stats.pipeline_stats_df + s_sum = s_df.loc["Assigned Reads"] + + # Get sum of assigned counts in the RuleCounts table + r_rows = r_df.index.difference(["Mapped Reads"], sort=False) + r_cols = r_df.columns.difference(["Rule Index", "Rule String"], sort=False) + r_df_sum = r_df.loc[r_rows, r_cols].sum(axis=0) + r_df_mapped = r_df.loc["Mapped Reads", r_cols] + + # Get sum of assigned counts in the FeatureCounts table + f_cols = f_df.columns.difference(["Feature ID", "Classifier", "Feature Name"], sort=False) + f_df_sum = f_df[f_cols].sum(axis=0) + + # Compare assigned counts in FeatureCounts, RuleCounts, and 
SummaryStats + if not self.approx_equal(r_df_sum, f_df_sum): + MergedStat.add_warning("Rule counts and feature counts disagree on the number of assigned reads:") + MergedStat.add_warning(self.indent_table(r_df_sum, f_df_sum, index=("Rule Counts", "Feature Counts"))) + if not self.approx_equal(r_df_sum, s_sum): + MergedStat.add_warning("Rule counts and summary stats disagree on the number of assigned reads:") + MergedStat.add_warning(self.indent_table(r_df_sum, s_sum, index=("Rule Counts", "Summary Stats"))) + if r_df_sum.gt(r_df_mapped).any(): + MergedStat.add_warning("Rule counts reports assigned reads > mapped reads.") + MergedStat.add_warning(self.indent_table(r_df_sum, r_df_mapped, index=("Assigned", "Mapped"))) + + # Compare counts per classifier in FeatureCounts to corresponding rules in RuleCounts + f_cols = f_df.columns.difference(["Feature ID", "Feature Name"], sort=False) + f_cls_sums = f_df[f_cols].groupby("Classifier").sum() + r_cls_idxs = rule_counts.get_inverted_classifiers() + for cls, counts in f_cls_sums.iterrows(): + f_cls_sum = f_cls_sums.loc[cls].sum() + r_cls_sum = sum(r_df.loc[rule, r_cols].sum() for rule in r_cls_idxs[cls]) + if not self.approx_equal(f_cls_sum, r_cls_sum): + MergedStat.add_warning(f'Feature counts and rule counts disagree on counts for the "{cls}" classifier.') + MergedStat.add_warning(self.indent_vs(f_cls_sum, r_cls_sum)) + + def validate_nt_len_matrices(self, matrices, sum_stats): + s_df = sum_stats.pipeline_stats_df + w_mapped = "Length matrix for {} disagrees with summary stats on the number of mapped reads." + w_assigned = "Length matrix for {} disagrees with summary stats on the number of assigned reads." 
+ + for lib_name, (mapped, assigned) in matrices.nt_len_matrices.items(): + m_sum = mapped.sum().sum() + a_sum = assigned.sum().sum() + if not self.approx_equal(m_sum, s_df.loc["Mapped Reads", lib_name]): + MergedStat.add_warning(w_mapped.format(lib_name)) + MergedStat.add_warning(self.indent_vs(m_sum, s_df.loc['Mapped Reads', lib_name])) + if not self.approx_equal(a_sum, s_df.loc["Assigned Reads", lib_name]): + MergedStat.add_warning(w_assigned.format(lib_name)) + MergedStat.add_warning(self.indent_vs(a_sum, s_df.loc['Assigned Reads', lib_name])) + + @staticmethod + def approx_equal(x: Union[pd.Series, int], y: Union[pd.Series, int], tolerance=0.01): + """Tolerate small differences in floating point numbers due to floating point error. + The error tends to be greater for stats that are incremented many times by small + amounts. The default tolerance is definitely too generous for the error, + but seems conceptually appropriate for read counts.""" + + approx = abs(x - y) < tolerance + if isinstance(approx, pd.Series): + approx = approx.all() + + return approx + + @staticmethod + def indent_table(*series: pd.Series, index: tuple, indent=1): + """Joins the series into a table and adds its string + representation to the warning log with each line indented.""" + + table_str = str(pd.DataFrame(tuple(series), index=index)) + table_ind = '\n'.join('\t' * indent + line for line in table_str.splitlines()) + return table_ind + + @staticmethod + def indent_vs(stat_a: Union[int, float], stat_b: Union[int, float], indent=1): + """Simply indents the string representation of two numbers being compared + and formats them to two decimal places.""" + + return '\t' * indent + f"{stat_a:.2f} vs {stat_b:.2f}" From b7195e2d87de2dbf27eac86bf1b2ae78ecb59559 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Fri, 7 Apr 2023 18:41:08 -0700 Subject: [PATCH 11/34] Detail and readability improvements --- tiny/rna/counter/counter.py | 4 ++-- tiny/rna/counter/statistics.py | 
15 ++++++++------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/tiny/rna/counter/counter.py b/tiny/rna/counter/counter.py index d8d4b62c..b6967930 100644 --- a/tiny/rna/counter/counter.py +++ b/tiny/rna/counter/counter.py @@ -55,8 +55,8 @@ def get_args(): 'substring {timestamp} will be replaced with the current date and time.') optional_args.add_argument('-ng', '--normalize-by-genomic-hits', metavar='T/F', default='T', help='If T/true, normalize counts by genomic hits.') - optional_args.add_argument('-nh', '--normalize-by-feature-hits', metavar='T/F', default='T', - help='If T/true, normalize counts by (selected) overlapping feature counts.') + optional_args.add_argument('-nf', '--normalize-by-feature-hits', metavar='T/F', default='T', + help='If T/true, normalize counts by feature hits.') optional_args.add_argument('-dc', '--decollapse', action='store_true', help='Create a decollapsed copy of all SAM files listed in your Samples Sheet. ' 'This option is ignored for non-collapsed inputs.') diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py index daf9500a..8ebe3964 100644 --- a/tiny/rna/counter/statistics.py +++ b/tiny/rna/counter/statistics.py @@ -24,10 +24,9 @@ class LibraryStats: 'Reads Assigned to Single Feature', 'Sequences Assigned to Single Feature', 'Reads Assigned to Multiple Features', 'Sequences Assigned to Multiple Features'] - def __init__(self, out_prefix: str = None, report_diags: bool = False, **prefs): + def __init__(self, **prefs): self.library = {'Name': 'Unassigned', 'File': 'Unassigned', 'Norm': '1'} - self.out_prefix = out_prefix - self.diags = Diagnostics(out_prefix) if report_diags else None + self.diags = Diagnostics(**prefs) if prefs.get('report_diags') else None self.norm_gh = prefs.get('normalize_by_genomic_hits', True) self.norm_fh = prefs.get('normalize_by_feature_hits', True) @@ -184,7 +183,7 @@ def write_report_files(self) -> None: for output_stat in self.merged_stats: 
output_stat.write_output_logfile() - def add_library(self, other: LibraryStats) -> None: + def add_library_stats(self, other: LibraryStats) -> None: for merger in self.merged_stats: merger.add_library(other) self.chrom_misses.update(other.chrom_misses) @@ -196,8 +195,8 @@ def print_warnings(self): self.warnings.append("between your bowtie indexes and GFF files, or your bowtie indexes may") self.warnings.append("contain chromosomes not defined in your GFF files.") self.warnings.append("The chromosome names and their alignment counts are:") - for chr in sorted(self.chrom_misses.keys()): - self.warnings.append("\t" + chr + ": " + str(self.chrom_misses[chr])) + for chrom in sorted(self.chrom_misses.keys()): + self.warnings.append("\t" + chrom + ": " + str(self.chrom_misses[chrom])) self.warnings.append("\n") for warning in self.warnings: @@ -635,13 +634,15 @@ def add_library(self, other: LibraryStats): self.selection_diags[other_lib] = other.diags.selection_diags self.aln_diags.loc[other_lib] = other.diags.alignment_diags + def finalize(self): pass + def write_output_logfile(self): self.write_alignment_diags() # self.write_selection_diags() Not currently collected self.write_alignment_tables() def write_alignment_diags(self): - MergedStat.df_to_csv(self.aln_diags, "Sample", self.prefix, "alignment_diags", sort_axis="index") + self.df_to_csv(self.aln_diags, "Sample", self.prefix, "alignment_diags", sort_axis="index") def write_alignment_tables(self): for library_name, table in self.alignment_tables.items(): From dae09ca69813dc621754d96518b5bf25a9bba303 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Fri, 7 Apr 2023 18:59:23 -0700 Subject: [PATCH 12/34] Removing unused parameter from the Diagnostics class --- tiny/rna/counter/statistics.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py index cb9efa16..4739195d 100644 --- a/tiny/rna/counter/statistics.py +++ 
b/tiny/rna/counter/statistics.py @@ -589,8 +589,7 @@ class Diagnostics: complement = bytes.maketrans(b'ACGTacgt', b'TGCAtgca') map_strand = {True: '+', False: '-', None: '.'} - def __init__(self, out_prefix: str): - self.prefix = out_prefix + def __init__(self): self.assignment_diags = {stat: 0 for stat in Diagnostics.summary_categories} self.selection_diags = defaultdict(Counter) self.alignments = [] @@ -648,7 +647,7 @@ def add_library(self, other: LibraryStats): # self.selection_diags[other_lib] = other.diags.selection_diags Not currently collected self.assignment_diags[other_lib] = self.assignment_diags.index.map(other.diags.assignment_diags) - def finalize(self): pass + def finalize(self): pass # Nothing to do def write_output_logfile(self): self.write_assignment_diags() From a82c57cd15fdca9deb8b348d2fe772c4d22afda6 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Fri, 7 Apr 2023 19:13:51 -0700 Subject: [PATCH 13/34] Version bump to 1.4 Updating the two normalization keys in all Run Configs. Updating compatibility definitions as well. 
--- START_HERE/run_config.yml | 4 ++-- setup.py | 2 +- tests/testdata/config_files/run_config_template.yml | 10 ++++++---- .../compatibility/run_config_compatibility.yml | 6 ++++++ tiny/templates/run_config_template.yml | 10 ++++++---- 5 files changed, 21 insertions(+), 11 deletions(-) diff --git a/START_HERE/run_config.yml b/START_HERE/run_config.yml index da0aadc4..8d7d5bc5 100644 --- a/START_HERE/run_config.yml +++ b/START_HERE/run_config.yml @@ -226,7 +226,7 @@ shared_memory: False counter_all_features: False ##-- If True: counts are normalized by genomic hits (number of multi-alignments) --## -counter_normalize_by_genomic_hits: False +counter_normalize_by_genomic_hits: True ##-- If True: counts are normalized by feature hits (selected feature count per-locus) --## counter_normalize_by_feature_hits: True @@ -328,7 +328,7 @@ dir_name_logs: logs # ########################################################################################### -version: 1.3.0 +version: 1.4.0 ######--------------------------- DERIVED FROM PATHS FILE ---------------------------###### # diff --git a/setup.py b/setup.py index 72089139..cca36882 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ AUTHOR = 'Kristen Brown, Alex Tate' PLATFORM = 'Unix' REQUIRES_PYTHON = '>=3.9.0' -VERSION = '1.3.0' +VERSION = '1.4.0' REQUIRED = [] # Required packages are installed via Conda's environment.yml diff --git a/tests/testdata/config_files/run_config_template.yml b/tests/testdata/config_files/run_config_template.yml index 56d86210..e8195a36 100644 --- a/tests/testdata/config_files/run_config_template.yml +++ b/tests/testdata/config_files/run_config_template.yml @@ -225,9 +225,11 @@ shared_memory: False ##-- If True: show all parsed features in the counts csv, regardless of count/identity --## counter_all_features: False -##-- If True: counts will be normalized by genomic hits AND selected feature count --## -##-- If False: counts will only be normalized by genomic hits --## 
-counter_normalize_by_hits: True +##-- If True: counts are normalized by genomic hits (number of multi-alignments) --## +counter_normalize_by_genomic_hits: True + +##-- If True: counts are normalized by feature hits (selected feature count per-locus) --## +counter_normalize_by_feature_hits: True ##-- If True: a decollapsed copy of each SAM file will be produced (useful for IGV) --## counter_decollapse: False @@ -326,7 +328,7 @@ dir_name_logs: logs # ########################################################################################### -version: 1.3.0 +version: 1.4.0 ######--------------------------- DERIVED FROM PATHS FILE ---------------------------###### # diff --git a/tiny/templates/compatibility/run_config_compatibility.yml b/tiny/templates/compatibility/run_config_compatibility.yml index 8b38bc11..91e30785 100644 --- a/tiny/templates/compatibility/run_config_compatibility.yml +++ b/tiny/templates/compatibility/run_config_compatibility.yml @@ -6,6 +6,12 @@ # - Renames are evaluated before additions; preceding_key should use the new name if version renames it +1.4.0: + rename: + - counter_normalize_by_hits: counter_normalize_by_feature_hits + add: + - preceding_key: counter_normalize_by_feature_hits + counter_normalize_by_genomic_hits: True 1.3.0: remove: [] rename: diff --git a/tiny/templates/run_config_template.yml b/tiny/templates/run_config_template.yml index 90bc5a28..5eb18add 100644 --- a/tiny/templates/run_config_template.yml +++ b/tiny/templates/run_config_template.yml @@ -225,9 +225,11 @@ shared_memory: False ##-- If True: show all parsed features in the counts csv, regardless of count/identity --## counter_all_features: False -##-- If True: counts will be normalized by genomic hits AND selected feature count --## -##-- If False: counts will only be normalized by genomic hits --## -counter_normalize_by_hits: True +##-- If True: counts are normalized by genomic hits (number of multi-alignments) --## +counter_normalize_by_genomic_hits: True + +##-- 
If True: counts are normalized by feature hits (selected feature count per-locus) --## +counter_normalize_by_feature_hits: True ##-- If True: a decollapsed copy of each SAM file will be produced (useful for IGV) --## counter_decollapse: False @@ -326,7 +328,7 @@ dir_name_logs: logs # ########################################################################################### -version: 1.3.0 +version: 1.4.0 ######--------------------------- DERIVED FROM PATHS FILE ---------------------------###### # From fe185847e659add6ba915cb6a11ed23193871187 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Fri, 7 Apr 2023 19:22:11 -0700 Subject: [PATCH 14/34] Adding the optional stats_check.csv file to the CWL. This file is produced when alignment stats are found to not be internally consistent --- tiny/cwl/tools/tiny-count.cwl | 5 +++++ tiny/cwl/workflows/tinyrna_wf.cwl | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/tiny/cwl/tools/tiny-count.cwl b/tiny/cwl/tools/tiny-count.cwl index 069a3b16..aeb2e694 100644 --- a/tiny/cwl/tools/tiny-count.cwl +++ b/tiny/cwl/tools/tiny-count.cwl @@ -145,5 +145,10 @@ outputs: outputBinding: glob: $(inputs.out_prefix)_selection_diags.txt + stats_check: + type: File? 
+ outputBinding: + glob: "*_stats_check.csv" + console_output: type: stdout \ No newline at end of file diff --git a/tiny/cwl/workflows/tinyrna_wf.cwl b/tiny/cwl/workflows/tinyrna_wf.cwl index 567a1faa..0a877b86 100644 --- a/tiny/cwl/workflows/tinyrna_wf.cwl +++ b/tiny/cwl/workflows/tinyrna_wf.cwl @@ -229,7 +229,7 @@ steps: collapsed_fa: preprocessing/uniq_seqs out: [ feature_counts, rule_counts, norm_counts, mapped_nt_len_dist, assigned_nt_len_dist, alignment_stats, summary_stats, decollapsed_sams, alignment_tables, - assignment_diags, selection_diags ] + assignment_diags, selection_diags, stats_check ] tiny-deseq: run: ../tools/tiny-deseq.cwl @@ -326,7 +326,7 @@ steps: tiny-count/mapped_nt_len_dist, tiny-count/assigned_nt_len_dist, tiny-count/alignment_stats, tiny-count/summary_stats, tiny-count/decollapsed_sams, tiny-count/assignment_diags, tiny-count/selection_diags, tiny-count/alignment_tables, - features_csv ] + tiny-count/stats_check, features_csv ] dir_name: dir_name_tiny-count out: [ subdir ] From 51ffb71345d632e9801e79aa2c537c8029710530 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Fri, 7 Apr 2023 19:23:38 -0700 Subject: [PATCH 15/34] Raising the floating point error tolerance in StatisticsValidator.approx_equal() now that testing is complete --- tiny/rna/counter/statistics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py index 4739195d..c4e9c101 100644 --- a/tiny/rna/counter/statistics.py +++ b/tiny/rna/counter/statistics.py @@ -26,7 +26,7 @@ class LibraryStats: def __init__(self, **prefs): self.library = {'Name': 'Unassigned', 'File': 'Unassigned', 'Norm': '1'} - self.diags = Diagnostics(**prefs) if prefs.get('report_diags') else None + self.diags = Diagnostics() if prefs.get('report_diags') else None self.norm_gh = prefs.get('normalize_by_genomic_hits', True) self.norm_fh = prefs.get('normalize_by_feature_hits', True) @@ -790,7 +790,7 @@ def 
validate_nt_len_matrices(self, matrices, sum_stats): MergedStat.add_warning(self.indent_vs(a_sum, s_df.loc['Assigned Reads', lib_name])) @staticmethod - def approx_equal(x: Union[pd.Series, int], y: Union[pd.Series, int], tolerance=0.01): + def approx_equal(x: Union[pd.Series, int], y: Union[pd.Series, int], tolerance=1.0): """Tolerate small differences in floating point numbers due to floating point error. The error tends to be greater for stats that are incremented many times by small amounts. The default tolerance is definitely too generous for the error, From f4432ff7ac10e217f51cf4384e5fe54cbbe2c9b3 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sat, 8 Apr 2023 12:51:56 -0700 Subject: [PATCH 16/34] Correcting the calculation of total unassigned reads and mapped nt5/length in finalize_bundle() --- tiny/rna/counter/statistics.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py index c4e9c101..93214e80 100644 --- a/tiny/rna/counter/statistics.py +++ b/tiny/rna/counter/statistics.py @@ -89,21 +89,19 @@ def finalize_bundle(self, bundle: dict) -> None: """Called at the conclusion of processing each multiple-alignment bundle""" assigned_feat_count = len({feat[0] for feat in bundle['assigned_ftags']}) + unassigned_reads = round(bundle['unassigned_reads'], 2) + assigned_reads = round(bundle['assigned_reads'], 2) + + self.library_stats['Total Unassigned Reads'] += unassigned_reads + bundle['nt5_mapped'][bundle['seq_length']] += assigned_reads + unassigned_reads if assigned_feat_count == 0: self.library_stats['Total Unassigned Sequences'] += 1 else: self.library_stats['Total Assigned Sequences'] += 1 - assigned_reads = round(bundle['assigned_reads'], 2) - unassigned_reads = round(bundle['unassigned_reads'], 2) - - self.library_stats['Total Unassigned Reads'] += unassigned_reads self.library_stats['Total Assigned Reads'] += assigned_reads 
self.library_stats[bundle['mapping_stat']] += assigned_reads - - seqlen = bundle['seq_length'] - bundle['nt5_assigned'][seqlen] += assigned_reads - bundle['nt5_mapped'][seqlen] += assigned_reads + unassigned_reads + bundle['nt5_assigned'][bundle['seq_length']] += assigned_reads if assigned_feat_count == 1: self.library_stats['Reads Assigned to Single Feature'] += assigned_reads From 5279a61dd4a77d916d78674d90377b93e86bcb7c Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sat, 8 Apr 2023 12:58:53 -0700 Subject: [PATCH 17/34] SummaryStats now reports Mapped Reads as Normalized Mapped Reads and Non-normalized Mapped Reads when the hit normalization steps are disabled. This allows us to report the true mapped read count while still maintaining valid proportion calculations in tiny-plot for rule_charts and class_charts. Otherwise, only the Mapped Read stat is reported when default normalization is used. StatisticsValidator verifies that Normalized Mapped Reads <= Non-normalized Mapped Reads when these stats are reported --- tiny/rna/counter/statistics.py | 50 +++++++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 13 deletions(-) diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py index 93214e80..3fa00673 100644 --- a/tiny/rna/counter/statistics.py +++ b/tiny/rna/counter/statistics.py @@ -18,7 +18,8 @@ class LibraryStats: - summary_categories = ['Total Assigned Reads', 'Total Unassigned Reads', + summary_categories = ['Mapped Reads Basis', + 'Total Assigned Reads', 'Total Unassigned Reads', 'Total Assigned Sequences', 'Total Unassigned Sequences', 'Assigned Single-Mapping Reads', 'Assigned Multi-Mapping Reads', 'Reads Assigned to Single Feature', 'Sequences Assigned to Single Feature', @@ -47,6 +48,7 @@ def count_bundle(self, aln_bundle: iter, read_counts: int) -> dict: loci_counts = len(aln_bundle) corr_counts = round(read_counts / loci_counts, 2) if self.norm_gh else read_counts nt5, seqlen = 
bundle_read['nt5end'], bundle_read['Length'] + self.library_stats['Mapped Reads Basis'] += read_counts return { 'read_count': read_counts, @@ -118,7 +120,7 @@ class MergedStat(ABC): def add_library(self, other: LibraryStats): ... @abstractmethod - def finalize(self): ... + def finalize(self): ... # Called before validating stats @abstractmethod def write_output_logfile(self): ... @@ -158,7 +160,7 @@ def __init__(self, Features_obj, features_csv, prefs): self.merged_stats = [ FeatureCounts(Features_obj), RuleCounts(features_csv), - AlignmentStats(), SummaryStats(), + AlignmentStats(), SummaryStats(prefs), NtLenMatrices(prefs) ] @@ -427,6 +429,7 @@ def add_library(self, other: LibraryStats): self.alignment_stats_df[library] = self.alignment_stats_df.index.map(stats) def finalize(self): + self.alignment_stats_df.drop(index="Mapped Reads Basis", inplace=True) # Handled by SummaryStats self.alignment_stats_df = self.sort_cols_and_round(self.alignment_stats_df) self.finalized = True @@ -437,15 +440,17 @@ def write_output_logfile(self): class SummaryStats(MergedStat): - constant_categories = ["Mapped Sequences", "Mapped Reads", "Assigned Reads"] conditional_categories = ["Total Reads", "Retained Reads", "Unique Sequences"] - summary_categories = conditional_categories + constant_categories + counted_categories = ["Mapped Sequences", "Normalized Mapped Reads", "Mapped Reads", "Assigned Reads"] + summary_categories = conditional_categories + counted_categories pipeline_stats_df = pd.DataFrame(index=summary_categories) - def __init__(self): - self.missing_fastp_outputs = [] + def __init__(self, prefs): + self.norm_gh = prefs.get('normalize_by_genomic_hits', True) + self.norm_fh = prefs.get('normalize_by_feature_hits', True) self.missing_collapser_outputs = [] + self.missing_fastp_outputs = [] self.finalized = False def add_library(self, other: LibraryStats): @@ -455,11 +460,15 @@ def add_library(self, other: LibraryStats): other.library['basename'] = 
self.get_library_basename(other) other_summary = { - "Mapped Sequences": self.get_mapped_seqs(other), - "Mapped Reads": self.get_mapped_reads(other), + "Mapped Sequences": self.calculate_mapped_seqs(other), + "Mapped Reads": self.calculate_mapped_reads(other), "Assigned Reads": other.library_stats["Total Assigned Reads"] } + if not (self.norm_gh and self.norm_fh): + norm_mapped = other.library_stats['Mapped Reads Basis'] + other_summary['Normalized Mapped Reads'] = norm_mapped + if self.library_has_fastp_outputs(other): total_reads, retained_reads = self.get_fastp_stats(other) other_summary.update({ @@ -474,14 +483,14 @@ def add_library(self, other: LibraryStats): self.pipeline_stats_df[other.library["Name"]] = self.pipeline_stats_df.index.map(other_summary) @staticmethod - def get_mapped_seqs(other: LibraryStats): + def calculate_mapped_seqs(other: LibraryStats): return sum([other.library_stats[stat] for stat in [ "Total Assigned Sequences", "Total Unassigned Sequences" ]]) @staticmethod - def get_mapped_reads(other: LibraryStats): + def calculate_mapped_reads(other: LibraryStats): return sum([other.library_stats[stat] for stat in [ 'Reads Assigned to Multiple Features', 'Reads Assigned to Single Feature', @@ -506,7 +515,18 @@ def finalize(self): def write_output_logfile(self): assert self.finalized, "SummaryStats object must be finalized before writing output." - self.df_to_csv(self.pipeline_stats_df, "Summary Statistics", self.prefix, "summary_stats") + + if self.norm_gh and self.norm_fh: + # No need to differentiate when default normalization is on + irrelevant = "Normalized Mapped Reads" + out_df = self.pipeline_stats_df.drop(index=irrelevant) + else: + # With normalization steps turned off, the Mapped Reads stat might be + # inflated but necessary for calculating proportions in tiny-plot. 
+ differentiate = {'Mapped Reads': "Non-normalized Mapped Reads"} + out_df = self.pipeline_stats_df.rename(index=differentiate) + + self.df_to_csv(out_df, "Summary Statistics", self.prefix, "summary_stats") def library_has_collapser_outputs(self, other: LibraryStats) -> bool: # Collapser outputs may have been gzipped. Accept either filename. @@ -700,7 +720,7 @@ def validate(self): def validate_stats(self, aln_stats, sum_stats): # SummaryStats categories - MS, MR, AR = "Mapped Sequences", "Mapped Reads", "Assigned Reads" + MS, MR, AR, NMR = "Mapped Sequences", "Mapped Reads", "Assigned Reads", "Normalized Mapped Reads" # AlignmentStats categories TAR, TUR = "Total Assigned Reads", "Total Unassigned Reads" @@ -729,6 +749,10 @@ def validate_stats(self, aln_stats, sum_stats): MergedStat.add_warning("Alignment stats and summary stats disagree on the number of assigned reads.") MergedStat.add_warning(self.indent_table(a_df.loc[TAR], s_df.loc[AR], index=("Alignment Stats", "Summary Stats"))) + if s_df.loc[NMR].gt(s_df.loc[MR]).any(): + MergedStat.add_warning("Summary stats reports normalized mapped reads > non-normalized mapped reads.") + MergedStat.add_warning(self.indent_table(s_df.loc[NMR], s_df.loc[MR], index=("Normalized", "Non-normalized"))) + if not self.approx_equal(res.loc["Sum"].sum(), 0): MergedStat.add_warning("Unexpected disagreement between alignment and summary statistics.") MergedStat.add_warning(f"See the checksum table ({self.filename_suffix})") MergedStat.df_to_csv(res, "Checksum", self.prefix, self.filename_suffix) From aeda3eb8e570befb026865bc411d8820e98a9daf Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sat, 8 Apr 2023 13:00:02 -0700 Subject: [PATCH 18/34] tiny-plot has been updated to use Non-normalized Mapped Reads instead of Mapped Reads when it is reported so that proper proportion calculations can be made for class_charts and rule_charts --- tiny/rna/plotter.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/tiny/rna/plotter.py b/tiny/rna/plotter.py index 
abcdcf79..1271d325 100644 --- a/tiny/rna/plotter.py +++ b/tiny/rna/plotter.py @@ -256,6 +256,13 @@ def get_proportions_df(counts_df: pd.DataFrame, mapped_totals: pd.Series, un: st def load_mapped_reads(summary_stats_file: str) -> pd.Series: """Produces a Series of mapped reads per library for calculating proportions in class charts. + If normalization by genomic hits and/or feature hits was disabled during the counting run, + then the normal "Mapped Reads" calculation will be an inflated count, so in summary stats + it is differentiated as: + "Normalized Mapped Reads" and "Non-normalized Mapped Reads" + + For proper calculation of proportions with non-normalized counts, the + "Non-normalized Mapped Reads" stat has to be used. Args: summary_stats_file: the summary stats csv produced by tiny-count @@ -263,7 +270,11 @@ def load_mapped_reads(summary_stats_file: str) -> pd.Series: Returns: a Series containing mapped reads per sample """ - return pd.read_csv(summary_stats_file, index_col=0).loc["Mapped Reads",:] + stats = pd.read_csv(summary_stats_file, index_col=0) + if "Mapped Reads" in stats.index: + return stats.loc['Mapped Reads'] + else: + return stats.loc['Non-normalized Mapped Reads'] def get_sample_rep_dict(df: pd.DataFrame) -> dict: From 9d982947c6c97fb81c9c0e762089fefc4eb67188 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sat, 8 Apr 2023 13:00:47 -0700 Subject: [PATCH 19/34] Minor bugfix unrelated to issue-295 --- tiny/rna/configuration.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tiny/rna/configuration.py b/tiny/rna/configuration.py index 20c49984..53b7e0fc 100644 --- a/tiny/rna/configuration.py +++ b/tiny/rna/configuration.py @@ -896,8 +896,8 @@ def check_backward_compatibility(self, header_vals): if 'mismatches' not in header_vals_lc: compat_errors.append('\n'.join([ "It looks like you're using a Features Sheet from an earlier version of", - 'tinyRNA. An additional column, "Mismatches", is now expected. 
Please review' - "the Stage 2 section in tiny-count's documentation for more info, then add" + 'tinyRNA. An additional column, "Mismatches", is now expected. Please review', + "the Stage 2 section in tiny-count's documentation for more info, then add", "the new column to your Features Sheet to avoid this error." ])) From 23e10795aa2cd14f27c899e1ad68f4c51ac7d365 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sat, 8 Apr 2023 13:18:49 -0700 Subject: [PATCH 20/34] Documentation updates for the new normalization options --- README.md | 9 ++++++++- doc/Configuration.md | 2 +- doc/Parameters.md | 17 ++++++++++++----- doc/tiny-count.md | 2 +- 4 files changed, 22 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 887e5370..0287395b 100644 --- a/README.md +++ b/README.md @@ -196,7 +196,7 @@ And this feature matched a rule in your Features Sheet defining _Classify as..._ | 406904 | miRNA | mir-1, hsa-miR-1 | 1234 | 999 | ... | #### Normalized Counts -If your Samples Sheet has settings for Normalization, an additional copy of the Feature Counts table is produced with the specified per-library normalizations applied. +If your Samples Sheet has settings for Normalization, an additional copy of the Feature Counts table is produced with the specified per-library normalizations applied. Note that these normalizations are [unrelated to normalization by genomic/feature hits](doc/Configuration.md#applying-custom-normalization). #### Counts by Rule This table shows the counts assigned by each rule on a per-library basis. It is indexed by the rule's corresponding row number in the Features Sheet, where the first non-header row is considered row 0. For convenience a Rule String column is added which contains a human friendly concatenation of each rule. Finally, a Mapped Reads row is added which represents each library's total read counts which were available for assignment prior to counting/selection. 
@@ -229,6 +229,13 @@ A single table of summary statistics includes columns for each library and the f | Mapped Reads | Total genome-mapping reads passing quality filtering prior to counting/selection | | | Assigned Reads | Total genome-mapping reads passing quality filtering that were assigned to at least one feature due to a rule match in your Features Sheet | +When normalization by feature and/or genomic hits is disabled, the following stats are reported instead of `Mapped Reads`: + +| Stat | Description | +|-----------------------------|----------------------------------------------------------------------------------| +| Normalized Mapped Reads | The true mapped read count | +| Non-normalized Mapped Reads | The sum of assigned and unassigned reads according to the normalization settings | + #### 5'nt vs. Length Matrix During counting, size and 5' nt distribution tables are created for each library. The distribution of lengths and 5' nt can be used to assess the overall quality of your libraries. This can also be used for analyzing small RNA distributions in non-model organisms without annotations. diff --git a/doc/Configuration.md b/doc/Configuration.md index 44026262..26faf28a 100644 --- a/doc/Configuration.md +++ b/doc/Configuration.md @@ -136,7 +136,7 @@ Supported values are: - **Any number**: the corresponding library's counts are divided by this number (useful for spike-in normalization) - **RPM or rpm**: the corresponding library's counts are divided by (its mapped read count / 1,000,000) ->**NOTE**: These normalizations operate independently of tiny-count's --normalize-by-hits commandline option. The former is concerned with per-library normalization, whereas the latter is concerned with normalization by selected feature count at each locus ([more info](tiny-count.md#count-normalization)). The commandline option does not enable or disable the normalizations detailed above. 
+>**NOTE**: These normalizations operate independently of tiny-count's --normalize-by-genomic/feature-hits commandline options. The former is concerned with per-library normalization, whereas the latter is concerned with normalization by each sequence's alignment count and the number of selected features at each locus ([more info](tiny-count.md#count-normalization)). The commandline option does not enable or disable the normalizations detailed above. ### Low DF Experiments DESeq2 requires that your experiment design has at least one degree of freedom. If your experiment doesn't include at least one sample group with more than one replicate, tiny-deseq.r will be skipped and DGE related plots will not be produced. diff --git a/doc/Parameters.md b/doc/Parameters.md index 915c7a9c..7de40edf 100644 --- a/doc/Parameters.md +++ b/doc/Parameters.md @@ -70,12 +70,19 @@ Optional arguments: Copies the template configuration files required by tiny-count into the current directory. This argument can't be combined with `--paths-file`. All other arguments are ignored when provided, and once the templates have been copied tiny-count exits. -### Normalize by Hits -| Run Config Key | Commandline Argument | -|----------------------------|---------------------------| -| counter-normalize-by-hits: | `--normalize-by-hits T/F` | +### Normalize by Genomic Hits +| Run Config Key | Commandline Argument | +|------------------------------------|-----------------------------------| +| counter_normalize_by_genomic_hits: | `--normalize-by-genomic-hits T/F` | -By default, tiny-count will divide the number of counts associated with each sequence, twice, before they are assigned to a feature. Each unique sequence's count is determined by tiny-collapse (or a compatible collapsing utility) and is preserved through the alignment process. The original count is divided first by the number of loci that the sequence aligns to, and second by the number of features passing selection at each locus. 
Switching this option "off" disables the latter normalization step. +By default, tiny-count will increment feature counts by a normalized amount to avoid overcounting. Each unique sequence's read count is determined by tiny-collapse (or a compatible collapsing utility) and is preserved through the alignment process. For sequences with multiple alignments, a portion of the sequence's original count is allocated to each of its alignments to be assigned to features that pass selection at the locus. This portion is the original count divided by the number of alignments, or _genomic hits_. By disabling this normalization step, each of the sequence's alignments will be allocated the full original read count rather than the normalized portion. + +### Normalize by Feature Hits +| Run Config Key | Commandline Argument | +|------------------------------------|-----------------------------------| +| counter_normalize_by_feature_hits: | `--normalize-by-feature-hits T/F` | + +By default, tiny-count will increment feature counts by a normalized amount to avoid overcounting. Each sequence alignment locus is allocated a portion of the sequence's original read count (depending on `counter_normalize_by_genomic_hits`), and once selection is complete the allocated count is divided by the number of selected features, or _feature hits_, at the alignment. The resulting value is added to the totals for each matching feature. By disabling this normalization step, each selected feature will receive the full amount allocated to the locus rather than the normalized portion. ### Decollapse | Run Config Key | Commandline Argument | diff --git a/doc/tiny-count.md b/doc/tiny-count.md index 93af9b64..79f80f0e 100644 --- a/doc/tiny-count.md +++ b/doc/tiny-count.md @@ -138,7 +138,7 @@ Examples: >**Tip:** you may specify U and T bases in your rules. Uracil bases will be converted to thymine when your Features Sheet is loaded. N bases are also allowed. 
## Count Normalization -Small RNA reads passing selection will receive a normalized count increment. By default, read counts are normalized twice before being assigned to a feature. The second normalization step can be disabled in `run_config.yml` if desired. Counts for each small RNA sequence are divided: +Small RNA reads passing selection will receive a normalized count increment. By default, read counts are normalized twice before being assigned to a feature. Both normalization steps can be disabled in `run_config.yml` if desired. Counts for each small RNA sequence are divided: 1. By the number of loci it aligns to in the genome. 2. By the number of _selected_ features for each of its alignments. From 5739a943941ec3317d2011d94727d829a5d41e83 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sat, 8 Apr 2023 14:17:53 -0700 Subject: [PATCH 21/34] Cleaning up the tiny-count helpstring for normalization and stats verification so that it reads and formats more cleanly. Also updating the helpstring in Parameters.md --- doc/Parameters.md | 14 +++++++++----- tiny/rna/counter/counter.py | 8 ++++---- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/doc/Parameters.md b/doc/Parameters.md index 7de40edf..d085c17a 100644 --- a/doc/Parameters.md +++ b/doc/Parameters.md @@ -114,8 +114,8 @@ Diagnostic information will include intermediate alignment files for each librar ### Full tiny-count Help String ``` -tiny-count (-pf FILE | --get-templates) [-o PREFIX] [-nh T/F] [-dc] - [-sv {Cython,HTSeq}] [-p] [-d] +tiny-count (-pf FILE | --get-templates) [-o PREFIX] [-ng T/F] [-nf T/F] + [-vs T/F] [-dc] [-sv {Cython,HTSeq}] [-p] [-d] tiny-count is a precision counting tool for hierarchical classification and quantification of small RNA-seq reads @@ -139,9 +139,13 @@ Optional arguments: occurrences of the substring {timestamp} will be replaced with the current date and time. 
(default: tiny-count_{timestamp}) - -nh T/F, --normalize-by-hits T/F - If T/true, normalize counts by (selected) overlapping - feature counts. (default: T) + -ng T/F, --normalize-by-genomic-hits T/F + Normalize counts by genomic hits. (default: T) + -nf T/F, --normalize-by-feature-hits T/F + Normalize counts by feature hits. (default: T) + -vs T/F, --verify-stats T/F + Verify that all reported stats are internally + consistent. (default: T) -dc, --decollapse Create a decollapsed copy of all SAM files listed in your Samples Sheet. This option is ignored for non- collapsed inputs. (default: False) diff --git a/tiny/rna/counter/counter.py b/tiny/rna/counter/counter.py index b6967930..4bb95f7d 100644 --- a/tiny/rna/counter/counter.py +++ b/tiny/rna/counter/counter.py @@ -54,17 +54,17 @@ def get_args(): help='The output prefix to use for file names. All occurrences of the ' 'substring {timestamp} will be replaced with the current date and time.') optional_args.add_argument('-ng', '--normalize-by-genomic-hits', metavar='T/F', default='T', - help='If T/true, normalize counts by genomic hits.') + help='Normalize counts by genomic hits.') optional_args.add_argument('-nf', '--normalize-by-feature-hits', metavar='T/F', default='T', - help='If T/true, normalize counts by feature hits.') + help='Normalize counts by feature hits.') + optional_args.add_argument('-vs', '--verify-stats', metavar='T/F', default='T', + help='Verify that all reported stats are internally consistent.') optional_args.add_argument('-dc', '--decollapse', action='store_true', help='Create a decollapsed copy of all SAM files listed in your Samples Sheet. 
' 'This option is ignored for non-collapsed inputs.') optional_args.add_argument('-sv', '--stepvector', choices=['Cython', 'HTSeq'], default='Cython', help='Select which StepVector implementation is used to find ' 'features overlapping an interval.') - optional_args.add_argument('-vs', '--verify-stats', metavar='T/F', default='T', - help='If T/true, verify that all reported stats are internally consistent.') optional_args.add_argument('-a', '--all-features', action='store_true', help=argparse.SUPPRESS) #help='Represent all features in output counts table, ' # 'even if they did not match in Stage 1 selection.') From 7743fa20be06e09fe1f1484e5d615464534b6cdc Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sun, 9 Apr 2023 12:29:58 -0700 Subject: [PATCH 22/34] Increasing rounding precision. 2 decimal places was far too strict for intermediate operations. Also increasing the equality tolerance by a small amount to help with erroneous warnings for large counting tasks --- tiny/rna/counter/statistics.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py index 3fa00673..cf24f975 100644 --- a/tiny/rna/counter/statistics.py +++ b/tiny/rna/counter/statistics.py @@ -46,7 +46,7 @@ def count_bundle(self, aln_bundle: iter, read_counts: int) -> dict: bundle_read = aln_bundle[0] loci_counts = len(aln_bundle) - corr_counts = round(read_counts / loci_counts, 2) if self.norm_gh else read_counts + corr_counts = round(read_counts / loci_counts, 8) if self.norm_gh else read_counts nt5, seqlen = bundle_read['nt5end'], bundle_read['Length'] self.library_stats['Mapped Reads Basis'] += read_counts @@ -74,7 +74,7 @@ def count_bundle_assignments(self, bundle: dict, aln: dict, assignments: dict, n if asgn_count == 0: bundle['unassigned_reads'] += corr_count else: - fcorr_count = round(corr_count / asgn_count, 2) if self.norm_fh else corr_count + fcorr_count = round(corr_count / 
asgn_count, 8) if self.norm_fh else corr_count bundle['assigned_reads'] += fcorr_count * asgn_count bundle['assigned_ftags'] |= assignments.keys() @@ -91,8 +91,8 @@ def finalize_bundle(self, bundle: dict) -> None: """Called at the conclusion of processing each multiple-alignment bundle""" assigned_feat_count = len({feat[0] for feat in bundle['assigned_ftags']}) - unassigned_reads = round(bundle['unassigned_reads'], 2) - assigned_reads = round(bundle['assigned_reads'], 2) + unassigned_reads = round(bundle['unassigned_reads'], 8) + assigned_reads = round(bundle['assigned_reads'], 8) self.library_stats['Total Unassigned Reads'] += unassigned_reads bundle['nt5_mapped'][bundle['seq_length']] += assigned_reads + unassigned_reads @@ -812,11 +812,10 @@ def validate_nt_len_matrices(self, matrices, sum_stats): MergedStat.add_warning(self.indent_vs(a_sum, s_df.loc['Assigned Reads', lib_name])) @staticmethod - def approx_equal(x: Union[pd.Series, int], y: Union[pd.Series, int], tolerance=1.0): + def approx_equal(x: Union[pd.Series, int], y: Union[pd.Series, int], tolerance=2.0): """Tolerate small differences in floating point numbers due to floating point error. The error tends to be greater for stats that are incremented many times by small - amounts. The default tolerance is definitely too generous for the error, - but seems conceptually appropriate for read counts.""" + amounts. The default tolerance is likely too generous.""" approx = abs(x - y) < tolerance if isinstance(approx, pd.Series): From 5f489ffeb53b4b52cc47e68afb0aff4aa7e35d8f Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sun, 16 Apr 2023 16:18:31 -0700 Subject: [PATCH 23/34] Removing rounding steps for floating point error mitigation. 
After further reflection and testing, this was hurting accuracy more than it was helping --- tiny/rna/counter/statistics.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py index cf24f975..5de1297e 100644 --- a/tiny/rna/counter/statistics.py +++ b/tiny/rna/counter/statistics.py @@ -46,7 +46,7 @@ def count_bundle(self, aln_bundle: iter, read_counts: int) -> dict: bundle_read = aln_bundle[0] loci_counts = len(aln_bundle) - corr_counts = round(read_counts / loci_counts, 8) if self.norm_gh else read_counts + corr_counts = read_counts / loci_counts if self.norm_gh else read_counts nt5, seqlen = bundle_read['nt5end'], bundle_read['Length'] self.library_stats['Mapped Reads Basis'] += read_counts @@ -74,7 +74,7 @@ def count_bundle_assignments(self, bundle: dict, aln: dict, assignments: dict, n if asgn_count == 0: bundle['unassigned_reads'] += corr_count else: - fcorr_count = round(corr_count / asgn_count, 8) if self.norm_fh else corr_count + fcorr_count = corr_count / asgn_count if self.norm_fh else corr_count bundle['assigned_reads'] += fcorr_count * asgn_count bundle['assigned_ftags'] |= assignments.keys() @@ -91,8 +91,8 @@ def finalize_bundle(self, bundle: dict) -> None: """Called at the conclusion of processing each multiple-alignment bundle""" assigned_feat_count = len({feat[0] for feat in bundle['assigned_ftags']}) - unassigned_reads = round(bundle['unassigned_reads'], 8) - assigned_reads = round(bundle['assigned_reads'], 8) + unassigned_reads = bundle['unassigned_reads'] + assigned_reads = bundle['assigned_reads'] self.library_stats['Total Unassigned Reads'] += unassigned_reads bundle['nt5_mapped'][bundle['seq_length']] += assigned_reads + unassigned_reads From 5ee804fafb9c961ccbc5d753d1b6bb3b0c2eb2e9 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sun, 16 Apr 2023 16:22:02 -0700 Subject: [PATCH 24/34] Adding clarified docstrings to the finalize() and 
write_output_logfile() abstract methods. After further testing I realized that the trouble we were seeing with internal consistency disagreement was actually caused by prematurely rounding final counts in finalize(). Counts should instead be rounded to 2 decimal places in write_output_logfile(). --- tiny/rna/counter/statistics.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py index 5de1297e..c0b1017b 100644 --- a/tiny/rna/counter/statistics.py +++ b/tiny/rna/counter/statistics.py @@ -120,10 +120,21 @@ class MergedStat(ABC): def add_library(self, other: LibraryStats): ... @abstractmethod - def finalize(self): ... # Called before validating stats + def finalize(self): + """Called once all libraries have been added to the MergedStatsManager. + The implementation of this method should perform any preparation steps required for + stats validation, such as building dataframes or sorting columns so the appropriate + comparisons can be made, but it should not perform any operations that reduce decimal + precision, such as rounding or conversion to int. Validation depends on this precision + for internal consistency checks. Rounding should instead be done in write_output_logfile()""" + ... @abstractmethod - def write_output_logfile(self): ... + def write_output_logfile(self): + """Called once all stats classes have been validated. + The implementation of this method should round counts to the appropriate decimal + precision or convert float values to int if necessary, then write output CSV files.""" + ... @staticmethod def add_warning(msg): From 3bde3d376f79d28de6a74017cb565ba20eb5f35e Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sun, 16 Apr 2023 16:24:36 -0700 Subject: [PATCH 25/34] Updating df_to_csv() so that FeatureCounts can use it with its multiindex. 
It also makes more sense to set the index name in each class' constructor instead of in this method --- tiny/rna/counter/statistics.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py index c0b1017b..a70e5636 100644 --- a/tiny/rna/counter/statistics.py +++ b/tiny/rna/counter/statistics.py @@ -146,16 +146,26 @@ def sort_cols_and_round(df: pd.DataFrame, axis="columns") -> pd.DataFrame: return df.round(decimals=2).sort_index(axis=axis) @staticmethod - def df_to_csv(df: pd.DataFrame, idx_name: str, prefix: str, postfix: str = None, sort_axis="columns"): + def df_to_csv(df: pd.DataFrame, prefix: str, postfix: str = None, sort_axis="columns"): + """Rounds counts, optionally sorts , and writes the dataframe + to CSV with the appropriate index label and filename. + + Args: + df: The dataframe to write to CSV + prefix: The output filename prefix (required) + postfix: The optional filename postfix. If not provided, a postfix is created from + the index name, which will be split on space, joined on "_", and made lowercase. + sort_axis: The axis to sort. Default: columns. If None, neither axis is sorted. 
+ """ + if postfix is None: - postfix = '_'.join(map(str.lower, idx_name.split(' '))) + postfix = '_'.join(map(str.lower, df.index.name.split(' '))) if sort_axis is not None: out_df = MergedStat.sort_cols_and_round(df, axis=sort_axis) else: out_df = df.round(decimals=2) - out_df.index.name = idx_name file_name = make_filename([prefix, postfix]) out_df.to_csv(file_name) From f45ab41fb5b7f74845d7cc18b8fe5290c016d6cb Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sun, 16 Apr 2023 16:29:57 -0700 Subject: [PATCH 26/34] Updating FeatureCounts to label its multiindex during construction, to not round its values until write_output_logfile(), and to use MergedStats.df_to_csv() to write the feature_counts and norm_counts tables for consistency --- tiny/rna/counter/statistics.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py index a70e5636..e637532b 100644 --- a/tiny/rna/counter/statistics.py +++ b/tiny/rna/counter/statistics.py @@ -226,6 +226,7 @@ def print_warnings(self): class FeatureCounts(MergedStat): def __init__(self, Features_obj): self.feat_counts_df = pd.DataFrame(index=set.union(*Features_obj.classes.values())) + self.feat_counts_df.index.names = ["Feature ID", "Classifier"] self.aliases = Features_obj.aliases self.finalized = False self.norm_prefs = {} @@ -250,12 +251,10 @@ def finalize(self): Aliases are concatenated by ", " if multiple exist for a feature. 
Alias values are not added for the "ID" tag as the Feature ID column already contains this information.""" - # Sort columns by title and round all counts to 2 decimal places - summary = self.sort_cols_and_round(self.feat_counts_df) + # Order columns alphabetically by header (sample_rep_n) + summary = self.feat_counts_df.sort_index(axis="columns") # Add Feature Name column, which is the feature alias (default is blank if no alias exists) summary.insert(0, "Feature Name", summary.index.map(lambda feat: ', '.join(self.aliases.get(feat[0], '')))) - # Sort by index, make index its own column, and rename it to Feature ID - summary = summary.sort_index().reset_index().rename(columns={"level_0": "Feature ID", "level_1": "Classifier"}) self.feat_counts_df = summary self.finalized = True @@ -265,7 +264,7 @@ def write_feature_counts(self): Should be called after finalize()""" assert self.finalized, "FeatureCounts object must be finalized before writing output." - self.feat_counts_df.to_csv(self.prefix + '_feature_counts.csv', index=False) + self.df_to_csv(self.feat_counts_df, self.prefix, "feature_counts", sort_axis="index") def write_norm_counts(self): """Writes normalized feature counts to {prefix}_norm_counts.csv @@ -294,7 +293,7 @@ def write_norm_counts(self): counts[library] = counts[library] / factor - counts.to_csv(self.prefix + '_norm_counts.csv', index=False) + self.df_to_csv(counts, self.prefix, "norm_counts", sort_axis="index") class RuleCounts(MergedStat): From 63ed696478718f65d605c82e8f098fec86ab2efb Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sun, 16 Apr 2023 16:32:22 -0700 Subject: [PATCH 27/34] Updating RuleCounts to populate and label its index at construction and not round its values until write_output_logfile(). Also simplifying the dataframe operations in add_library() and finalize() so that they are consistent with other MergedStats classes. 
--- tiny/rna/counter/statistics.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py index e637532b..6502164f 100644 --- a/tiny/rna/counter/statistics.py +++ b/tiny/rna/counter/statistics.py @@ -299,21 +299,19 @@ def write_norm_counts(self): class RuleCounts(MergedStat): def __init__(self, features_csv): self.rules = self.read_features_sheet(features_csv) - self.rule_counts_df = pd.DataFrame() + self.rule_counts_df = pd.DataFrame(index=range(len(self.rules))).rename_axis("Rule Index") self.finalized = False def add_library(self, other: LibraryStats): assert not self.finalized, "Cannot add libraries after RuleCounts object has been finalized." - counts = pd.Series(other.rule_counts, name=other.library['Name']) - self.rule_counts_df = self.rule_counts_df.join(counts, how='outer') + self.rule_counts_df[other.library["Name"]] = self.rule_counts_df.index.map(other.rule_counts) def finalize(self): - rules = self.get_rule_strings() - self.rule_counts_df = self.sort_cols_and_round(self.rule_counts_df) + self.rule_counts_df = self.rule_counts_df.sort_index(axis="columns").fillna(0) - # Add Rule String column to the left of the counts - order = ["Rule String"] + self.rule_counts_df.columns.to_list() - self.rule_counts_df = self.rule_counts_df.join(pd.Series(rules, name="Rule String"), how="outer")[order].fillna(0) + # Add Rule String column + rules = self.get_rule_strings() + self.rule_counts_df.insert(0, "Rule String", rules) # Add Mapped Reads row below the counts self.rule_counts_df.loc["Mapped Reads"] = SummaryStats.pipeline_stats_df.loc["Mapped Reads"] @@ -321,7 +319,7 @@ def finalize(self): def write_output_logfile(self): assert self.finalized, "RuleCounts object must be finalized before writing output." 
- self.df_to_csv(self.rule_counts_df, "Rule Index", self.prefix, 'counts_by_rule', sort_axis=None) + self.df_to_csv(self.rule_counts_df, self.prefix, 'counts_by_rule', sort_axis=None) def read_features_sheet(self, features_csv): """Reads the Features Sheet and returns a list of rules in From fcda06de882861563fdf5d0957846511da737702 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sun, 16 Apr 2023 16:43:40 -0700 Subject: [PATCH 28/34] Updating NtLenMatrices to not round its values until write_output_logfile(), and to use MergedStats.df_to_csv() to write its outputs for consistency. --- tiny/rna/counter/statistics.py | 37 +++++++++++++++++----------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py index 6502164f..4ba99419 100644 --- a/tiny/rna/counter/statistics.py +++ b/tiny/rna/counter/statistics.py @@ -374,18 +374,23 @@ def add_library(self, other: LibraryStats): def finalize(self): self.nt_len_matrices = dict(sorted(self.nt_len_matrices.items())) for lib_name, (mapped, assigned) in self.nt_len_matrices.items(): - mapped_nt_len_df = self.finalize_mapped(mapped) - assigned_nt_len_df = self.finalize_assigned(assigned) + mapped_nt_len_df = self._finalize_mapped(mapped) + assigned_nt_len_df = self._finalize_assigned(assigned) self.nt_len_matrices[lib_name] = (mapped_nt_len_df, assigned_nt_len_df) self.finalized = True - def finalize_mapped(self, mapped: DefaultDict[str, Counter]) -> pd.DataFrame: - return pd.DataFrame(mapped).sort_index().fillna(0).round(decimals=2) + def _finalize_mapped(self, mapped: DefaultDict[str, Counter]) -> pd.DataFrame: + return (pd.DataFrame(mapped, dtype='float64') + .rename_axis("Length") + .sort_index() + .fillna(0)) - def finalize_assigned(self, assigned: DefaultDict[str, Counter]) -> pd.DataFrame: + def _finalize_assigned(self, assigned: DefaultDict[str, Counter]) -> pd.DataFrame: # Fractional counts due to (loci count) and/or 
(assigned feature count) normalization - assigned_nt_len_df = pd.DataFrame(assigned, dtype='float64').sort_index().round(decimals=2) + assigned_nt_len_df = (pd.DataFrame(assigned, dtype='float64') + .rename_axis("Length") + .sort_index()) # Drop non-nucleotide columns if they don't contain counts assigned_nt_len_df.drop([ @@ -413,27 +418,21 @@ def _write_mapped(self, mapped: pd.DataFrame, lib_name: str): normalization step is disabled, these counts will be fractional. However, even when both normalization steps are enabled, we still get fractional counts that are very close to a whole number due to the way these counts - are tallied. They are fractional up to this point for the validation step. + are tallied. Floating point error is at the root of this issue. Up to + this point, they are fractional and rounded only by the limitations of + float representation for the purpose of validation.""" - Floating point error is at the root of this issue. If in LibraryStats we - instead increment these counts by the raw read count, then the matrix's - sum will disagree with the Mapped Reads summary stat because the constituent - stats of Mapped Reads are based on rounded values. Rounding maintains the - finite representation of these values. 
If we didn't round them and allowed - floating point error to accumulate over potentially millions+ of alignments, - then the disagreement between all stats would grow unpredictably and there - wouldn't be a meaningful way to validate them.""" + postfix = f"{lib_name}_mapped_nt_len_dist" if self.norm_gh ^ self.norm_fh: - mapped.to_csv(f'{self.prefix}_{lib_name}_mapped_nt_len_dist.csv') + self.df_to_csv(mapped, self.prefix, postfix, sort_axis=None) else: - mapped_whole_numbers = mapped.astype('int64') - mapped_whole_numbers.to_csv(f'{self.prefix}_{lib_name}_mapped_nt_len_dist.csv') + self.df_to_csv(mapped.astype('int64'), self.prefix, postfix, sort_axis=None) def _write_assigned(self, assigned: pd.DataFrame, lib_name: str): """Writes the assigned matrix to a file (no further work required).""" - assigned.to_csv(f'{self.prefix}_{lib_name}_assigned_nt_len_dist.csv') + self.df_to_csv(assigned, self.prefix, f"{lib_name}_assigned_nt_len_dist", sort_axis=None) class AlignmentStats(MergedStat): From 7f671eee506e430af5d38079b0025bf4621557d3 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sun, 16 Apr 2023 16:44:46 -0700 Subject: [PATCH 29/34] Updating AlignmentStats to not round its values until write_output_logfile(), and to label its index name at construction --- tiny/rna/counter/statistics.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py index 4ba99419..a0a8f839 100644 --- a/tiny/rna/counter/statistics.py +++ b/tiny/rna/counter/statistics.py @@ -437,7 +437,8 @@ def _write_assigned(self, assigned: pd.DataFrame, lib_name: str): class AlignmentStats(MergedStat): def __init__(self): - self.alignment_stats_df = pd.DataFrame(index=LibraryStats.summary_categories) + self.alignment_stats_df = (pd.DataFrame(index=LibraryStats.summary_categories) + .rename_axis("Alignment Statistics")) self.finalized = False def add_library(self, other: LibraryStats): @@ -447,12 +448,12 
@@ def add_library(self, other: LibraryStats): def finalize(self): self.alignment_stats_df.drop(index="Mapped Reads Basis", inplace=True) # Handled by SummaryStats - self.alignment_stats_df = self.sort_cols_and_round(self.alignment_stats_df) + self.alignment_stats_df.sort_index(axis="columns", inplace=True) self.finalized = True def write_output_logfile(self): assert self.finalized, "AlignmentStats object must be finalized before writing output." - self.df_to_csv(self.alignment_stats_df, "Alignment Statistics", self.prefix, 'alignment_stats') + self.df_to_csv(self.alignment_stats_df, self.prefix, 'alignment_stats') class SummaryStats(MergedStat): From e78431c2e50095bd1bb7cc0f52199996874f72fe Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sun, 16 Apr 2023 16:45:43 -0700 Subject: [PATCH 30/34] Updating SummaryStats to not round its values until write_output_logfile(), and to label its index name at construction --- tiny/rna/counter/statistics.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py index a0a8f839..4c1fca4a 100644 --- a/tiny/rna/counter/statistics.py +++ b/tiny/rna/counter/statistics.py @@ -462,7 +462,7 @@ class SummaryStats(MergedStat): counted_categories = ["Mapped Sequences", "Normalized Mapped Reads", "Mapped Reads", "Assigned Reads"] summary_categories = conditional_categories + counted_categories - pipeline_stats_df = pd.DataFrame(index=summary_categories) + pipeline_stats_df = pd.DataFrame(index=summary_categories).rename_axis("Summary Statistics") def __init__(self, prefs): self.norm_gh = prefs.get('normalize_by_genomic_hits', True) @@ -528,7 +528,7 @@ def finalize(self): # Only display conditional categories if they were collected for at least one library empty_rows = self.pipeline_stats_df.loc[self.conditional_categories].isna().all(axis='columns') self.pipeline_stats_df.drop(empty_rows.index[empty_rows], inplace=True) - 
self.pipeline_stats_df = self.sort_cols_and_round(self.pipeline_stats_df) + self.pipeline_stats_df.sort_index(axis="columns", inplace=True) self.finalized = True def write_output_logfile(self): @@ -544,7 +544,7 @@ def write_output_logfile(self): differentiate = {'Mapped Reads': "Non-normalized Mapped Reads"} out_df = self.pipeline_stats_df.rename(index=differentiate) - self.df_to_csv(out_df, "Summary Statistics", self.prefix, "summary_stats") + self.df_to_csv(out_df, self.prefix, "summary_stats") def library_has_collapser_outputs(self, other: LibraryStats) -> bool: # Collapser outputs may have been gzipped. Accept either filename. From 11071809810a633e4ae9bfb605d180729b7e5299 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sun, 16 Apr 2023 16:48:57 -0700 Subject: [PATCH 31/34] Consistency updates for MergedDiags --- tiny/rna/counter/statistics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py index 4c1fca4a..76b744c7 100644 --- a/tiny/rna/counter/statistics.py +++ b/tiny/rna/counter/statistics.py @@ -673,7 +673,7 @@ def record_summary_diagnostics(self, assignments, aln, bundle, n_candidates): class MergedDiags(MergedStat): def __init__(self): - self.assignment_diags = pd.DataFrame(index=Diagnostics.summary_categories) + self.assignment_diags = pd.DataFrame(index=Diagnostics.summary_categories).rename_axis("Assignment Diags") self.selection_diags = {} self.alignment_tables = {} @@ -691,7 +691,7 @@ def write_output_logfile(self): self.write_alignment_tables() def write_assignment_diags(self): - self.df_to_csv(self.assignment_diags, "Assignment Diags", self.prefix, "assignment_diags") + self.df_to_csv(self.assignment_diags, self.prefix) def write_alignment_tables(self): header = Diagnostics.alignment_columns From 203b4f974babad709e6cddc6ebe16ca0c7cd66c6 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sun, 16 Apr 2023 16:51:22 -0700 
Subject: [PATCH 32/34] Minor consistency updates in StatisticsValidator. Also updating handling of FeatureCounts now that it properly maintains its multiindex past finalize() and through the csv writing step --- tiny/rna/counter/statistics.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py index 76b744c7..e5b416b3 100644 --- a/tiny/rna/counter/statistics.py +++ b/tiny/rna/counter/statistics.py @@ -718,8 +718,6 @@ write_selection_diags(self): class StatisticsValidator: - filename_suffix = 'stats_check.csv' - def __init__(self, merged_stats: List[MergedStat]): self.stats = {type(stat).__name__: stat for stat in merged_stats} self.prefix = MergedStat.prefix @@ -772,9 +770,10 @@ def validate_stats(self, aln_stats, sum_stats): MergedStat.add_warning(self.indent_table(s_df.loc[NMR], s_df.loc(MR), index=("Normalized", "Non-normalized"))) if not self.approx_equal(res.loc["Sum"].sum(), 0): + outfile_name = "stats_check" MergedStat.add_warning("Unexpected disagreement between alignment and summary statistics.") - MergedStat.add_warning(f"See the checksum table ({self.filename_suffix})") - MergedStat.df_to_csv(res, "Checksum", self.prefix, self.filename_suffix) + MergedStat.add_warning(f"See the checksum table ({outfile_name}.csv)") + MergedStat.df_to_csv(res, self.prefix, outfile_name) def validate_counts(self, feat_counts, rule_counts, sum_stats): f_df = feat_counts.feat_counts_df @@ -789,7 +788,7 @@ def validate_counts(self, feat_counts, rule_counts, sum_stats): r_df_mapped = r_df.loc["Mapped Reads", r_cols] # Get sum of assigned counts in the FeatureCounts table - f_cols = f_df.columns.difference(["Feature ID", "Classifier", "Feature Name"], sort=False) + f_cols = f_df.columns.difference(["Feature Name"], sort=False) f_df_sum = f_df[f_cols].sum(axis=0) # Compare assigned counts in FeatureCounts, RuleCounts, and SummaryStats @@ -804,8 +803,7 @@ def validate_counts(self,
feat_counts, rule_counts, sum_stats): MergedStat.add_warning(self.indent_table(r_df_sum, r_df_mapped, index=("Assigned", "Mapped"))) # Compare counts per classifier in FeatureCounts to corresponding rules in RuleCounts - f_cols = f_df.columns.difference(["Feature ID", "Feature Name"], sort=False) - f_cls_sums = f_df[f_cols].groupby("Classifier").sum() + f_cls_sums = f_df[f_cols].groupby(level="Classifier").sum() r_cls_idxs = rule_counts.get_inverted_classifiers() for cls, counts in f_cls_sums.iterrows(): f_cls_sum = f_cls_sums.loc[cls].sum() @@ -830,7 +828,7 @@ def validate_nt_len_matrices(self, matrices, sum_stats): MergedStat.add_warning(self.indent_vs(a_sum, s_df.loc['Assigned Reads', lib_name])) @staticmethod - def approx_equal(x: Union[pd.Series, int], y: Union[pd.Series, int], tolerance=2.0): + def approx_equal(x: Union[pd.Series, int], y: Union[pd.Series, int], tolerance=0.01): """Tolerate small differences in floating point numbers due to floating point error. The error tends to be greater for stats that are incremented many times by small amounts. 
The default tolerance is likely too generous.""" From dae930d72383959186f5552cc1388190496699b1 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sun, 16 Apr 2023 16:58:59 -0700 Subject: [PATCH 33/34] Removing the sort_cols_and_round() helper function because it is no longer used by MergedStats classes --- tiny/rna/counter/statistics.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py index e5b416b3..faab96c6 100644 --- a/tiny/rna/counter/statistics.py +++ b/tiny/rna/counter/statistics.py @@ -141,14 +141,8 @@ def add_warning(msg): MergedStatsManager.warnings.append(msg) @staticmethod - def sort_cols_and_round(df: pd.DataFrame, axis="columns") -> pd.DataFrame: - """Convenience function to sort columns by title and round all values to 2 decimal places""" - return df.round(decimals=2).sort_index(axis=axis) - - @staticmethod - def df_to_csv(df: pd.DataFrame, prefix: str, postfix: str = None, sort_axis="columns"): - """Rounds counts, optionally sorts , and writes the dataframe - to CSV with the appropriate index label and filename. + def df_to_csv(df: pd.DataFrame, prefix: str, postfix: str = None, sort_axis: Optional[str] = "columns"): + """Rounds counts, optionally sorts an axis, and writes the dataframe to CSV with the appropriate filename. Args: df: The dataframe to write to CSV @@ -162,7 +156,7 @@ def df_to_csv(df: pd.DataFrame, prefix: str, postfix: str = None, sort_axis="col postfix = '_'.join(map(str.lower, df.index.name.split(' '))) if sort_axis is not None: - out_df = MergedStat.sort_cols_and_round(df, axis=sort_axis) + out_df = df.round(decimals=2).sort_index(axis=sort_axis) else: out_df = df.round(decimals=2) From 0672c3c364dabdb095961b34b1a0aaea4ba8b587 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sun, 16 Apr 2023 18:36:21 -0700 Subject: [PATCH 34/34] Small correction for mapped len dist tables. 
Rounding should be performed before conversion to int since .astype('int64') is functionally equivalent to .apply(np.floor). Also raising the default equality tolerance in StatisticsValidator back to 1.0 --- tiny/rna/counter/statistics.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py index faab96c6..41fbcfa7 100644 --- a/tiny/rna/counter/statistics.py +++ b/tiny/rna/counter/statistics.py @@ -421,7 +421,8 @@ def _write_mapped(self, mapped: pd.DataFrame, lib_name: str): if self.norm_gh ^ self.norm_fh: self.df_to_csv(mapped, self.prefix, postfix, sort_axis=None) else: - self.df_to_csv(mapped.astype('int64'), self.prefix, postfix, sort_axis=None) + mapped_whole_numbers = mapped.round(decimals=2).astype('int64') + self.df_to_csv(mapped_whole_numbers, self.prefix, postfix, sort_axis=None) def _write_assigned(self, assigned: pd.DataFrame, lib_name: str): """Writes the assigned matrix to a file (no further work required).""" @@ -822,10 +823,10 @@ def validate_nt_len_matrices(self, matrices, sum_stats): MergedStat.add_warning(self.indent_vs(a_sum, s_df.loc['Assigned Reads', lib_name])) @staticmethod - def approx_equal(x: Union[pd.Series, int], y: Union[pd.Series, int], tolerance=0.01): + def approx_equal(x: Union[pd.Series, int], y: Union[pd.Series, int], tolerance=1.0): """Tolerate small differences in floating point numbers due to floating point error. The error tends to be greater for stats that are incremented many times by small - amounts. The default tolerance is likely too generous.""" + amounts. The default tolerance, while conceptually appropriate, is excessive.""" approx = abs(x - y) < tolerance if isinstance(approx, pd.Series):