From e77c243aae57a96320f6ae44c3ea32d9742257ef Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sat, 15 Oct 2022 17:32:28 -0700 Subject: [PATCH 01/16] Changing the internal and external names of the Tag column --- START_HERE/features.csv | 2 +- tiny/rna/configuration.py | 2 +- tiny/rna/counter/counter.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/START_HERE/features.csv b/START_HERE/features.csv index 959a1e73..bd2cc089 100755 --- a/START_HERE/features.csv +++ b/START_HERE/features.csv @@ -1,4 +1,4 @@ -Select for...,with value...,Alias by...,Tag,Hierarchy,Strand,5' End Nucleotide,Length,Overlap,Feature Source +Select for...,with value...,Alias by...,Classify as...,Hierarchy,Strand,5' End Nucleotide,Length,Overlap,Feature Source Class,mask,Alias,,1,both,all,all,Partial,./reference_data/ram1.gff3 Class,miRNA,Alias,,2,sense,all,16-22,Full,./reference_data/ram1.gff3 Class,piRNA,Alias,5pA,2,both,A,24-32,Full,./reference_data/ram1.gff3 diff --git a/tiny/rna/configuration.py b/tiny/rna/configuration.py index dde86823..b4c18c33 100644 --- a/tiny/rna/configuration.py +++ b/tiny/rna/configuration.py @@ -365,7 +365,7 @@ class CSVReader(csv.DictReader): "Select for...": "Key", "with value...": "Value", "Alias by...": "Name", - "Tag": "Tag", + "Classify as...": "Class", "Hierarchy": "Hierarchy", "Strand": "Strand", "5' End Nucleotide": "nt5end", diff --git a/tiny/rna/counter/counter.py b/tiny/rna/counter/counter.py index c7f48f0b..da90e2b7 100644 --- a/tiny/rna/counter/counter.py +++ b/tiny/rna/counter/counter.py @@ -143,7 +143,7 @@ def load_config(features_csv: str, is_pipeline: bool) -> Tuple[List[dict], Dict[ rules, gff_files = list(), defaultdict(list) for row in CSVReader(features_csv, "Features Sheet").rows(): - rule = {col: row[col] for col in ["Tag", "Hierarchy", "Strand", "nt5end", "Length", "Overlap"]} + rule = {col: row[col] for col in ["Class", "Hierarchy", "Strand", "nt5end", "Length", "Overlap"]} rule['nt5end'] = 
rule['nt5end'].upper().translate({ord('U'): 'T'}) # Convert RNA base to cDNA base rule['Identity'] = (row['Key'], row['Value']) # Create identity tuple rule['Hierarchy'] = int(rule['Hierarchy']) # Convert hierarchy to number From bc2adac643f8a1a921ec7b7b0e1eb2913207f95c Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sat, 15 Oct 2022 17:36:46 -0700 Subject: [PATCH 02/16] First draft changes in tiny-count Removing the accounting of the Class= attribute from ReferenceTables, usages of this output in the Features data class, and the FeatureCounts output file class. Also correcting some erroneous use of the StepVector typehint (this should have been GenomicArray). The Feature Class column has been removed from the output feature_counts.csv table, and the Tag column has been renamed to "Classifier" --- tiny/rna/counter/features.py | 4 +--- tiny/rna/counter/hts_parsing.py | 39 +++++++++++++-------------------- tiny/rna/counter/statistics.py | 6 +---- 3 files changed, 17 insertions(+), 32 deletions(-) diff --git a/tiny/rna/counter/features.py b/tiny/rna/counter/features.py index 675d5c45..63fe4191 100644 --- a/tiny/rna/counter/features.py +++ b/tiny/rna/counter/features.py @@ -17,13 +17,11 @@ class Features(metaclass=Singleton): chrom_vectors: HTSeq.ChromVector aliases: dict - classes: dict tags: dict - def __init__(_, features: HTSeq.GenomicArrayOfSets, aliases: dict, classes: dict, tags: dict): + def __init__(_, features: HTSeq.GenomicArrayOfSets, aliases: dict, tags: dict): Features.chrom_vectors = features.chrom_vectors # For interval -> feature record tuple lookups Features.aliases = aliases # For feature ID -> preferred feature name lookups - Features.classes = classes # For feature ID -> class lookups Features.tags = tags # For feature ID -> match IDs diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py index 4e577285..9e49c9de 100644 --- a/tiny/rna/counter/hts_parsing.py +++ b/tiny/rna/counter/hts_parsing.py @@ 
-428,8 +428,8 @@ def parse_gff(file, row_fn: Callable, alias_keys=None): # Type aliases for human readability -ClassTable = AliasTable = DefaultDict[str, Tuple[str]] -StepVector = HTSeq.GenomicArrayOfSets +AliasTable = DefaultDict[str, Tuple[str]] +GenomicArray = HTSeq.GenomicArrayOfSets Tags = DefaultDict[str, Set[str]] @@ -471,7 +471,6 @@ def __init__(self, gff_files: Dict[str, list], feature_selector, **prefs): self.parents, self.filtered = {}, set() # Original Feature ID self.intervals = defaultdict(list) # Root Feature ID self.matches = defaultdict(set) # Root Match ID - self.classes = defaultdict(set) # Root Feature ID self.alias = defaultdict(set) # Root Feature ID self.tags = defaultdict(set) # Root Feature ID -> Root Match ID @@ -479,7 +478,7 @@ def __init__(self, gff_files: Dict[str, list], feature_selector, **prefs): setattr(HTSeq.features.GFF_Reader, 'parse_GFF_attribute_string', staticmethod(parse_GFF_attribute_string)) @report_execution_time("GFF parsing") - def get(self) -> Tuple[StepVector, AliasTable, ClassTable, dict]: + def get(self) -> Tuple[GenomicArray, AliasTable, dict]: """Initiates GFF parsing and returns the resulting reference tables""" for file, alias_keys in self.gff_files.items(): @@ -494,14 +493,14 @@ def parse_row(self, row, alias_keys=None): # Grab the primary key for this feature feature_id = self.get_feature_id(row) - # Get feature's classes and identity match tuples - matches, classes = self.get_matches_and_classes(row.attr) + # Perform Stage 1 selection + matches = self.get_matches(row.attr) # Only add features with identity matches if all_features is False if not self.all_features and not len(matches): self.exclude_row(row) return # Add feature data to root ancestor in the reference tables - root_id = self.add_feature(feature_id, row, matches, classes) + root_id = self.add_feature(feature_id, row, matches) # Add alias to root ancestor if it is unique self.add_alias(root_id, alias_keys, row.attr) @@ -531,14 +530,13 @@ def 
get_feature_ancestors(self, feature_id: str, row_attrs: CaseInsensitiveAttrs return lineage - def add_feature(self, feature_id: str, row, matches: defaultdict, classes: set) -> str: - """Adds the feature to classes and intervals tables under its root ID, and to the matches table + def add_feature(self, feature_id: str, row, matches: defaultdict) -> str: + """Adds the feature to the intervals table under its root ID, and to the matches table under its tagged ID. Note: matches are later assigned to intervals in _finalize_features()""" lineage = self.get_feature_ancestors(feature_id, row.attr) root_id = self.get_root_feature(lineage) - self.classes[root_id] |= classes if row.iv not in self.intervals[root_id]: self.intervals[root_id].append(row.iv) @@ -562,25 +560,22 @@ def add_alias(self, root_id: str, alias_keys: List[str], row_attrs: CaseInsensit for row_val in row_attrs.get(alias_key, ()): self.alias[root_id].add(row_val) - def get_matches_and_classes(self, row_attrs: CaseInsensitiveAttrs) -> Tuple[DefaultDict, set]: - """Grabs classes and match tuples from attributes that match identity rules (Stage 1 Selection)""" - - row_attrs.setdefault("Class", ("_UNKNOWN_",)) - classes = {c for c in row_attrs["Class"]} + def get_matches(self, row_attrs: CaseInsensitiveAttrs) -> DefaultDict: + """Performs Stage 1 selection and returns match tuples under their associated classifier""" identity_matches = defaultdict(set) for ident, rule_indexes in self.selector.inv_ident.items(): if row_attrs.contains_ident(ident): for index in rule_indexes: # Non-tagged matches are pooled under '' empty string - tag = self.selector.rules_table[index]['Tag'] + tag = self.selector.rules_table[index]['Class'] identity_matches[tag].add(( index, self.selector.rules_table[index]['Hierarchy'], self.selector.rules_table[index]['Overlap'] )) # -> identity_matches: {tag: (rule, rank, overlap), ...} - return identity_matches, classes + return identity_matches def get_row_parent(self, feature_id: str, 
row_attrs: CaseInsensitiveAttrs) -> str: """Get the current feature's parent while cooperating with filtered features""" @@ -615,11 +610,10 @@ def exclude_row(self, row): self.parents[feature_id] = self.get_row_parent(feature_id, row.attr) self.chrom_vector_setdefault(row.iv.chrom) - def finalize_tables(self) -> Tuple[StepVector, AliasTable, ClassTable, dict]: + def finalize_tables(self) -> Tuple[GenomicArray, AliasTable, dict]: """Convert sets to sorted tuples for performance, hashability, and deterministic outputs Matches must also be assigned to intervals now that all intervals are known""" - self._finalize_classes() self._finalize_aliases() self._finalize_features() @@ -627,10 +621,10 @@ def finalize_tables(self) -> Tuple[StepVector, AliasTable, ClassTable, dict]: raise ValueError("No features were retained while parsing your GFF file.\n" "This may be due to a lack of features matching 'Select for...with value...'") - return self.feats, self.alias, self.classes, self.tags + return self.feats, self.alias, self.tags def _finalize_features(self): - """Assigns matches to their corresponding intervals by populating StepVector with match tuples""" + """Assigns matches to their corresponding intervals by populating GenomicArray with match tuples""" for root_id, unmerged_sub_ivs in self.intervals.items(): merged_sub_ivs = self._merge_adjacent_subintervals(unmerged_sub_ivs) @@ -676,9 +670,6 @@ def _merge_adjacent_subintervals(unmerged_sub_ivs: List[HTSeq.GenomicInterval]) def _finalize_aliases(self): self.alias = {feat: tuple(sorted(aliases, key=str.lower)) for feat, aliases in self.alias.items()} - def _finalize_classes(self): - self.classes = {feat: tuple(sorted(classes, key=str.lower)) for feat, classes in self.classes.items()} - def get_feats_table_size(self) -> int: """Returns the sum of features across all chromosomes and strands""" diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py index 8d625564..33527154 100644 --- 
a/tiny/rna/counter/statistics.py +++ b/tiny/rna/counter/statistics.py @@ -186,7 +186,6 @@ def print_warnings(self): class FeatureCounts(MergedStat): def __init__(self, Features_obj): self.feat_counts_df = pd.DataFrame(index=set.union(*Features_obj.tags.values())) - self.classes = Features_obj.classes self.aliases = Features_obj.aliases self.norm_prefs = {} @@ -220,11 +219,8 @@ def write_feature_counts(self) -> pd.DataFrame: summary = self.sort_cols_and_round(self.feat_counts_df) # Add Feature Name column, which is the feature alias (default is blank if no alias exists) summary.insert(0, "Feature Name", summary.index.map(lambda feat: ', '.join(self.aliases.get(feat[0], '')))) - # Add Classes column for classes associated with the given feature - feat_class_map = lambda feat: ', '.join(self.classes[feat[0]]) - summary.insert(1, "Feature Class", summary.index.map(feat_class_map)) # Sort by index, make index its own column, and rename it to Feature ID - summary = summary.sort_index().reset_index().rename(columns={"level_0": "Feature ID", "level_1": "Tag"}) + summary = summary.sort_index().reset_index().rename(columns={"level_0": "Feature ID", "level_1": "Classifier"}) summary.to_csv(self.prefix + '_feature_counts.csv', index=False) return summary From 5f24e6c19d9db799ab597c9a6c2a749aa6d80407 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sat, 15 Oct 2022 17:51:33 -0700 Subject: [PATCH 03/16] tiny-deseq.r has been updated to handle feature counts using the new table format. The Feature Class column has been dropped and the Tag column has been renamed to Classifier. 
Also further improved code flexibility for tiny-deseq.r by not hardcoding character columns when calling write.csv --- tiny/rna/tiny-deseq.r | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tiny/rna/tiny-deseq.r b/tiny/rna/tiny-deseq.r index 07ef86e1..391e2a3d 100755 --- a/tiny/rna/tiny-deseq.r +++ b/tiny/rna/tiny-deseq.r @@ -79,8 +79,8 @@ with_names <- function(vec){ } ## Original column names -idx_cols <- with_names(c('Feature ID', 'Tag')) -meta_cols <- with_names(c('Feature Name', 'Feature Class')) +idx_cols <- with_names(c('Feature ID', 'Classifier')) +meta_cols <- with_names('Feature Name') char_cols <- with_names(unique(c(idx_cols, meta_cols))) sample_cols <- with_names(colnames(header_row)[!(colnames(header_row) %in% char_cols)]) @@ -168,13 +168,13 @@ if (plot_pca){ } ## Get normalized counts and write them to CSV with original sample names in header -deseq_counts <- df_with_metadata(DESeq2::counts(deseq_run, normalized=TRUE)) -colnames(deseq_counts) <- c(meta_cols, sample_cols) +deseq_counts <- restore_multiindex(df_with_metadata(DESeq2::counts(deseq_run, normalized=TRUE))) +colnames(deseq_counts) <- c(idx_cols, meta_cols, sample_cols) write.csv( - restore_multiindex(deseq_counts), + deseq_counts, paste(out_pref, "norm_counts.csv", sep="_"), row.names=FALSE, - quote=1:4, + quote=which(colnames(deseq_counts) %in% char_cols), ) if (has_control){ @@ -190,15 +190,16 @@ if (has_control){ } write_dge_table <- function (dge_df, cond1, cond2){ + dge_df <- restore_multiindex(dge_df) write.csv( - restore_multiindex(dge_df), + dge_df, paste(out_pref, "cond1", cond1, "cond2", cond2, "deseq_table.csv", sep="_"), row.names=FALSE, - quote=1:4, + quote=which(colnames(dge_df) %in% char_cols), ) } From dace53d54b0f0614118f3ae854599eedeec4f1da Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Mon, 17 Oct 2022 18:44:30 -0700 Subject: [PATCH 04/16] class_charts and scatter_by_dge_class have been updated to treat the Tag 
multiindex column as a classifier. Wow, this opened the door for some really satisfying simplifications to the class-related codebase. Backward compatibility is not offered with this commit. I've been thinking about how to reconcile this but I think ultimately that will be a bad idea. Number one, the counting semantics of the old Feature Class vs Tag columns are completely different; these inputs wouldn't be interchangeable. I've also removed the show_unknown option from scatter_dges() because we have yet to use this feature and we've never added the option to any user-facing config files --- tiny/rna/plotter.py | 139 +++++++++++--------------------------------- 1 file changed, 34 insertions(+), 105 deletions(-) diff --git a/tiny/rna/plotter.py b/tiny/rna/plotter.py index 7ab2ca14..e6ded67d 100644 --- a/tiny/rna/plotter.py +++ b/tiny/rna/plotter.py @@ -324,48 +324,35 @@ def load_dge_tables(comparisons: list) -> pd.DataFrame: print("Warning: multiple conditions matched in DGE filename. Using first match.") comparison_name = "_vs_".join(comparison[0]) - table = pd.read_csv(dgefile).rename({"Unnamed: 0": "Feature ID"}, axis="columns") - table = table.set_index( - ["Feature ID", "Tag"] - # For backward compatability - if "Tag" in table.columns - else "Feature ID") + table = set_counts_table_multiindex(pd.read_csv(dgefile)) de_table[comparison_name] = table['padj'] return de_table -def scatter_dges(count_df, dges, output_prefix, view_lims, classes=None, show_unknown=True, pval=0.05): +def scatter_dges(count_df, dges, output_prefix, view_lims, classes=None, pval=0.05): """Creates PDFs of all pairwise comparison scatter plots from a count table. Can highlight classes and/or differentially expressed genes as different colors. 
Args: - count_df: A dataframe of counts per feature + count_df: A dataframe of counts per feature with multiindex (feature ID, classifier) dges: A dataframe of differential gene table output to highlight view_lims: A tuple of (min, max) data bounds for the plot view output_prefix: A string to use as a prefix for saving files - classes: An optional dataframe of class(es) per feature. If provided, points are grouped by class - show_unknown: If true, class "unknown" will be included if highlighting by classes + classes: An optional feature-class multiindex. If provided, points are grouped by class pval: The pvalue threshold for determining the outgroup """ if classes is not None: - uniq_classes = sorted(list(pd.unique(classes)), key=str.lower) + uniq_classes = sorted(pd.unique(classes.get_level_values(1)), key=str.lower) class_colors = aqplt.assign_class_colors(uniq_classes) aqplt.set_dge_class_legend_style() - if not show_unknown and 'unknown' in uniq_classes: - uniq_classes.remove('unknown') - for pair in dges: p1, p2 = pair.split("_vs_") - # Get list of differentially expressed features for this comparison pair - dge_list = list(dges.index[dges[pair] < pval]) - # Create series of feature -> class relationships - class_dges = classes.loc[dge_list] - # Gather lists of features by class (listed in order corresponding to unique_classes) - grp_args = [class_dges.index[class_dges == cls].tolist() for cls in uniq_classes] + dge_dict = (dges[pair] < pval).groupby(level=1).groups + grp_args = [dge_dict[cls] for cls in uniq_classes] labels = uniq_classes sscat = aqplt.scatter_grouped(count_df.loc[:, p1], count_df.loc[:, p2], *grp_args, colors=class_colors, @@ -400,15 +387,10 @@ def load_raw_counts(raw_counts_file: str) -> pd.DataFrame: Args: raw_counts_file: The raw counts CSV produced by tiny-count Returns: - The raw counts DataFrame (multiindex if it contains a Tag column) + The raw counts DataFrame with a multiindex of (feature id, classifier) """ - raw_counts = 
pd.read_csv(raw_counts_file) - return tokenize_feature_classes(raw_counts)\ - .set_index(["Feature ID", "Tag"] - # For backward compatability - if "Tag" in raw_counts.columns - else "Feature ID") + return set_counts_table_multiindex(pd.read_csv(raw_counts_file)) def load_rule_counts(rule_counts_file: str) -> pd.DataFrame: @@ -416,7 +398,7 @@ def load_rule_counts(rule_counts_file: str) -> pd.DataFrame: Args: rule_counts_file: The rule counts CSV produced by tiny-count Returns: - The rule counts DataFrame + The raw counts DataFrame with a multiindex of (feature id, classifier) """ return pd.read_csv(rule_counts_file, index_col=0) @@ -427,104 +409,51 @@ def load_norm_counts(norm_counts_file: str) -> pd.DataFrame: Args: norm_counts_file: The norm counts CSV produced by DESeq2 Returns: - The norm counts DataFrame (multiindex if it contains a Tag column) + The raw counts DataFrame with a multiindex of (feature id, classifier) """ - norm_counts = pd.read_csv(norm_counts_file)\ - .rename({"Unnamed: 0": "Feature ID"}, axis="columns") - return tokenize_feature_classes(norm_counts)\ - .set_index(["Feature ID", "Tag"] - # For backward compatability - if "Tag" in norm_counts.columns - else "Feature ID") + return set_counts_table_multiindex(pd.read_csv(norm_counts_file)) -def tokenize_feature_classes(counts_df: pd.DataFrame) -> pd.DataFrame: - """Parses each Feature Class element into a list. 
The resulting "lists" are - tuples to allow for .groupby() operations - Args: - counts_df: a DataFrame containing Feature Class list strings - Returns: a new DataFrame with tokenized Feature Class elements - """ - - tokenized = counts_df.copy() - tokenized["Feature Class"] = ( - counts_df["Feature Class"] - .str.split(',') - .apply(lambda classes: tuple(c.strip() for c in classes)) - ) +def set_counts_table_multiindex(counts: pd.DataFrame, fillna='_UNKNOWN_') -> pd.DataFrame: + """Assigns a multiindex composed of (Feature ID, Classifier) to the counts table, + and fills NaN values in the classifier""" - return tokenized + level0 = "Feature ID" + level1 = "Classifier" + counts[level1] = counts[level1].fillna(fillna) + return counts.set_index([level0, level1]) -def get_flat_classes(counts_df: pd.DataFrame) -> pd.Series: - """Features with multiple associated classes must have these classes flattened - Args: - counts_df: A counts DataFrame with a list-parsed Feature Class column - Returns: - A Series with the same index as the input and (optionally tagged) single classes - per index entry. Index repetitions are allowed for accommodating all classes. - """ - # For backward compatability - if isinstance(counts_df.index, pd.MultiIndex): - counts_df["Feature Class"] = counts_df.apply(tag_classes, axis="columns") +def get_flat_classes(counts_df: pd.DataFrame) -> pd.Index: + """Features with multiple associated classes are returned in flattened form + with one class per entry, yielding multiple entries for these features. During + earlier versions this required some processing, but now we can simply return + the multiindex of the counts_df. - return counts_df["Feature Class"].explode() - - -def tag_classes(row: pd.Series) -> tuple: - """Appends a feature's tag to its (tokenized) classes, if present. - Intended for use with df.apply() Args: - row: a series, as provided by .apply(... 
, axis="columns") - Returns: a tuple of tagged classes for a feature + counts_df: A DataFrame with a multiindex of (feature ID, feature class) + Returns: + The counts_df multiindex """ - # For backward compatability - tag = row.name[1] if type(row.name) is tuple else pd.NA - return tuple( - cls if pd.isna(tag) else f"{cls} ({tag})" - for cls in row["Feature Class"] - ) + return counts_df.index def get_class_counts(raw_counts_df: pd.DataFrame) -> pd.DataFrame: - """Calculates class counts from the Raw Counts DataFrame - - If there are multiple classes associated with a feature, then that feature's - counts are normalized across all associated classes. + """Calculates class counts from level 1 of the raw counts multiindex Args: - raw_counts_df: A DataFrame containing features and their associated classes and counts + raw_counts_df: A DataFrame with a multiindex of (feature ID, classifier) Returns: class_counts: A DataFrame with an index of classes and count entries, per comparison """ - # 1. Tokenize Feature Class lists and label each with the feature's Tag (if provided) - tokenized = raw_counts_df.copy().drop("Feature Name", axis="columns", errors='ignore') - tokenized['Feature Class'] = raw_counts_df.apply(tag_classes, axis="columns") - - # 2. Group and sum by Feature Class. Prepare for 3. by dividing by group's class count. - grouped = (tokenized.groupby("Feature Class") - .apply(lambda grp: grp.sum() / len(grp.name))) - - """ - # Sanity check! - tokenized.groupby("Feature Class").apply( - lambda x: print('\n' + str(x.name) + '\n' - f"{x.iloc[:,1:].sum()} / {len(x.name)} =" + '\n' + - f"{x.iloc[:,1:].sum() / len(x.name)}" + '\n' + - f"Total: {(x.iloc[:,1:].sum() / len(x.name)).sum()}" - ) - )""" - - # 3. Flatten class lists, regroup and sum by the normalized group counts from 2. - class_counts = grouped.reset_index().explode("Feature Class").groupby("Feature Class").sum() - - # Sanity check. 
Tolerance of 1 is overprovisioned for floating point errors - assert -1 < (class_counts.sum().sum() - raw_counts_df.iloc[:,2:].sum().sum()) < 1 - return class_counts + return (raw_counts_df + .drop("Feature Name", axis="columns", errors='ignore') + .groupby(level=1) + .sum()) def validate_inputs(args: argparse.Namespace) -> None: From eef300a169d7cfc5e912f8ad7c7414257f7c1620 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Tue, 18 Oct 2022 12:16:04 -0700 Subject: [PATCH 05/16] Renaming the "tags" attribute in the Features class to "classes" for consistency. --- tiny/rna/counter/features.py | 6 +++--- tiny/rna/counter/hts_parsing.py | 14 +++++++------- tiny/rna/counter/statistics.py | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/tiny/rna/counter/features.py b/tiny/rna/counter/features.py index 63fe4191..44809d08 100644 --- a/tiny/rna/counter/features.py +++ b/tiny/rna/counter/features.py @@ -17,12 +17,12 @@ class Features(metaclass=Singleton): chrom_vectors: HTSeq.ChromVector aliases: dict - tags: dict + classes: dict - def __init__(_, features: HTSeq.GenomicArrayOfSets, aliases: dict, tags: dict): + def __init__(_, features: HTSeq.GenomicArrayOfSets, aliases: dict, classes: dict): Features.chrom_vectors = features.chrom_vectors # For interval -> feature record tuple lookups Features.aliases = aliases # For feature ID -> preferred feature name lookups - Features.tags = tags # For feature ID -> match IDs + Features.classes = classes # For feature ID -> match IDs class FeatureCounter: diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py index 9e49c9de..a3ee92e9 100644 --- a/tiny/rna/counter/hts_parsing.py +++ b/tiny/rna/counter/hts_parsing.py @@ -495,7 +495,7 @@ def parse_row(self, row, alias_keys=None): feature_id = self.get_feature_id(row) # Perform Stage 1 selection matches = self.get_matches(row.attr) - # Only add features with identity matches if all_features is False + # Skip features that 
lack matches unless all_features is True if not self.all_features and not len(matches): self.exclude_row(row) return @@ -567,9 +567,9 @@ def get_matches(self, row_attrs: CaseInsensitiveAttrs) -> DefaultDict: for ident, rule_indexes in self.selector.inv_ident.items(): if row_attrs.contains_ident(ident): for index in rule_indexes: - # Non-tagged matches are pooled under '' empty string - tag = self.selector.rules_table[index]['Class'] - identity_matches[tag].add(( + # Unclassified matches are pooled under '' empty string + classifier = self.selector.rules_table[index]['Class'] + identity_matches[classifier].add(( index, self.selector.rules_table[index]['Hierarchy'], self.selector.rules_table[index]['Overlap'] @@ -623,6 +623,9 @@ def finalize_tables(self) -> Tuple[GenomicArray, AliasTable, dict]: return self.feats, self.alias, self.tags + def _finalize_aliases(self): + self.alias = {feat: tuple(sorted(aliases, key=str.lower)) for feat, aliases in self.alias.items()} + def _finalize_features(self): """Assigns matches to their corresponding intervals by populating GenomicArray with match tuples""" @@ -667,9 +670,6 @@ def _merge_adjacent_subintervals(unmerged_sub_ivs: List[HTSeq.GenomicInterval]) return merged_ivs - def _finalize_aliases(self): - self.alias = {feat: tuple(sorted(aliases, key=str.lower)) for feat, aliases in self.alias.items()} - def get_feats_table_size(self) -> int: """Returns the sum of features across all chromosomes and strands""" diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py index 33527154..24c73cbf 100644 --- a/tiny/rna/counter/statistics.py +++ b/tiny/rna/counter/statistics.py @@ -185,7 +185,7 @@ def print_warnings(self): class FeatureCounts(MergedStat): def __init__(self, Features_obj): - self.feat_counts_df = pd.DataFrame(index=set.union(*Features_obj.tags.values())) + self.feat_counts_df = pd.DataFrame(index=set.union(*Features_obj.classes.values())) self.aliases = Features_obj.aliases self.norm_prefs = {} From 
138ad24d94ecb4f8a88bd0ea3da3235e7de3da2a Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Tue, 18 Oct 2022 13:43:56 -0700 Subject: [PATCH 06/16] Adding two new command line options to tiny-plot: --unassigned-class-label and --unknown-class-label. These options retain their previous default values of _UNASSIGNED_ and _UNKNOWN_. Class labels in class_charts are now sorted. --- tiny/rna/plotter.py | 67 ++++++++++++++++++++++++++------------------- 1 file changed, 39 insertions(+), 28 deletions(-) diff --git a/tiny/rna/plotter.py b/tiny/rna/plotter.py index e6ded67d..59a9b2ea 100644 --- a/tiny/rna/plotter.py +++ b/tiny/rna/plotter.py @@ -6,7 +6,6 @@ """ import multiprocessing as mp import pandas as pd -import numpy as np import itertools import traceback import argparse @@ -15,13 +14,16 @@ import re from collections import defaultdict -from typing import Optional, Dict, Union, Tuple, DefaultDict +from typing import Dict, Union, Tuple, DefaultDict from pkg_resources import resource_filename from tiny.rna.configuration import timestamp_format -from tiny.rna.plotterlib import plotterlib as lib +from tiny.rna.plotterlib import plotterlib from tiny.rna.util import report_execution_time, make_filename, SmartFormatter +aqplt: plotterlib +RASTER: bool + def get_args(): """Get input arguments from the user/command line.""" @@ -65,7 +67,11 @@ def get_args(): help='len_dist plots will start at this value') optional_args.add_argument('-lda', '--len-dist-max', metavar='VALUE', type=int, help='len_dist plots will end at this value') - + optional_args.add_argument('-una', '--unassigned-class', metavar='LABEL', default='_UNASSIGNED_', + help='Use this label in class-related plots for unassigned counts'), + optional_args.add_argument('-unk', '--unknown-class', metavar='LABEL', default='_UNKNOWN_', + help='Use this label in class-related plots for counts which were ' + 'assigned by rules lacking a "Classify as..." 
value') # Required arguments required_args.add_argument('-p', '--plots', metavar='PLOT', required=True, nargs='+', @@ -155,17 +161,19 @@ def get_len_dist_dict(files_list: list) -> DefaultDict[str, Dict[str, pd.DataFra return matrices -def class_charts(raw_class_counts: pd.DataFrame, mapped_reads: pd.Series, out_prefix: str, scale=2, **kwargs): +def class_charts(raw_class_counts: pd.DataFrame, mapped_reads: pd.Series, out_prefix: str, class_na: str, scale=2, **kwargs): """Create a PDF of the proportion of raw assigned counts vs total mapped reads for each class Args: raw_class_counts: A dataframe containing RAW class counts per library mapped_reads: A Series containing the total number of mapped reads per library out_prefix: The prefix to use when naming output PDF plots + class_na: The label to use for representing unassigned reads + scale: The decimal scale for table percentages, and for determining inclusion of "unassigned" kwargs: Additional keyword arguments to pass to pandas.DataFrame.plot() """ - class_props = get_proportions_df(raw_class_counts, mapped_reads, "_UNASSIGNED_", scale) + class_props = get_proportions_df(raw_class_counts, mapped_reads, class_na, scale).sort_index() max_prop = class_props.max().max() for library in raw_class_counts: @@ -248,7 +256,7 @@ def get_sample_rep_dict(df: pd.DataFrame) -> dict: """ sample_dict = defaultdict(list) - non_numeric_cols = ["Feature Class", "Feature Name"] + non_numeric_cols = ["Feature Name"] for col in df.columns: if col in non_numeric_cols: continue @@ -302,12 +310,13 @@ def scatter_replicates(count_df: pd.DataFrame, samples: dict, output_prefix: str rscat.figure.savefig(pdf_name) -def load_dge_tables(comparisons: list) -> pd.DataFrame: +def load_dge_tables(comparisons: list, class_fillna: str) -> pd.DataFrame: """Creates a new dataframe containing all features and padj values for each comparison from a list of DGE tables. 
Args: comparisons: DGE tables (files) produced by DESeq2 + class_fillna: the value to fill for NaN values in multiindex level 1 Returns: de_table: A single table of padj values per feature/comparison @@ -324,7 +333,7 @@ def load_dge_tables(comparisons: list) -> pd.DataFrame: print("Warning: multiple conditions matched in DGE filename. Using first match.") comparison_name = "_vs_".join(comparison[0]) - table = set_counts_table_multiindex(pd.read_csv(dgefile)) + table = set_counts_table_multiindex(pd.read_csv(dgefile), class_fillna) de_table[comparison_name] = table['padj'] @@ -355,8 +364,9 @@ def scatter_dges(count_df, dges, output_prefix, view_lims, classes=None, pval=0. grp_args = [dge_dict[cls] for cls in uniq_classes] labels = uniq_classes - sscat = aqplt.scatter_grouped(count_df.loc[:, p1], count_df.loc[:, p2], *grp_args, colors=class_colors, - pval=pval, view_lims=view_lims, labels=labels, rasterized=RASTER) + sscat = aqplt.scatter_grouped(count_df.loc[:, p1], count_df.loc[:, p2], *grp_args, + colors=class_colors, pval=pval, view_lims=view_lims, labels=labels, + rasterized=RASTER) sscat.set_title('%s vs %s' % (p1, p2)) sscat.set_xlabel("Log$_{2}$ normalized reads in " + p1) @@ -372,8 +382,9 @@ def scatter_dges(count_df, dges, output_prefix, view_lims, classes=None, pval=0. labels = ['p < %g' % pval] colors = aqplt.assign_class_colors(labels) - sscat = aqplt.scatter_grouped(count_df.loc[:, p1], count_df.loc[:, p2], grp_args, colors=colors, alpha=0.5, - pval=pval, view_lims=view_lims, labels=labels, rasterized=RASTER) + sscat = aqplt.scatter_grouped(count_df.loc[:, p1], count_df.loc[:, p2], grp_args, + colors=colors, alpha=0.5, pval=pval, view_lims=view_lims, labels=labels, + rasterized=RASTER) sscat.set_title('%s vs %s' % (p1, p2)) sscat.set_xlabel("Log$_{2}$ normalized reads in " + p1) @@ -382,15 +393,16 @@ def scatter_dges(count_df, dges, output_prefix, view_lims, classes=None, pval=0. 
sscat.figure.savefig(pdf_name) -def load_raw_counts(raw_counts_file: str) -> pd.DataFrame: +def load_raw_counts(raw_counts_file: str, class_fillna: str) -> pd.DataFrame: """Loads a raw_counts CSV as a DataFrame Args: raw_counts_file: The raw counts CSV produced by tiny-count + class_fillna: the value to fill for NaN values in multiindex level 1 Returns: The raw counts DataFrame with a multiindex of (feature id, classifier) """ - return set_counts_table_multiindex(pd.read_csv(raw_counts_file)) + return set_counts_table_multiindex(pd.read_csv(raw_counts_file), class_fillna) def load_rule_counts(rule_counts_file: str) -> pd.DataFrame: @@ -404,20 +416,21 @@ def load_rule_counts(rule_counts_file: str) -> pd.DataFrame: return pd.read_csv(rule_counts_file, index_col=0) -def load_norm_counts(norm_counts_file: str) -> pd.DataFrame: +def load_norm_counts(norm_counts_file: str, class_fillna: str) -> pd.DataFrame: """Loads a norm_counts CSV as a DataFrame Args: norm_counts_file: The norm counts CSV produced by DESeq2 + class_fillna: the value to fill for NaN values in multiindex level 1 Returns: The raw counts DataFrame with a multiindex of (feature id, classifier) """ - return set_counts_table_multiindex(pd.read_csv(norm_counts_file)) + return set_counts_table_multiindex(pd.read_csv(norm_counts_file), class_fillna) -def set_counts_table_multiindex(counts: pd.DataFrame, fillna='_UNKNOWN_') -> pd.DataFrame: +def set_counts_table_multiindex(counts: pd.DataFrame, fillna: str) -> pd.DataFrame: """Assigns a multiindex composed of (Feature ID, Classifier) to the counts table, - and fills NaN values in the classifier""" + and fills NaN values in the classifier column""" level0 = "Feature ID" level1 = "Classifier" @@ -530,11 +543,11 @@ def setup(args: argparse.Namespace) -> dict: fetched: Dict[str, Union[pd.DataFrame, pd.Series, dict, None]] = {} input_getters = { 'ld_matrices_dict': lambda: get_len_dist_dict(args.len_dist), - 'raw_counts_df': lambda: load_raw_counts(args.raw_counts), 
'mapped_reads_ds': lambda: load_mapped_reads(args.summary_stats), - 'norm_counts_df': lambda: load_norm_counts(args.norm_counts), 'rule_counts_df': lambda: load_rule_counts(args.rule_counts), - 'de_table_df': lambda: load_dge_tables(args.dge_tables), + 'norm_counts_df': lambda: load_norm_counts(args.norm_counts, args.unknown_class), + 'raw_counts_df': lambda: load_raw_counts(args.raw_counts, args.unknown_class), + 'de_table_df': lambda: load_dge_tables(args.dge_tables, args.unknown_class), 'sample_rep_dict': lambda: get_sample_rep_dict(fetched["norm_counts_df"]), 'norm_counts_avg_df': lambda: get_sample_averages(fetched["norm_counts_df"], fetched["sample_rep_dict"]), 'feat_classes_df': lambda: get_flat_classes(fetched["norm_counts_df"]), @@ -554,14 +567,12 @@ def setup(args: argparse.Namespace) -> dict: @report_execution_time("Plotter runtime") def main(): - """ - Main routine - """ + args = get_args() validate_inputs(args) global aqplt, RASTER - aqplt = lib(args.style_sheet) + aqplt = plotterlib(args.style_sheet) RASTER = not args.vector_scatter inputs = setup(args) @@ -576,7 +587,7 @@ def main(): kwd = {} elif plot == 'class_charts': func = class_charts - arg = (inputs["class_counts_df"], inputs['mapped_reads_ds'], args.out_prefix) + arg = (inputs["class_counts_df"], inputs['mapped_reads_ds'], args.out_prefix, args.unassigned_class) kwd = {} elif plot == 'rule_charts': func = rule_charts @@ -623,6 +634,6 @@ def err(e): '2. 
Run "tiny replot --config your_run_config.yml"\n\t' ' (that\'s the processed run config) ^^^\n\n', file=sys.stderr) + if __name__ == '__main__': main() - From e0748a0664390d74d265b8e0b47804f75af06b55 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Tue, 18 Oct 2022 14:06:08 -0700 Subject: [PATCH 07/16] Including new parameters for labelling unknown/unassigned classes --- doc/Parameters.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/doc/Parameters.md b/doc/Parameters.md index b8ca2d63..c19634fc 100644 --- a/doc/Parameters.md +++ b/doc/Parameters.md @@ -254,6 +254,14 @@ The scatter plots produced by tiny-plot have rasterized points by default. This The min and/or max bounds for plotted lengths can be set with this option. See [tiny-plot's documentation](tiny-plot.md#length-bounds) for more information about how these values are determined if they aren't set. +### Labels for Class-related Plots +| Run Config Key | Commandline Argument | +|------------------------|----------------------| +| plot_unknown_class: | `--unknown-class` | +| plot_unassigned_class: | `--unassigned-class` | + +The labels that should be used for special groups in `class_charts` and `sample_avg_scatter_by_dge_class` plots. The "unknown" class group represents counts which were assigned by a Features Sheet rule which lacked a "Classify as..." label. The "unassigned" class group represents counts which weren't assigned to a feature. + ### Full tiny-plot Help String ``` tiny-plot [-rc RAW_COUNTS] [-nc NORM_COUNTS] [-uc RULE_COUNTS] @@ -318,4 +326,11 @@ Optional arguments: len_dist plots will start at this value -lda VALUE, --len-dist-max VALUE len_dist plots will end at this value + -una LABEL, --unassigned-class LABEL + Use this label in class-related plots for unassigned + counts + -unk LABEL, --unknown-class LABEL + Use this label in class-related plots for counts which + were assigned by rules lacking a "Classify as..." 
+ value ``` \ No newline at end of file From 2f4da09775ca600d6b6a3420045eab7f38249480 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Tue, 18 Oct 2022 15:32:04 -0700 Subject: [PATCH 08/16] Updated the Features Sheet example for the new column configuration --- doc/Configuration.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/Configuration.md b/doc/Configuration.md index 4dbf5869..04c5f434 100644 --- a/doc/Configuration.md +++ b/doc/Configuration.md @@ -124,9 +124,9 @@ Supported values are: DESeq2 requires that your experiment design has at least one degree of freedom. If your experiment doesn't include at least one sample group with more than one replicate, tiny-deseq.r will be skipped and DGE related plots will not be produced. ## Features Sheet Details -| _Column:_ | Select for... | with value... | Alias by... | Tag | Hierarchy | Strand | 5' End Nucleotide | Length | Overlap | Feature Source | -|------------|---------------|---------------|-------------|-----|-----------|--------|-------------------|--------|-------------|----------------| -| _Example:_ | Class | miRNA | Name | | 1 | sense | all | all | 5' anchored | ram1.gff3 | +| _Column:_ | Select for... | with value... | Alias by... | Classify as... | Hierarchy | Strand | 5' End Nucleotide | Length | Overlap | Feature Source | +|------------|---------------|---------------|-------------|----------------|-----------|--------|-------------------|--------|-------------|----------------| +| _Example:_ | Class | miRNA | Name | miRNA | 1 | sense | all | all | 5' anchored | ram1.gff3 | The Features Sheet allows you to define selection rules that determine how features are chosen when multiple features are found overlap an alignment locus. Selected features are "assigned" a portion of the reads associated with the alignment. 
From 1c142dc35a4cab233e414d5154e5819d38acb136 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Tue, 18 Oct 2022 15:34:38 -0700 Subject: [PATCH 09/16] Description of class counting via the Class= attribute has been removed from the file requirements table. Description of the feature_counts.csv output has also been updated --- README.md | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index f3903cc6..01f4ee53 100644 --- a/README.md +++ b/README.md @@ -90,12 +90,12 @@ tiny get-template ### Requirements for User-Provided Input Files -| Input Type | File Extension | Requirements | -|----------------------------------------------------------------------------|-------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Reference annotations
[(example)](START_HERE/reference_data/ram1.gff3) | GFF3 / GFF2 / GTF | Column 9 attributes (defined as "tag=value" or "tag "): | -| Sequencing data
[(example)](START_HERE/fastq_files) | FASTQ(.gz) | Files must be demultiplexed. | -| Reference genome
[(example)](START_HERE/reference_data/ram1.fa) | FASTA | Chromosome identifiers (e.g. Chr1): | -| Bowtie indexes (optional) 1 | ebwt | Must be small indexes (.ebwtl indexes are not supported) | +| Input Type | File Extension | Requirements | +|----------------------------------------------------------------------------|-------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Reference annotations
[(example)](START_HERE/reference_data/ram1.gff3) | GFF3 / GFF2 / GTF | Column 9 attributes (defined as "tag=value" or "tag "): | +| Sequencing data
[(example)](START_HERE/fastq_files) | FASTQ(.gz) | Files must be demultiplexed. | +| Reference genome
[(example)](START_HERE/reference_data/ram1.fa) | FASTA | Chromosome identifiers (e.g. Chr1): | +| Bowtie indexes (optional) 1 | ebwt | Must be small indexes (.ebwtl indexes are not supported) |
1 Bowtie indexes can be created for you. See the [configuration file documentation](doc/Configuration.md#building-bowtie-indexes). @@ -175,17 +175,20 @@ A "collapsed" FASTA contains unique reads found in fastp's quality filtered FAST The tiny-count step produces a variety of outputs #### Feature Counts -Custom Python scripts and HTSeq are used to generate a single table of feature counts that includes columns for each library analyzed. A feature's _Feature ID_ and _Feature Class_ are simply the values of its `ID` and `Class` attributes. Features lacking a Class attribute will be assigned class `_UNKNOWN_`. We have also included a _Feature Name_ column which displays aliases of your choice, as specified in the _Alias by..._ column of the Features Sheet. If _Alias by..._ is set to`ID`, the _Feature Name_ column is left empty. +Custom Python scripts and HTSeq are used to generate a single table of feature counts which includes each counted library. Each matched feature is represented with the following metadata columns: +- **_Feature ID_** is determined, in order of preference, by one of the following GFF column 9 attributes: `ID`, `gene_id`, `Parent`. +- **_Classifier_** is determined by the rules in your Features Sheet. It is the _Classify as..._ value of each matching rule. Since multiple rules can match a feature, some Feature IDs will be listed multiple times with different classifiers. +- **_Feature Name_** displays aliases of your choice, as specified in the _Alias by..._ column of the Features Sheet. If _Alias by..._ is set to`ID`, the _Feature Name_ column is left empty. -For example, if your Features Sheet has a rule which specifies _Alias by..._ `sequence_name` and the GFF entry for this feature has the following attributes column: +For example, if your Features Sheet has a rule which specifies _Alias by..._ `sequence_name`, _Classify as..._ `miRNA`, and the GFF entry for this feature has the following attributes column: ``` -... 
ID=406904;sequence_name=mir-1,hsa-miR-1;Class=miRNA; ... +... ID=406904;sequence_name=mir-1,hsa-miR-1; ... ``` The row for this feature in the feature counts table would read: -| Feature ID | Feature Name | Feature Class | Group1_rep_1 | Group1_rep_2 | ... | -|------------|------------------|---------------|--------------|--------------|-----| -| 406904 | mir-1, hsa-miR-1 | miRNA | 1234 | 999 | ... | +| Feature ID | Classifier | Feature Name | Group1_rep_1 | Group1_rep_2 | ... | +|------------|------------|------------------|--------------|--------------|-----| +| 406904 | miRNA | mir-1, hsa-miR-1 | 1234 | 999 | ... | #### Normalized Counts If your Samples Sheet has settings for Normalization, an additional copy of the Feature Counts table is produced with the specified per-library normalizations applied. From 34e51fd3bbe436290ef6fae49fb555236564db12 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Tue, 18 Oct 2022 15:36:40 -0700 Subject: [PATCH 10/16] The Tagged Counting section has been rewritten to describe the new "classified" counting approach. --- doc/tiny-count.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/tiny-count.md b/doc/tiny-count.md index ad5e1314..c5e447a6 100644 --- a/doc/tiny-count.md +++ b/doc/tiny-count.md @@ -26,11 +26,14 @@ Selection occurs in three stages, with the output of each stage as input to the 3. Finally, features are selected for read assignment based on the small RNA attributes of the alignment locus. Once reads are assigned to a feature, they are excluded from matches with larger hierarchy values. ## Stage 1: Feature Attribute Parameters -| _features.csv columns:_ | Select for... | with value... | Tag | -|-------------------------|---------------|---------------|-----| +| _features.csv columns:_ | Select for... | with value... | Classify as... 
| +|-------------------------|---------------|---------------|----------------| Each feature's column 9 attributes are searched for the key-value combinations defined in the `Select for...` and `with value...` columns. Features, and the rules they matched, are retained for later evaluation at alignment loci in Stages 2 and 3. +#### Feature Classification +You can optionally specify a classifier for each rule. These classifiers are later used to group and label counts in class-related plots. Features that match rules with a classifier are counted separately; the classifier becomes part of the feature's ID to create a distinct "sub-feature", and these sub-features continue to be treated as distinct in downstream DGE analysis. Classified features receive counts exclusively from the rule(s) which hold the same `Classify as...` value. Counts from multiple rules can be pooled by using the same classifier. In the final counts table, this value is displayed in the Classifier column of features matching the rule, and each feature-classifier pair is shown on its own row. + #### Value Lists Attribute keys are allowed to have multiple comma separated values, and these values are treated as a list; only one of the listed values needs to match the `with value...` to be considered a valid match to the rule. For example, if a rule contained `Class` and `WAGO` in these columns, then a feature with attributes `... ;Class=CSR,WAGO; ...` would be considered a match for the rule. @@ -39,9 +42,6 @@ Attribute keys are allowed to have multiple comma separated values, and these va #### Wildcard Support Wildcard values (`all`, `*`, or an empty cell) can be used in the `Select for...` / `with value...` fields. 
With this functionality you can evaluate features for the presence of an attribute key without regarding its values, or you can check all attribute keys for the presence of a specific value, or you can skip Stage 1 selection altogether to permit the evaluation of the complete feature set in Stage 2. In the later case, feature-rule matching pairs still serve as the basis for selection; each rule still applies only to its matching subset from previous Stages. -#### Tagged Counting (advanced) -You can optionally specify a tag for each rule. Feature assignments resulting from tagged rules will have reads counted separately from those assigned by non-tagged rules. This essentially creates a new "sub-feature" for each feature that a tagged rule matches, and these "sub-features" are treated as distinct during downstream DGE analysis. Additionally, these counts subsets can be pooled across any number of rules by specifying the same tag. We recommend using tag names which _do not_ pertain to the `Select for...` / `with value...` in order to avoid potentially confusing results in class-related plots. - ## Stage 2: Overlap and Hierarchy Parameters | _features.csv columns:_ | Hierarchy | Overlap | |-------------------------|-----------|---------| From ee7ca26690ed26cc5d6f390d9c4a405e2e245aaf Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Tue, 18 Oct 2022 15:39:08 -0700 Subject: [PATCH 11/16] Removed references to Class= counting in the class_charts description. The count normalization section has been removed because it no longer applies with the new counting method. A link to the corresponding parameters has been added to the _UNKNOWN_ and _UNASSIGNED_ sections. 
--- doc/tiny-plot.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/doc/tiny-plot.md b/doc/tiny-plot.md index 8b60d37a..4b9ed158 100644 --- a/doc/tiny-plot.md +++ b/doc/tiny-plot.md @@ -64,19 +64,21 @@ Percentage label darkness and bar colors reflect the magnitude of the rule's con ## class_charts -Features can have multiple classes associated with them, so it is useful to see the proportions of counts by class. The class_charts plot type shows the percentage of _mapped_ reads that were assigned to features by class. Each feature's associated classes are determined by the `Class=` attribute in your GFF files. +Features can have multiple classes associated with them, so it is useful to see the proportions of counts by class. The class_charts plot type shows the percentage of _mapped_ reads that were assigned to features by class. Each feature's associated classes are determined by the rules that it matched during Stage 1 selection, and are therefore determined by its GFF annotations.

class_chart with 8 classes

#### Class \_UNASSIGNED_ -This category represents the percentage of mapped reads that were unassigned. Sources of unassigned reads include: +This category represents the percentage of mapped reads that weren't assigned to any features. Sources of unassigned reads include: - A lack of features passing selection at alignment loci - Alignments which do not overlap with any features -#### Count Normalization -A feature with multiple associated classes will have its counts split evenly across these classes before being grouped and summed. +You can customize this label using the [unassigned class parameter](Parameters.md#labels-for-class-related-plots). + +#### Class \_UNKNOWN_ +This category represents the percentage of mapped reads that matched rules which did not have a specified `Classify as...` value. You can customize this label using the [unknown class parameter](Parameters.md#labels-for-class-related-plots). #### Class Chart Styles Proportions in rule_charts and class_charts are plotted using the same function. Styles are the same between the two. See [rule chart styles](#rule-chart-styles) for more info.
From d240b024713654637825cb59eb084eb0825e4195 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Tue, 18 Oct 2022 15:39:52 -0700 Subject: [PATCH 12/16] Adding new parameters for unassigned/unknown class labels to the CWL and Run Configs --- START_HERE/run_config.yml | 6 ++++++ tiny/cwl/tools/tiny-plot.cwl | 14 ++++++++++++++ tiny/cwl/workflows/tinyrna_wf.cwl | 4 ++++ tiny/templates/run_config_template.yml | 6 ++++++ 4 files changed, 30 insertions(+) diff --git a/START_HERE/run_config.yml b/START_HERE/run_config.yml index fc48ef7c..06ca1c54 100644 --- a/START_HERE/run_config.yml +++ b/START_HERE/run_config.yml @@ -297,6 +297,12 @@ plot_vector_points: False plot_len_dist_min: plot_len_dist_max: +##-- Use this label in class plots for counts assigned by rules lacking a classifier --## +plot_unknown_class: "_UNKNOWN_" + +##-- Use this label in class plots for unassigned counts --## +plot_unassigned_class: "_UNASSIGNED_" + ######----------------------------- OUTPUT DIRECTORIES ------------------------------###### # diff --git a/tiny/cwl/tools/tiny-plot.cwl b/tiny/cwl/tools/tiny-plot.cwl index b4490b77..f9309aba 100644 --- a/tiny/cwl/tools/tiny-plot.cwl +++ b/tiny/cwl/tools/tiny-plot.cwl @@ -74,6 +74,20 @@ inputs: prefix: -lda doc: "The last length to plot in the range for len_dist plots" + unknown_class_label: + type: string? + inputBinding: + prefix: -unk + doc: \ + 'Use this label in class-related plots for counts which were ' + 'assigned by rules lacking a "Classify as..." value' + + unassigned_class_label: + type: string? + inputBinding: + prefix: -una + doc: 'Use this label in class-related plots for unassigned counts' + out_prefix: type: string? inputBinding: diff --git a/tiny/cwl/workflows/tinyrna_wf.cwl b/tiny/cwl/workflows/tinyrna_wf.cwl index c1f67ada..dd8ddc15 100644 --- a/tiny/cwl/workflows/tinyrna_wf.cwl +++ b/tiny/cwl/workflows/tinyrna_wf.cwl @@ -102,6 +102,8 @@ inputs: plot_len_dist_max: int? plot_style_sheet: File? plot_pval: float? 
+ plot_unknown_class: string? + plot_unassigned_class: string? # output directory names dir_name_bt_build: string @@ -256,6 +258,8 @@ steps: pickValue: all_non_null valueFrom: | $(self.length ? self[0] : null) + unknown_class_label: plot_unknown_class + unassigned_class_label: plot_unassigned_class dge_pval: plot_pval style_sheet: plot_style_sheet out_prefix: run_name diff --git a/tiny/templates/run_config_template.yml b/tiny/templates/run_config_template.yml index 71d85e99..4cc54670 100644 --- a/tiny/templates/run_config_template.yml +++ b/tiny/templates/run_config_template.yml @@ -297,6 +297,12 @@ plot_vector_points: False plot_len_dist_min: plot_len_dist_max: +##-- Use this label in class plots for counts assigned by rules lacking a classifier --## +plot_unknown_class: "_UNKNOWN_" + +##-- Use this label in class plots for unassigned counts --## +plot_unassigned_class: "_UNASSIGNED_" + ######----------------------------- OUTPUT DIRECTORIES ------------------------------###### # From 436b8751f3c23ee421a5b049daa695d6dc845943 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Tue, 18 Oct 2022 15:40:22 -0700 Subject: [PATCH 13/16] Updating column config in the features.csv template --- tiny/templates/features.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tiny/templates/features.csv b/tiny/templates/features.csv index 57ec4a8e..b038764c 100755 --- a/tiny/templates/features.csv +++ b/tiny/templates/features.csv @@ -1,4 +1,4 @@ -Select for...,with value...,Alias by...,Tag,Hierarchy,Strand,5' End Nucleotide,Length,Overlap,Feature Source +Select for...,with value...,Alias by...,Classify as...,Hierarchy,Strand,5' End Nucleotide,Length,Overlap,Feature Source Class,mask,Alias,,1,both,all,all,Partial,../../START_HERE/reference_data/ram1.gff3 Class,miRNA,Alias,,2,sense,all,16-22,Full,../../START_HERE/reference_data/ram1.gff3 Class,piRNA,Alias,5pA,2,both,A,24-32,Full,../../START_HERE/reference_data/ram1.gff3 From 
1d95e196dac702289f40a89400b1e76097513335 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Tue, 18 Oct 2022 16:10:18 -0700 Subject: [PATCH 14/16] Added a brief check in the CSV reader to catch usage of Features Sheets with a Tag column. Previously, the user would have been notified that the "Classify as..." column was missing from their Features Sheet, which isn't quite as helpful. --- tiny/rna/configuration.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tiny/rna/configuration.py b/tiny/rna/configuration.py index b4c18c33..fab620b2 100644 --- a/tiny/rna/configuration.py +++ b/tiny/rna/configuration.py @@ -406,6 +406,7 @@ def validate_csv_header(self, header: OrderedDict): doc_reference = self.tinyrna_sheet_fields[self.doctype] expected = {key.lower() for key in doc_reference.keys()} read_vals = {val.lower() for val in header.values() if val is not None} + self.check_backward_compatibility(read_vals) unknown = {col_name for col_name in read_vals if col_name not in expected} missing = expected - read_vals @@ -422,6 +423,17 @@ def validate_csv_header(self, header: OrderedDict): # Remap column order to match client's self.fieldnames = tuple(doc_ref_lowercase[key] for key in header_lowercase.values()) + def check_backward_compatibility(self, header_vals): + if self.doctype == "Features Sheet" and "tag" in header_vals: + raise ValueError('\n'.join([ + "It looks like you're using a Features Sheet from a version of tinyRNA", + 'that offered "tagged counting". The "Tag" header has been repurposed as a feature', + "classifier and its meaning within the pipeline has changed. Additionally, feature", + "class is no longer determined by the Class= attribute. Please review the Stage 1", + 'section in tiny-count\'s documentation, then rename the "Tag" column to', + '"Classify as..." to avoid this error.' 
+ ])) + if __name__ == '__main__': Configuration.main() \ No newline at end of file From fe49acb558fe2893cd9241ed19c8a1dea9c7d8bf Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Wed, 19 Oct 2022 12:51:37 -0700 Subject: [PATCH 15/16] Unit tests have been updated for the new class/tagged counting approach --- tests/unit_test_helpers.py | 2 +- tests/unit_tests_counter.py | 6 +-- tests/unit_tests_hts_parsing.py | 67 +++++++++++++++------------------ 3 files changed, 34 insertions(+), 41 deletions(-) diff --git a/tests/unit_test_helpers.py b/tests/unit_test_helpers.py index 0084cfc1..bd5a0fbc 100644 --- a/tests/unit_test_helpers.py +++ b/tests/unit_test_helpers.py @@ -14,7 +14,7 @@ rules_template = [{'Identity': ("Name", "N/A"), 'Strand': "both", 'Hierarchy': 0, - 'Tag': '', + 'Class': '', 'nt5end': "all", 'Length': "all", # A string is expected by FeatureSelector due to support for lists and ranges 'Overlap': "partial"}] diff --git a/tests/unit_tests_counter.py b/tests/unit_tests_counter.py index 6d9b7cc3..4231db86 100644 --- a/tests/unit_tests_counter.py +++ b/tests/unit_tests_counter.py @@ -33,7 +33,7 @@ def setUpClass(self): 'Select for...': "Class", 'with value...': "CSR", 'Alias by...': "Alias", - 'Tag': '', + 'Classify as...': '', 'Hierarchy': "1", 'Strand': "antisense", "5' End Nucleotide": '"C,G,U"', # Needs to be double-quoted due to commas @@ -47,7 +47,7 @@ def setUpClass(self): _row = self.csv_feat_row_dict self.parsed_feat_rule = [{ 'Identity': (_row['Select for...'], _row['with value...']), - 'Tag': _row['Tag'], + 'Class': _row['Classify as...'], 'Hierarchy': int(_row['Hierarchy']), 'Strand': _row['Strand'], 'nt5end': _row["5' End Nucleotide"].upper().translate({ord('U'): 'T'}), @@ -81,7 +81,7 @@ def setUpClass(self): @staticmethod def csv(type, rows, header=()): if type == "features.csv": - header = ['Select for...', 'with value...', 'Alias by...', 'Tag', 'Hierarchy', + header = ['Select for...', 'with value...', 'Alias by...', 
'Classify as...', 'Hierarchy', 'Strand', "5' End Nucleotide", 'Length', 'Overlap', 'Feature Source'] elif type == "samples.csv": header = ['Input FASTQ Files', 'Sample/Group Name', 'Replicate Number', 'Control', 'Normalization'] diff --git a/tests/unit_tests_hts_parsing.py b/tests/unit_tests_hts_parsing.py index fe741193..5cdf38f8 100644 --- a/tests/unit_tests_hts_parsing.py +++ b/tests/unit_tests_hts_parsing.py @@ -136,23 +136,22 @@ def test_ref_tables_single_feature(self): feature_source = {self.short_gff_file: ["sequence_name"]} feature_selector = self.selector_with_template([ # Fails to match due to Identity selector - {'Identity': ("Class", "CSR"), 'Strand': "sense", 'Hierarchy': 1, 'Tag': '', 'nt5end': "all", + {'Identity': ("Class", "CSR"), 'Strand': "sense", 'Hierarchy': 1, 'Class': 'none', 'nt5end': "all", 'Overlap': 'full', 'Length': "20"}, # Match - {'Identity': ("biotype", "snoRNA"), 'Strand': "antisense", 'Hierarchy': 2, 'Tag': '', 'nt5end': "all", + {'Identity': ("biotype", "snoRNA"), 'Strand': "antisense", 'Hierarchy': 2, 'Class': 'tag', 'nt5end': "all", 'Overlap': 'partial', 'Length': "30"} ]) iv = HTSeq.GenomicInterval("I", 3746, 3909, "-") kwargs = {'all_features': True} - feats, alias, classes, _ = ReferenceTables(feature_source, feature_selector, **kwargs).get() + feats, alias, tags = ReferenceTables(feature_source, feature_selector, **kwargs).get() steps = list(feats[iv].array[iv.start:iv.end].get_steps(values_only=True)) - tagged_feat_id = ("Gene:WBGene00023193", '') - self.assertEqual((type(feats), type(alias), type(classes)), (HTSeq.GenomicArrayOfSets, dict, dict)) - self.assertEqual(steps, [{(("Gene:WBGene00023193", ''), False, ((1, 2, IntervalPartialMatch(iv)),))}]) + self.assertEqual((type(feats), type(alias), type(tags)), (HTSeq.GenomicArrayOfSets, dict, defaultdict)) + self.assertEqual(steps, [{(("Gene:WBGene00023193", 'tag'), False, ((1, 2, IntervalPartialMatch(iv)),))}]) self.assertEqual(alias, {"Gene:WBGene00023193": ('Y74C9A.6',)}) 
- self.assertEqual(classes, {"Gene:WBGene00023193": ('additional_class', 'unknown')}) + self.assertDictEqual(tags, {"Gene:WBGene00023193": {('Gene:WBGene00023193', 'tag')}}) """Repeating the previous test with all_features=False should produce the same result for this test.""" @@ -161,21 +160,22 @@ def test_ref_tables_single_feature_all_features_false(self): feature_source = {self.short_gff_file: ["sequence_name"]} feature_selector = self.selector_with_template([ # Fails to match due to Identity selector - {'Identity': ("Class", "CSR"), 'Strand': "sense", 'Hierarchy': 1, 'Tag': '', 'nt5end': "all", 'Length': "20", - 'Overlap': 'full'}, + {'Identity': ("Class", "CSR"), 'Strand': "sense", 'Hierarchy': 1, 'Class': 'none', 'nt5end': "all", + 'Overlap': 'full', 'Length': "20"}, # Match - {'Identity': ("biotype", "snoRNA"), 'Strand': "antisense", 'Hierarchy': 2, 'Tag': '', 'nt5end': "all", + {'Identity': ("biotype", "snoRNA"), 'Strand': "antisense", 'Hierarchy': 2, 'Class': 'tag', 'nt5end': "all", 'Overlap': 'partial', 'Length': "30"} ]) iv = HTSeq.GenomicInterval("I", 3746, 3909, "-") + kwargs = {'all_features': True} - feats, alias, classes, _ = ReferenceTables(feature_source, feature_selector, **kwargs).get() + feats, alias, tags = ReferenceTables(feature_source, feature_selector, **kwargs).get() steps = list(feats[iv].array[iv.start:iv.end].get_steps(values_only=True)) - self.assertEqual((type(feats), type(alias), type(classes)), (HTSeq.GenomicArrayOfSets, dict, dict)) - self.assertEqual(steps, [{(("Gene:WBGene00023193", ''), False, ((1, 2, IntervalPartialMatch(iv)),))}]) - self.assertEqual(alias, {"Gene:WBGene00023193": ('Y74C9A.6',)}) - self.assertEqual(classes, {"Gene:WBGene00023193": ('additional_class', 'unknown')}) + self.assertEqual((type(feats), type(alias), type(tags)), (HTSeq.GenomicArrayOfSets, dict, defaultdict)) + self.assertEqual(steps, [{(("Gene:WBGene00023193", 'tag'), False, ((1, 2, IntervalPartialMatch(iv)),))}]) + self.assertDictEqual(alias, 
{"Gene:WBGene00023193": ('Y74C9A.6',)}) + self.assertDictEqual(tags, {"Gene:WBGene00023193": {('Gene:WBGene00023193', 'tag')}}) """Repeating previous test with all_features=False as this yields different results""" @@ -221,7 +221,7 @@ def test_ref_tables_alias_multisource_concat(self): # Notice: screening for "ID" name attribute happens earlier in counter.load_config() expected_alias = {"Gene:WBGene00023193": ("additional_class", "Gene:WBGene00023193", "unknown")} - _, alias, _, _ = ReferenceTables(feature_source, MockFeatureSelector([]), **kwargs).get() + _, alias, _ = ReferenceTables(feature_source, MockFeatureSelector([]), **kwargs).get() self.assertDictEqual(alias, expected_alias) @@ -262,7 +262,7 @@ def test_ref_tables_identity_matches_multisource_concat(self): set() ] - feats, _, _, _ = ReferenceTables(feature_source, feature_selector, **kwargs).get() + feats, _, _ = ReferenceTables(feature_source, feature_selector, **kwargs).get() actual_matches = list(feats.chrom_vectors['I']['.'].array.get_steps(values_only=True)) self.assertListEqual(actual_matches, expected_matches) @@ -274,7 +274,7 @@ def test_ref_tables_discontinuous_aliases(self): feature_source = {f"{resources}/discontinuous.gff3": ["Name"]} mock_selector = self.selector_with_template(helpers.rules_template) - _, alias, _, _ = ReferenceTables(feature_source, mock_selector, **kwargs).get() + _, alias, _ = ReferenceTables(feature_source, mock_selector, **kwargs).get() # Ancestor depth of 1, distinct aliases self.assertEqual(alias['Parent2'], ('Child2Name', 'Parent2Name')) @@ -362,7 +362,7 @@ def test_ref_tables_discontinuous_identity_matches(self): {(Sibling, False, (rule3_sib['139:150'], rule2_sib['139:150']))}, set()] - feats, _, _, _ = ReferenceTables(feature_source, feature_selector, **rt_kwargs).get() + feats, _, _ = ReferenceTables(feature_source, feature_selector, **rt_kwargs).get() actual_steps = list(feats.chrom_vectors["I"]["."].array.get_steps(values_only=True)) 
self.assertListEqual(actual_steps, expected) @@ -379,7 +379,7 @@ def test_ref_tables_discontinuous_features(self): # being evaluated during stage 2 selection. expected = [set()] - feats, _, _, _ = ReferenceTables(feature_source, feature_selector, **kwargs).get() + feats, _, _ = ReferenceTables(feature_source, feature_selector, **kwargs).get() actual = list(feats.chrom_vectors["I"]["."].array.get_steps(values_only=True)) self.assertListEqual(actual, expected) @@ -402,12 +402,11 @@ def test_ref_tables_source_filter(self): exp_parents = {'ParentWithGrandparent': 'GrandParent', 'Child1': 'ParentWithGrandparent', 'Child2': 'Parent2'} rt = ReferenceTables(feature_source, feature_selector, **kwargs) - feats, alias, classes, _ = rt.get() + feats, alias, _ = rt.get() self.assertEqual(alias, exp_alias) self.assertEqual(rt.parents, exp_parents) self.assertEqual(rt.filtered, exp_filtered) - self.assertEqual(classes, exp_classes) self.assertEqual(list(feats.chrom_vectors['I']['.'].array.get_steps(values_only=True)), exp_feats) self.assertDictEqual(rt.intervals, exp_intervals) self.clear_filters() @@ -430,13 +429,12 @@ def test_ref_tables_type_filter(self): exp_parents = {'ParentWithGrandparent': 'GrandParent', 'Child1': 'ParentWithGrandparent', 'Child2': 'Parent2'} rt = ReferenceTables(feature_source, feature_selector, **kwargs) - feats, alias, classes, _ = rt.get() + feats, alias, _ = rt.get() self.assertEqual(alias, exp_alias) self.assertEqual(rt.intervals, exp_intervals) self.assertEqual(rt.parents, exp_parents) self.assertEqual(rt.filtered, exp_filtered) - self.assertEqual(classes, exp_classes) self.assertEqual(list(feats.chrom_vectors['I']['.'].array.get_steps(values_only=True)), exp_feats) self.clear_filters() @@ -449,11 +447,10 @@ def test_ref_tables_both_filter(self): feature_selector = self.selector_with_template(helpers.rules_template) rt = ReferenceTables(feature_source, feature_selector, **kwargs) - feats, alias, classes, _ = rt.get() + feats, alias, _ = rt.get() 
self.assertEqual(rt.filtered, {'Child1', 'Child2'}) self.assertEqual(rt.parents, {'ParentWithGrandparent': 'GrandParent', 'Child1': 'ParentWithGrandparent', 'Child2': 'Parent2'}) - self.assertEqual(list(classes.keys()), ['GrandParent', 'Parent2', 'Sibling']) self.assertEqual(list(alias.keys()), ['GrandParent', 'Parent2', 'Sibling']) self.assertEqual(len(list(feats.chrom_vectors['I']['.'].array.get_steps(values_only=True))), 1) # single empty set self.clear_filters() @@ -472,13 +469,12 @@ def test_ref_tables_tagged_match_single(self): feat_id = "Gene:WBGene00023193" feature_source = {f"{resources}/single.gff3": ["sequence_name"]} feature_selector = self.selector_with_template([ - {'Identity': ("ID", feat_id), 'Tag': "tagged_match", 'Hierarchy': 1}, - {'Identity': ("ID", feat_id), 'Tag': "", 'Hierarchy': 2} + {'Identity': ("ID", feat_id), 'Class': "tagged_match", 'Hierarchy': 1}, + {'Identity': ("ID", feat_id), 'Class': "", 'Hierarchy': 2} ]) expected_tags = {feat_id: {(feat_id, "tagged_match"), (feat_id, '')}} expected_aliases = {feat_id: ('Y74C9A.6',)} - expected_classes = {feat_id: ('additional_class', 'unknown')} iv = IntervalPartialMatch(HTSeq.GenomicInterval('n/a', 3746, 3909)) expected_feats = [ set(), { @@ -488,12 +484,11 @@ def test_ref_tables_tagged_match_single(self): set() ] - feats, aliases, classes, tags = ReferenceTables(feature_source, feature_selector, **kwargs).get() + feats, aliases, tags = ReferenceTables(feature_source, feature_selector, **kwargs).get() actual_feats = list(feats.chrom_vectors['I']['.'].array.get_steps(values_only=True)) self.assertListEqual(actual_feats, expected_feats) self.assertDictEqual(aliases, expected_aliases) - self.assertDictEqual(classes, expected_classes) self.assertDictEqual(tags, expected_tags) """Does ReferenceTables.get() correctly merge records for discontinuous features matching multiple tagged rules?""" @@ -503,14 +498,13 @@ def test_ref_tables_tagged_match_merging(self): # All rules match the same root feature 
feature_selector = self.selector_with_template([ - {'Identity': ("UniqueAttr", "child2"), 'Tag': "shared", 'Hierarchy': 1}, - {'Identity': ("ID", "Parent2"), 'Tag': "shared", 'Hierarchy': 2}, - {'Identity': ("ID", "Child2"), 'Tag': "", 'Hierarchy': 3} + {'Identity': ("UniqueAttr", "child2"), 'Class': "shared", 'Hierarchy': 1}, + {'Identity': ("ID", "Parent2"), 'Class': "shared", 'Hierarchy': 2}, + {'Identity': ("ID", "Child2"), 'Class': "", 'Hierarchy': 3} ]) expected_tags = {'Parent2': {('Parent2', 'shared'), ('Parent2', '')}} expected_aliases = {'Parent2': ('Child2Name', 'Parent2Name')} - expected_classes = {'Parent2': ('NA',)} Parent2_iv = IntervalPartialMatch(HTSeq.GenomicInterval('n/a', 19, 30)) Child2_iv = IntervalPartialMatch(HTSeq.GenomicInterval('n/a', 39, 50)) @@ -526,12 +520,11 @@ def test_ref_tables_tagged_match_merging(self): set() ] - feats, aliases, classes, tags = ReferenceTables(feature_source, feature_selector).get() + feats, aliases, tags = ReferenceTables(feature_source, feature_selector).get() stepvec = list(feats.chrom_vectors['I']['.'].array.get_steps(values_only=True)) self.assertListEqual(stepvec, expected_feats) self.assertDictEqual(aliases, expected_aliases) - self.assertDictEqual(classes, expected_classes) self.assertDictEqual(tags, expected_tags) """Does SAM_reader._get_decollapsed_filename() create an appropriate filename?""" From a2fd95578a07f3b9db03778cb0513213df38fd73 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Fri, 21 Oct 2022 17:35:23 -0700 Subject: [PATCH 16/16] Slight refactor to move ReferenceTables.finalize_tables() back into ReferenceTables.get(). The get() function is significantly shorter after the recent changes for GFF validation, so it can accommodate the finalization routine without becoming too crowded. 
--- tiny/rna/counter/hts_parsing.py | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py index a3ee92e9..170d89c3 100644 --- a/tiny/rna/counter/hts_parsing.py +++ b/tiny/rna/counter/hts_parsing.py @@ -429,8 +429,8 @@ def parse_gff(file, row_fn: Callable, alias_keys=None): # Type aliases for human readability AliasTable = DefaultDict[str, Tuple[str]] +TagTable = DefaultDict[str, Set[Tuple[str, str]]] GenomicArray = HTSeq.GenomicArrayOfSets -Tags = DefaultDict[str, Set[str]] class ReferenceTables: @@ -478,13 +478,20 @@ def __init__(self, gff_files: Dict[str, list], feature_selector, **prefs): setattr(HTSeq.features.GFF_Reader, 'parse_GFF_attribute_string', staticmethod(parse_GFF_attribute_string)) @report_execution_time("GFF parsing") - def get(self) -> Tuple[GenomicArray, AliasTable, dict]: - """Initiates GFF parsing and returns the resulting reference tables""" + def get(self) -> Tuple[GenomicArray, AliasTable, TagTable]: + """Initiates GFF parsing and returns complete feature, alias, and tag tables""" for file, alias_keys in self.gff_files.items(): parse_gff(file, self.parse_row, alias_keys=alias_keys) - return self.finalize_tables() + self._finalize_aliases() + self._finalize_features() + + if self.get_feats_table_size() == 0 and self.all_features is False: + raise ValueError("No features were retained while parsing your GFF file.\n" + "This may be due to a lack of features matching 'Select for...with value...'") + + return self.feats, self.alias, self.tags def parse_row(self, row, alias_keys=None): if row.type.lower() == "chromosome" or not self.filter_match(row): @@ -610,19 +617,6 @@ def exclude_row(self, row): self.parents[feature_id] = self.get_row_parent(feature_id, row.attr) self.chrom_vector_setdefault(row.iv.chrom) - def finalize_tables(self) -> Tuple[GenomicArray, AliasTable, dict]: - """Convert sets to sorted tuples for performance, hashability, 
and deterministic outputs - Matches must also be assigned to intervals now that all intervals are known""" - - self._finalize_aliases() - self._finalize_features() - - if self.get_feats_table_size() == 0 and self.all_features is False: - raise ValueError("No features were retained while parsing your GFF file.\n" - "This may be due to a lack of features matching 'Select for...with value...'") - - return self.feats, self.alias, self.tags - def _finalize_aliases(self): self.alias = {feat: tuple(sorted(aliases, key=str.lower)) for feat, aliases in self.alias.items()}