From e77c243aae57a96320f6ae44c3ea32d9742257ef Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Sat, 15 Oct 2022 17:32:28 -0700
Subject: [PATCH 01/16] Changing the internal and external names of the Tag
column
---
START_HERE/features.csv | 2 +-
tiny/rna/configuration.py | 2 +-
tiny/rna/counter/counter.py | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/START_HERE/features.csv b/START_HERE/features.csv
index 959a1e73..bd2cc089 100755
--- a/START_HERE/features.csv
+++ b/START_HERE/features.csv
@@ -1,4 +1,4 @@
-Select for...,with value...,Alias by...,Tag,Hierarchy,Strand,5' End Nucleotide,Length,Overlap,Feature Source
+Select for...,with value...,Alias by...,Classify as...,Hierarchy,Strand,5' End Nucleotide,Length,Overlap,Feature Source
Class,mask,Alias,,1,both,all,all,Partial,./reference_data/ram1.gff3
Class,miRNA,Alias,,2,sense,all,16-22,Full,./reference_data/ram1.gff3
Class,piRNA,Alias,5pA,2,both,A,24-32,Full,./reference_data/ram1.gff3
diff --git a/tiny/rna/configuration.py b/tiny/rna/configuration.py
index dde86823..b4c18c33 100644
--- a/tiny/rna/configuration.py
+++ b/tiny/rna/configuration.py
@@ -365,7 +365,7 @@ class CSVReader(csv.DictReader):
"Select for...": "Key",
"with value...": "Value",
"Alias by...": "Name",
- "Tag": "Tag",
+ "Classify as...": "Class",
"Hierarchy": "Hierarchy",
"Strand": "Strand",
"5' End Nucleotide": "nt5end",
diff --git a/tiny/rna/counter/counter.py b/tiny/rna/counter/counter.py
index c7f48f0b..da90e2b7 100644
--- a/tiny/rna/counter/counter.py
+++ b/tiny/rna/counter/counter.py
@@ -143,7 +143,7 @@ def load_config(features_csv: str, is_pipeline: bool) -> Tuple[List[dict], Dict[
rules, gff_files = list(), defaultdict(list)
for row in CSVReader(features_csv, "Features Sheet").rows():
- rule = {col: row[col] for col in ["Tag", "Hierarchy", "Strand", "nt5end", "Length", "Overlap"]}
+ rule = {col: row[col] for col in ["Class", "Hierarchy", "Strand", "nt5end", "Length", "Overlap"]}
rule['nt5end'] = rule['nt5end'].upper().translate({ord('U'): 'T'}) # Convert RNA base to cDNA base
rule['Identity'] = (row['Key'], row['Value']) # Create identity tuple
rule['Hierarchy'] = int(rule['Hierarchy']) # Convert hierarchy to number
From bc2adac643f8a1a921ec7b7b0e1eb2913207f95c Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Sat, 15 Oct 2022 17:36:46 -0700
Subject: [PATCH 02/16] First draft changes in tiny-count
Removing the accounting of the Class= attribute from ReferenceTables, along with usages of this output in the Features data class and in the FeatureCounts output file class. Also correcting some erroneous use of the StepVector typehint (this should have been GenomicArray).
The Feature Class column has been removed from the output feature_counts.csv table, and the Tag column has been renamed to "Classifier"
---
tiny/rna/counter/features.py | 4 +---
tiny/rna/counter/hts_parsing.py | 39 +++++++++++++--------------------
tiny/rna/counter/statistics.py | 6 +----
3 files changed, 17 insertions(+), 32 deletions(-)
diff --git a/tiny/rna/counter/features.py b/tiny/rna/counter/features.py
index 675d5c45..63fe4191 100644
--- a/tiny/rna/counter/features.py
+++ b/tiny/rna/counter/features.py
@@ -17,13 +17,11 @@
class Features(metaclass=Singleton):
chrom_vectors: HTSeq.ChromVector
aliases: dict
- classes: dict
tags: dict
- def __init__(_, features: HTSeq.GenomicArrayOfSets, aliases: dict, classes: dict, tags: dict):
+ def __init__(_, features: HTSeq.GenomicArrayOfSets, aliases: dict, tags: dict):
Features.chrom_vectors = features.chrom_vectors # For interval -> feature record tuple lookups
Features.aliases = aliases # For feature ID -> preferred feature name lookups
- Features.classes = classes # For feature ID -> class lookups
Features.tags = tags # For feature ID -> match IDs
diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py
index 4e577285..9e49c9de 100644
--- a/tiny/rna/counter/hts_parsing.py
+++ b/tiny/rna/counter/hts_parsing.py
@@ -428,8 +428,8 @@ def parse_gff(file, row_fn: Callable, alias_keys=None):
# Type aliases for human readability
-ClassTable = AliasTable = DefaultDict[str, Tuple[str]]
-StepVector = HTSeq.GenomicArrayOfSets
+AliasTable = DefaultDict[str, Tuple[str]]
+GenomicArray = HTSeq.GenomicArrayOfSets
Tags = DefaultDict[str, Set[str]]
@@ -471,7 +471,6 @@ def __init__(self, gff_files: Dict[str, list], feature_selector, **prefs):
self.parents, self.filtered = {}, set() # Original Feature ID
self.intervals = defaultdict(list) # Root Feature ID
self.matches = defaultdict(set) # Root Match ID
- self.classes = defaultdict(set) # Root Feature ID
self.alias = defaultdict(set) # Root Feature ID
self.tags = defaultdict(set) # Root Feature ID -> Root Match ID
@@ -479,7 +478,7 @@ def __init__(self, gff_files: Dict[str, list], feature_selector, **prefs):
setattr(HTSeq.features.GFF_Reader, 'parse_GFF_attribute_string', staticmethod(parse_GFF_attribute_string))
@report_execution_time("GFF parsing")
- def get(self) -> Tuple[StepVector, AliasTable, ClassTable, dict]:
+ def get(self) -> Tuple[GenomicArray, AliasTable, dict]:
"""Initiates GFF parsing and returns the resulting reference tables"""
for file, alias_keys in self.gff_files.items():
@@ -494,14 +493,14 @@ def parse_row(self, row, alias_keys=None):
# Grab the primary key for this feature
feature_id = self.get_feature_id(row)
- # Get feature's classes and identity match tuples
- matches, classes = self.get_matches_and_classes(row.attr)
+ # Perform Stage 1 selection
+ matches = self.get_matches(row.attr)
# Only add features with identity matches if all_features is False
if not self.all_features and not len(matches):
self.exclude_row(row)
return
# Add feature data to root ancestor in the reference tables
- root_id = self.add_feature(feature_id, row, matches, classes)
+ root_id = self.add_feature(feature_id, row, matches)
# Add alias to root ancestor if it is unique
self.add_alias(root_id, alias_keys, row.attr)
@@ -531,14 +530,13 @@ def get_feature_ancestors(self, feature_id: str, row_attrs: CaseInsensitiveAttrs
return lineage
- def add_feature(self, feature_id: str, row, matches: defaultdict, classes: set) -> str:
- """Adds the feature to classes and intervals tables under its root ID, and to the matches table
+ def add_feature(self, feature_id: str, row, matches: defaultdict) -> str:
+ """Adds the feature to the intervals table under its root ID, and to the matches table
under its tagged ID. Note: matches are later assigned to intervals in _finalize_features()"""
lineage = self.get_feature_ancestors(feature_id, row.attr)
root_id = self.get_root_feature(lineage)
- self.classes[root_id] |= classes
if row.iv not in self.intervals[root_id]:
self.intervals[root_id].append(row.iv)
@@ -562,25 +560,22 @@ def add_alias(self, root_id: str, alias_keys: List[str], row_attrs: CaseInsensit
for row_val in row_attrs.get(alias_key, ()):
self.alias[root_id].add(row_val)
- def get_matches_and_classes(self, row_attrs: CaseInsensitiveAttrs) -> Tuple[DefaultDict, set]:
- """Grabs classes and match tuples from attributes that match identity rules (Stage 1 Selection)"""
-
- row_attrs.setdefault("Class", ("_UNKNOWN_",))
- classes = {c for c in row_attrs["Class"]}
+ def get_matches(self, row_attrs: CaseInsensitiveAttrs) -> DefaultDict:
+ """Performs Stage 1 selection and returns match tuples under their associated classifier"""
identity_matches = defaultdict(set)
for ident, rule_indexes in self.selector.inv_ident.items():
if row_attrs.contains_ident(ident):
for index in rule_indexes:
# Non-tagged matches are pooled under '' empty string
- tag = self.selector.rules_table[index]['Tag']
+ tag = self.selector.rules_table[index]['Class']
identity_matches[tag].add((
index,
self.selector.rules_table[index]['Hierarchy'],
self.selector.rules_table[index]['Overlap']
))
# -> identity_matches: {tag: (rule, rank, overlap), ...}
- return identity_matches, classes
+ return identity_matches
def get_row_parent(self, feature_id: str, row_attrs: CaseInsensitiveAttrs) -> str:
"""Get the current feature's parent while cooperating with filtered features"""
@@ -615,11 +610,10 @@ def exclude_row(self, row):
self.parents[feature_id] = self.get_row_parent(feature_id, row.attr)
self.chrom_vector_setdefault(row.iv.chrom)
- def finalize_tables(self) -> Tuple[StepVector, AliasTable, ClassTable, dict]:
+ def finalize_tables(self) -> Tuple[GenomicArray, AliasTable, dict]:
"""Convert sets to sorted tuples for performance, hashability, and deterministic outputs
Matches must also be assigned to intervals now that all intervals are known"""
- self._finalize_classes()
self._finalize_aliases()
self._finalize_features()
@@ -627,10 +621,10 @@ def finalize_tables(self) -> Tuple[StepVector, AliasTable, ClassTable, dict]:
raise ValueError("No features were retained while parsing your GFF file.\n"
"This may be due to a lack of features matching 'Select for...with value...'")
- return self.feats, self.alias, self.classes, self.tags
+ return self.feats, self.alias, self.tags
def _finalize_features(self):
- """Assigns matches to their corresponding intervals by populating StepVector with match tuples"""
+ """Assigns matches to their corresponding intervals by populating GenomicArray with match tuples"""
for root_id, unmerged_sub_ivs in self.intervals.items():
merged_sub_ivs = self._merge_adjacent_subintervals(unmerged_sub_ivs)
@@ -676,9 +670,6 @@ def _merge_adjacent_subintervals(unmerged_sub_ivs: List[HTSeq.GenomicInterval])
def _finalize_aliases(self):
self.alias = {feat: tuple(sorted(aliases, key=str.lower)) for feat, aliases in self.alias.items()}
- def _finalize_classes(self):
- self.classes = {feat: tuple(sorted(classes, key=str.lower)) for feat, classes in self.classes.items()}
-
def get_feats_table_size(self) -> int:
"""Returns the sum of features across all chromosomes and strands"""
diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py
index 8d625564..33527154 100644
--- a/tiny/rna/counter/statistics.py
+++ b/tiny/rna/counter/statistics.py
@@ -186,7 +186,6 @@ def print_warnings(self):
class FeatureCounts(MergedStat):
def __init__(self, Features_obj):
self.feat_counts_df = pd.DataFrame(index=set.union(*Features_obj.tags.values()))
- self.classes = Features_obj.classes
self.aliases = Features_obj.aliases
self.norm_prefs = {}
@@ -220,11 +219,8 @@ def write_feature_counts(self) -> pd.DataFrame:
summary = self.sort_cols_and_round(self.feat_counts_df)
# Add Feature Name column, which is the feature alias (default is blank if no alias exists)
summary.insert(0, "Feature Name", summary.index.map(lambda feat: ', '.join(self.aliases.get(feat[0], ''))))
- # Add Classes column for classes associated with the given feature
- feat_class_map = lambda feat: ', '.join(self.classes[feat[0]])
- summary.insert(1, "Feature Class", summary.index.map(feat_class_map))
# Sort by index, make index its own column, and rename it to Feature ID
- summary = summary.sort_index().reset_index().rename(columns={"level_0": "Feature ID", "level_1": "Tag"})
+ summary = summary.sort_index().reset_index().rename(columns={"level_0": "Feature ID", "level_1": "Classifier"})
summary.to_csv(self.prefix + '_feature_counts.csv', index=False)
return summary
From 5f24e6c19d9db799ab597c9a6c2a749aa6d80407 Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Sat, 15 Oct 2022 17:51:33 -0700
Subject: [PATCH 03/16] tiny-deseq.r has been updated to handle feature counts
using the new table format. The Feature Class column has been dropped and the
Tag column has been renamed to Classifier.
Also further improved code flexibility for tiny-deseq.r by not hardcoding character columns when calling write.csv
---
tiny/rna/tiny-deseq.r | 17 +++++++++--------
1 file changed, 9 insertions(+), 8 deletions(-)
diff --git a/tiny/rna/tiny-deseq.r b/tiny/rna/tiny-deseq.r
index 07ef86e1..391e2a3d 100755
--- a/tiny/rna/tiny-deseq.r
+++ b/tiny/rna/tiny-deseq.r
@@ -79,8 +79,8 @@ with_names <- function(vec){
}
## Original column names
-idx_cols <- with_names(c('Feature ID', 'Tag'))
-meta_cols <- with_names(c('Feature Name', 'Feature Class'))
+idx_cols <- with_names(c('Feature ID', 'Classifier'))
+meta_cols <- with_names('Feature Name')
char_cols <- with_names(unique(c(idx_cols, meta_cols)))
sample_cols <- with_names(colnames(header_row)[!(colnames(header_row) %in% char_cols)])
@@ -168,13 +168,13 @@ if (plot_pca){
}
## Get normalized counts and write them to CSV with original sample names in header
-deseq_counts <- df_with_metadata(DESeq2::counts(deseq_run, normalized=TRUE))
-colnames(deseq_counts) <- c(meta_cols, sample_cols)
+deseq_counts <- restore_multiindex(df_with_metadata(DESeq2::counts(deseq_run, normalized=TRUE)))
+colnames(deseq_counts) <- c(idx_cols, meta_cols, sample_cols)
write.csv(
- restore_multiindex(deseq_counts),
+ deseq_counts,
paste(out_pref, "norm_counts.csv", sep="_"),
row.names=FALSE,
- quote=1:4,
+ quote=which(colnames(deseq_counts) %in% char_cols),
)
if (has_control){
@@ -190,15 +190,16 @@ if (has_control){
}
write_dge_table <- function (dge_df, cond1, cond2){
+ dge_df <- restore_multiindex(dge_df)
write.csv(
- restore_multiindex(dge_df),
+ dge_df,
paste(out_pref,
"cond1", cond1,
"cond2", cond2,
"deseq_table.csv",
sep="_"),
row.names=FALSE,
- quote=1:4,
+ quote=which(colnames(dge_df) %in% char_cols),
)
}
From dace53d54b0f0614118f3ae854599eedeec4f1da Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Mon, 17 Oct 2022 18:44:30 -0700
Subject: [PATCH 04/16] class_charts and scatter_by_dge_class have been updated
to treat the Tag multiindex column as a classifier. Wow, this opened the door
for some really satisfying simplifications to the class-related codebase.
Backward compatibility is not offered with this commit. I've considered ways to offer it, but ultimately I think doing so would be a bad idea. Number one, the counting semantics of the old Feature Class vs Tag columns are completely different; these inputs wouldn't be interchangeable.
I've also removed the show_unknown option from scatter_dges() because we have yet to use this feature, and we've never added the option to any user-facing config files.
---
tiny/rna/plotter.py | 139 +++++++++++---------------------------------
1 file changed, 34 insertions(+), 105 deletions(-)
diff --git a/tiny/rna/plotter.py b/tiny/rna/plotter.py
index 7ab2ca14..e6ded67d 100644
--- a/tiny/rna/plotter.py
+++ b/tiny/rna/plotter.py
@@ -324,48 +324,35 @@ def load_dge_tables(comparisons: list) -> pd.DataFrame:
print("Warning: multiple conditions matched in DGE filename. Using first match.")
comparison_name = "_vs_".join(comparison[0])
- table = pd.read_csv(dgefile).rename({"Unnamed: 0": "Feature ID"}, axis="columns")
- table = table.set_index(
- ["Feature ID", "Tag"]
- # For backward compatability
- if "Tag" in table.columns
- else "Feature ID")
+ table = set_counts_table_multiindex(pd.read_csv(dgefile))
de_table[comparison_name] = table['padj']
return de_table
-def scatter_dges(count_df, dges, output_prefix, view_lims, classes=None, show_unknown=True, pval=0.05):
+def scatter_dges(count_df, dges, output_prefix, view_lims, classes=None, pval=0.05):
"""Creates PDFs of all pairwise comparison scatter plots from a count table.
Can highlight classes and/or differentially expressed genes as different colors.
Args:
- count_df: A dataframe of counts per feature
+ count_df: A dataframe of counts per feature with multiindex (feature ID, classifier)
dges: A dataframe of differential gene table output to highlight
view_lims: A tuple of (min, max) data bounds for the plot view
output_prefix: A string to use as a prefix for saving files
- classes: An optional dataframe of class(es) per feature. If provided, points are grouped by class
- show_unknown: If true, class "unknown" will be included if highlighting by classes
+ classes: An optional feature-class multiindex. If provided, points are grouped by class
pval: The pvalue threshold for determining the outgroup
"""
if classes is not None:
- uniq_classes = sorted(list(pd.unique(classes)), key=str.lower)
+ uniq_classes = sorted(pd.unique(classes.get_level_values(1)), key=str.lower)
class_colors = aqplt.assign_class_colors(uniq_classes)
aqplt.set_dge_class_legend_style()
- if not show_unknown and 'unknown' in uniq_classes:
- uniq_classes.remove('unknown')
-
for pair in dges:
p1, p2 = pair.split("_vs_")
- # Get list of differentially expressed features for this comparison pair
- dge_list = list(dges.index[dges[pair] < pval])
- # Create series of feature -> class relationships
- class_dges = classes.loc[dge_list]
- # Gather lists of features by class (listed in order corresponding to unique_classes)
- grp_args = [class_dges.index[class_dges == cls].tolist() for cls in uniq_classes]
+ dge_dict = (dges[pair] < pval).groupby(level=1).groups
+ grp_args = [dge_dict[cls] for cls in uniq_classes]
labels = uniq_classes
sscat = aqplt.scatter_grouped(count_df.loc[:, p1], count_df.loc[:, p2], *grp_args, colors=class_colors,
@@ -400,15 +387,10 @@ def load_raw_counts(raw_counts_file: str) -> pd.DataFrame:
Args:
raw_counts_file: The raw counts CSV produced by tiny-count
Returns:
- The raw counts DataFrame (multiindex if it contains a Tag column)
+ The raw counts DataFrame with a multiindex of (feature id, classifier)
"""
- raw_counts = pd.read_csv(raw_counts_file)
- return tokenize_feature_classes(raw_counts)\
- .set_index(["Feature ID", "Tag"]
- # For backward compatability
- if "Tag" in raw_counts.columns
- else "Feature ID")
+ return set_counts_table_multiindex(pd.read_csv(raw_counts_file))
def load_rule_counts(rule_counts_file: str) -> pd.DataFrame:
@@ -416,7 +398,7 @@ def load_rule_counts(rule_counts_file: str) -> pd.DataFrame:
Args:
rule_counts_file: The rule counts CSV produced by tiny-count
Returns:
- The rule counts DataFrame
+ The raw counts DataFrame with a multiindex of (feature id, classifier)
"""
return pd.read_csv(rule_counts_file, index_col=0)
@@ -427,104 +409,51 @@ def load_norm_counts(norm_counts_file: str) -> pd.DataFrame:
Args:
norm_counts_file: The norm counts CSV produced by DESeq2
Returns:
- The norm counts DataFrame (multiindex if it contains a Tag column)
+ The raw counts DataFrame with a multiindex of (feature id, classifier)
"""
- norm_counts = pd.read_csv(norm_counts_file)\
- .rename({"Unnamed: 0": "Feature ID"}, axis="columns")
- return tokenize_feature_classes(norm_counts)\
- .set_index(["Feature ID", "Tag"]
- # For backward compatability
- if "Tag" in norm_counts.columns
- else "Feature ID")
+ return set_counts_table_multiindex(pd.read_csv(norm_counts_file))
-def tokenize_feature_classes(counts_df: pd.DataFrame) -> pd.DataFrame:
- """Parses each Feature Class element into a list. The resulting "lists" are
- tuples to allow for .groupby() operations
- Args:
- counts_df: a DataFrame containing Feature Class list strings
- Returns: a new DataFrame with tokenized Feature Class elements
- """
-
- tokenized = counts_df.copy()
- tokenized["Feature Class"] = (
- counts_df["Feature Class"]
- .str.split(',')
- .apply(lambda classes: tuple(c.strip() for c in classes))
- )
+def set_counts_table_multiindex(counts: pd.DataFrame, fillna='_UNKNOWN_') -> pd.DataFrame:
+ """Assigns a multiindex composed of (Feature ID, Classifier) to the counts table,
+ and fills NaN values in the classifier"""
- return tokenized
+ level0 = "Feature ID"
+ level1 = "Classifier"
+ counts[level1] = counts[level1].fillna(fillna)
+ return counts.set_index([level0, level1])
-def get_flat_classes(counts_df: pd.DataFrame) -> pd.Series:
- """Features with multiple associated classes must have these classes flattened
- Args:
- counts_df: A counts DataFrame with a list-parsed Feature Class column
- Returns:
- A Series with the same index as the input and (optionally tagged) single classes
- per index entry. Index repetitions are allowed for accommodating all classes.
- """
- # For backward compatability
- if isinstance(counts_df.index, pd.MultiIndex):
- counts_df["Feature Class"] = counts_df.apply(tag_classes, axis="columns")
+def get_flat_classes(counts_df: pd.DataFrame) -> pd.Index:
+ """Features with multiple associated classes are returned in flattened form
+ with one class per entry, yielding multiple entries for these features. During
+ earlier versions this required some processing, but now we can simply return
+ the multiindex of the counts_df.
- return counts_df["Feature Class"].explode()
-
-
-def tag_classes(row: pd.Series) -> tuple:
- """Appends a feature's tag to its (tokenized) classes, if present.
- Intended for use with df.apply()
Args:
- row: a series, as provided by .apply(... , axis="columns")
- Returns: a tuple of tagged classes for a feature
+ counts_df: A DataFrame with a multiindex of (feature ID, feature class)
+ Returns:
+ The counts_df multiindex
"""
- # For backward compatability
- tag = row.name[1] if type(row.name) is tuple else pd.NA
- return tuple(
- cls if pd.isna(tag) else f"{cls} ({tag})"
- for cls in row["Feature Class"]
- )
+ return counts_df.index
def get_class_counts(raw_counts_df: pd.DataFrame) -> pd.DataFrame:
- """Calculates class counts from the Raw Counts DataFrame
-
- If there are multiple classes associated with a feature, then that feature's
- counts are normalized across all associated classes.
+ """Calculates class counts from level 1 of the raw counts multiindex
Args:
- raw_counts_df: A DataFrame containing features and their associated classes and counts
+ raw_counts_df: A DataFrame with a multiindex of (feature ID, classifier)
Returns:
class_counts: A DataFrame with an index of classes and count entries, per comparison
"""
- # 1. Tokenize Feature Class lists and label each with the feature's Tag (if provided)
- tokenized = raw_counts_df.copy().drop("Feature Name", axis="columns", errors='ignore')
- tokenized['Feature Class'] = raw_counts_df.apply(tag_classes, axis="columns")
-
- # 2. Group and sum by Feature Class. Prepare for 3. by dividing by group's class count.
- grouped = (tokenized.groupby("Feature Class")
- .apply(lambda grp: grp.sum() / len(grp.name)))
-
- """
- # Sanity check!
- tokenized.groupby("Feature Class").apply(
- lambda x: print('\n' + str(x.name) + '\n'
- f"{x.iloc[:,1:].sum()} / {len(x.name)} =" + '\n' +
- f"{x.iloc[:,1:].sum() / len(x.name)}" + '\n' +
- f"Total: {(x.iloc[:,1:].sum() / len(x.name)).sum()}"
- )
- )"""
-
- # 3. Flatten class lists, regroup and sum by the normalized group counts from 2.
- class_counts = grouped.reset_index().explode("Feature Class").groupby("Feature Class").sum()
-
- # Sanity check. Tolerance of 1 is overprovisioned for floating point errors
- assert -1 < (class_counts.sum().sum() - raw_counts_df.iloc[:,2:].sum().sum()) < 1
- return class_counts
+ return (raw_counts_df
+ .drop("Feature Name", axis="columns", errors='ignore')
+ .groupby(level=1)
+ .sum())
def validate_inputs(args: argparse.Namespace) -> None:
From eef300a169d7cfc5e912f8ad7c7414257f7c1620 Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Tue, 18 Oct 2022 12:16:04 -0700
Subject: [PATCH 05/16] Renaming the "tags" attribute in the Features class to
"classes" for consistency.
---
tiny/rna/counter/features.py | 6 +++---
tiny/rna/counter/hts_parsing.py | 14 +++++++-------
tiny/rna/counter/statistics.py | 2 +-
3 files changed, 11 insertions(+), 11 deletions(-)
diff --git a/tiny/rna/counter/features.py b/tiny/rna/counter/features.py
index 63fe4191..44809d08 100644
--- a/tiny/rna/counter/features.py
+++ b/tiny/rna/counter/features.py
@@ -17,12 +17,12 @@
class Features(metaclass=Singleton):
chrom_vectors: HTSeq.ChromVector
aliases: dict
- tags: dict
+ classes: dict
- def __init__(_, features: HTSeq.GenomicArrayOfSets, aliases: dict, tags: dict):
+ def __init__(_, features: HTSeq.GenomicArrayOfSets, aliases: dict, classes: dict):
Features.chrom_vectors = features.chrom_vectors # For interval -> feature record tuple lookups
Features.aliases = aliases # For feature ID -> preferred feature name lookups
- Features.tags = tags # For feature ID -> match IDs
+ Features.classes = classes # For feature ID -> match IDs
class FeatureCounter:
diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py
index 9e49c9de..a3ee92e9 100644
--- a/tiny/rna/counter/hts_parsing.py
+++ b/tiny/rna/counter/hts_parsing.py
@@ -495,7 +495,7 @@ def parse_row(self, row, alias_keys=None):
feature_id = self.get_feature_id(row)
# Perform Stage 1 selection
matches = self.get_matches(row.attr)
- # Only add features with identity matches if all_features is False
+ # Skip features that lack matches unless all_features is True
if not self.all_features and not len(matches):
self.exclude_row(row)
return
@@ -567,9 +567,9 @@ def get_matches(self, row_attrs: CaseInsensitiveAttrs) -> DefaultDict:
for ident, rule_indexes in self.selector.inv_ident.items():
if row_attrs.contains_ident(ident):
for index in rule_indexes:
- # Non-tagged matches are pooled under '' empty string
- tag = self.selector.rules_table[index]['Class']
- identity_matches[tag].add((
+ # Unclassified matches are pooled under '' empty string
+ classifier = self.selector.rules_table[index]['Class']
+ identity_matches[classifier].add((
index,
self.selector.rules_table[index]['Hierarchy'],
self.selector.rules_table[index]['Overlap']
@@ -623,6 +623,9 @@ def finalize_tables(self) -> Tuple[GenomicArray, AliasTable, dict]:
return self.feats, self.alias, self.tags
+ def _finalize_aliases(self):
+ self.alias = {feat: tuple(sorted(aliases, key=str.lower)) for feat, aliases in self.alias.items()}
+
def _finalize_features(self):
"""Assigns matches to their corresponding intervals by populating GenomicArray with match tuples"""
@@ -667,9 +670,6 @@ def _merge_adjacent_subintervals(unmerged_sub_ivs: List[HTSeq.GenomicInterval])
return merged_ivs
- def _finalize_aliases(self):
- self.alias = {feat: tuple(sorted(aliases, key=str.lower)) for feat, aliases in self.alias.items()}
-
def get_feats_table_size(self) -> int:
"""Returns the sum of features across all chromosomes and strands"""
diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py
index 33527154..24c73cbf 100644
--- a/tiny/rna/counter/statistics.py
+++ b/tiny/rna/counter/statistics.py
@@ -185,7 +185,7 @@ def print_warnings(self):
class FeatureCounts(MergedStat):
def __init__(self, Features_obj):
- self.feat_counts_df = pd.DataFrame(index=set.union(*Features_obj.tags.values()))
+ self.feat_counts_df = pd.DataFrame(index=set.union(*Features_obj.classes.values()))
self.aliases = Features_obj.aliases
self.norm_prefs = {}
From 138ad24d94ecb4f8a88bd0ea3da3235e7de3da2a Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Tue, 18 Oct 2022 13:43:56 -0700
Subject: [PATCH 06/16] Adding two new command line options to tiny-plot:
--unassigned-class-label and --unknown-class-label. These options retain
their previous default values of _UNASSIGNED_ and _UNKNOWN_.
Class labels in class_charts are now sorted.
---
tiny/rna/plotter.py | 67 ++++++++++++++++++++++++++-------------------
1 file changed, 39 insertions(+), 28 deletions(-)
diff --git a/tiny/rna/plotter.py b/tiny/rna/plotter.py
index e6ded67d..59a9b2ea 100644
--- a/tiny/rna/plotter.py
+++ b/tiny/rna/plotter.py
@@ -6,7 +6,6 @@
"""
import multiprocessing as mp
import pandas as pd
-import numpy as np
import itertools
import traceback
import argparse
@@ -15,13 +14,16 @@
import re
from collections import defaultdict
-from typing import Optional, Dict, Union, Tuple, DefaultDict
+from typing import Dict, Union, Tuple, DefaultDict
from pkg_resources import resource_filename
from tiny.rna.configuration import timestamp_format
-from tiny.rna.plotterlib import plotterlib as lib
+from tiny.rna.plotterlib import plotterlib
from tiny.rna.util import report_execution_time, make_filename, SmartFormatter
+aqplt: plotterlib
+RASTER: bool
+
def get_args():
"""Get input arguments from the user/command line."""
@@ -65,7 +67,11 @@ def get_args():
help='len_dist plots will start at this value')
optional_args.add_argument('-lda', '--len-dist-max', metavar='VALUE', type=int,
help='len_dist plots will end at this value')
-
+ optional_args.add_argument('-una', '--unassigned-class', metavar='LABEL', default='_UNASSIGNED_',
+ help='Use this label in class-related plots for unassigned counts'),
+ optional_args.add_argument('-unk', '--unknown-class', metavar='LABEL', default='_UNKNOWN_',
+ help='Use this label in class-related plots for counts which were '
+ 'assigned by rules lacking a "Classify as..." value')
# Required arguments
required_args.add_argument('-p', '--plots', metavar='PLOT', required=True, nargs='+',
@@ -155,17 +161,19 @@ def get_len_dist_dict(files_list: list) -> DefaultDict[str, Dict[str, pd.DataFra
return matrices
-def class_charts(raw_class_counts: pd.DataFrame, mapped_reads: pd.Series, out_prefix: str, scale=2, **kwargs):
+def class_charts(raw_class_counts: pd.DataFrame, mapped_reads: pd.Series, out_prefix: str, class_na: str, scale=2, **kwargs):
"""Create a PDF of the proportion of raw assigned counts vs total mapped reads for each class
Args:
raw_class_counts: A dataframe containing RAW class counts per library
mapped_reads: A Series containing the total number of mapped reads per library
out_prefix: The prefix to use when naming output PDF plots
+ class_na: The label to use for representing unassigned reads
+ scale: The decimal scale for table percentages, and for determining inclusion of "unassigned"
kwargs: Additional keyword arguments to pass to pandas.DataFrame.plot()
"""
- class_props = get_proportions_df(raw_class_counts, mapped_reads, "_UNASSIGNED_", scale)
+ class_props = get_proportions_df(raw_class_counts, mapped_reads, class_na, scale).sort_index()
max_prop = class_props.max().max()
for library in raw_class_counts:
@@ -248,7 +256,7 @@ def get_sample_rep_dict(df: pd.DataFrame) -> dict:
"""
sample_dict = defaultdict(list)
- non_numeric_cols = ["Feature Class", "Feature Name"]
+ non_numeric_cols = ["Feature Name"]
for col in df.columns:
if col in non_numeric_cols: continue
@@ -302,12 +310,13 @@ def scatter_replicates(count_df: pd.DataFrame, samples: dict, output_prefix: str
rscat.figure.savefig(pdf_name)
-def load_dge_tables(comparisons: list) -> pd.DataFrame:
+def load_dge_tables(comparisons: list, class_fillna: str) -> pd.DataFrame:
"""Creates a new dataframe containing all features and padj values for each comparison
from a list of DGE tables.
Args:
comparisons: DGE tables (files) produced by DESeq2
+ class_fillna: the value to fill for NaN values in multiindex level 1
Returns:
de_table: A single table of padj values per feature/comparison
@@ -324,7 +333,7 @@ def load_dge_tables(comparisons: list) -> pd.DataFrame:
print("Warning: multiple conditions matched in DGE filename. Using first match.")
comparison_name = "_vs_".join(comparison[0])
- table = set_counts_table_multiindex(pd.read_csv(dgefile))
+ table = set_counts_table_multiindex(pd.read_csv(dgefile), class_fillna)
de_table[comparison_name] = table['padj']
@@ -355,8 +364,9 @@ def scatter_dges(count_df, dges, output_prefix, view_lims, classes=None, pval=0.
grp_args = [dge_dict[cls] for cls in uniq_classes]
labels = uniq_classes
- sscat = aqplt.scatter_grouped(count_df.loc[:, p1], count_df.loc[:, p2], *grp_args, colors=class_colors,
- pval=pval, view_lims=view_lims, labels=labels, rasterized=RASTER)
+ sscat = aqplt.scatter_grouped(count_df.loc[:, p1], count_df.loc[:, p2], *grp_args,
+ colors=class_colors, pval=pval, view_lims=view_lims, labels=labels,
+ rasterized=RASTER)
sscat.set_title('%s vs %s' % (p1, p2))
sscat.set_xlabel("Log$_{2}$ normalized reads in " + p1)
@@ -372,8 +382,9 @@ def scatter_dges(count_df, dges, output_prefix, view_lims, classes=None, pval=0.
labels = ['p < %g' % pval]
colors = aqplt.assign_class_colors(labels)
- sscat = aqplt.scatter_grouped(count_df.loc[:, p1], count_df.loc[:, p2], grp_args, colors=colors, alpha=0.5,
- pval=pval, view_lims=view_lims, labels=labels, rasterized=RASTER)
+ sscat = aqplt.scatter_grouped(count_df.loc[:, p1], count_df.loc[:, p2], grp_args,
+ colors=colors, alpha=0.5, pval=pval, view_lims=view_lims, labels=labels,
+ rasterized=RASTER)
sscat.set_title('%s vs %s' % (p1, p2))
sscat.set_xlabel("Log$_{2}$ normalized reads in " + p1)
@@ -382,15 +393,16 @@ def scatter_dges(count_df, dges, output_prefix, view_lims, classes=None, pval=0.
sscat.figure.savefig(pdf_name)
-def load_raw_counts(raw_counts_file: str) -> pd.DataFrame:
+def load_raw_counts(raw_counts_file: str, class_fillna: str) -> pd.DataFrame:
"""Loads a raw_counts CSV as a DataFrame
Args:
raw_counts_file: The raw counts CSV produced by tiny-count
+ class_fillna: the value to fill for NaN values in multiindex level 1
Returns:
The raw counts DataFrame with a multiindex of (feature id, classifier)
"""
- return set_counts_table_multiindex(pd.read_csv(raw_counts_file))
+ return set_counts_table_multiindex(pd.read_csv(raw_counts_file), class_fillna)
def load_rule_counts(rule_counts_file: str) -> pd.DataFrame:
@@ -404,20 +416,21 @@ def load_rule_counts(rule_counts_file: str) -> pd.DataFrame:
return pd.read_csv(rule_counts_file, index_col=0)
-def load_norm_counts(norm_counts_file: str) -> pd.DataFrame:
+def load_norm_counts(norm_counts_file: str, class_fillna: str) -> pd.DataFrame:
"""Loads a norm_counts CSV as a DataFrame
Args:
norm_counts_file: The norm counts CSV produced by DESeq2
+ class_fillna: the value to fill for NaN values in multiindex level 1
Returns:
 The norm counts DataFrame with a multiindex of (feature id, classifier)
"""
- return set_counts_table_multiindex(pd.read_csv(norm_counts_file))
+ return set_counts_table_multiindex(pd.read_csv(norm_counts_file), class_fillna)
-def set_counts_table_multiindex(counts: pd.DataFrame, fillna='_UNKNOWN_') -> pd.DataFrame:
+def set_counts_table_multiindex(counts: pd.DataFrame, fillna: str) -> pd.DataFrame:
"""Assigns a multiindex composed of (Feature ID, Classifier) to the counts table,
- and fills NaN values in the classifier"""
+ and fills NaN values in the classifier column"""
level0 = "Feature ID"
level1 = "Classifier"
@@ -530,11 +543,11 @@ def setup(args: argparse.Namespace) -> dict:
fetched: Dict[str, Union[pd.DataFrame, pd.Series, dict, None]] = {}
input_getters = {
'ld_matrices_dict': lambda: get_len_dist_dict(args.len_dist),
- 'raw_counts_df': lambda: load_raw_counts(args.raw_counts),
'mapped_reads_ds': lambda: load_mapped_reads(args.summary_stats),
- 'norm_counts_df': lambda: load_norm_counts(args.norm_counts),
'rule_counts_df': lambda: load_rule_counts(args.rule_counts),
- 'de_table_df': lambda: load_dge_tables(args.dge_tables),
+ 'norm_counts_df': lambda: load_norm_counts(args.norm_counts, args.unknown_class),
+ 'raw_counts_df': lambda: load_raw_counts(args.raw_counts, args.unknown_class),
+ 'de_table_df': lambda: load_dge_tables(args.dge_tables, args.unknown_class),
'sample_rep_dict': lambda: get_sample_rep_dict(fetched["norm_counts_df"]),
'norm_counts_avg_df': lambda: get_sample_averages(fetched["norm_counts_df"], fetched["sample_rep_dict"]),
'feat_classes_df': lambda: get_flat_classes(fetched["norm_counts_df"]),
@@ -554,14 +567,12 @@ def setup(args: argparse.Namespace) -> dict:
@report_execution_time("Plotter runtime")
def main():
- """
- Main routine
- """
+
args = get_args()
validate_inputs(args)
global aqplt, RASTER
- aqplt = lib(args.style_sheet)
+ aqplt = plotterlib(args.style_sheet)
RASTER = not args.vector_scatter
inputs = setup(args)
@@ -576,7 +587,7 @@ def main():
kwd = {}
elif plot == 'class_charts':
func = class_charts
- arg = (inputs["class_counts_df"], inputs['mapped_reads_ds'], args.out_prefix)
+ arg = (inputs["class_counts_df"], inputs['mapped_reads_ds'], args.out_prefix, args.unassigned_class)
kwd = {}
elif plot == 'rule_charts':
func = rule_charts
@@ -623,6 +634,6 @@ def err(e):
'2. Run "tiny replot --config your_run_config.yml"\n\t'
' (that\'s the processed run config) ^^^\n\n', file=sys.stderr)
+
if __name__ == '__main__':
main()
-
From e0748a0664390d74d265b8e0b47804f75af06b55 Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Tue, 18 Oct 2022 14:06:08 -0700
Subject: [PATCH 07/16] Including new parameters for labelling
unknown/unassigned classes
---
doc/Parameters.md | 15 +++++++++++++++
1 file changed, 15 insertions(+)
diff --git a/doc/Parameters.md b/doc/Parameters.md
index b8ca2d63..c19634fc 100644
--- a/doc/Parameters.md
+++ b/doc/Parameters.md
@@ -254,6 +254,14 @@ The scatter plots produced by tiny-plot have rasterized points by default. This
The min and/or max bounds for plotted lengths can be set with this option. See [tiny-plot's documentation](tiny-plot.md#length-bounds) for more information about how these values are determined if they aren't set.
+### Labels for Class-related Plots
+| Run Config Key | Commandline Argument |
+|------------------------|----------------------|
+| plot_unknown_class: | `--unknown-class` |
+| plot_unassigned_class: | `--unassigned-class` |
+
+The labels that should be used for special groups in `class_charts` and `sample_avg_scatter_by_dge_class` plots. The "unknown" class group represents counts which were assigned by a Features Sheet rule which lacked a "Classify as..." label. The "unassigned" class group represents counts which weren't assigned to a feature.
+
### Full tiny-plot Help String
```
tiny-plot [-rc RAW_COUNTS] [-nc NORM_COUNTS] [-uc RULE_COUNTS]
@@ -318,4 +326,11 @@ Optional arguments:
len_dist plots will start at this value
-lda VALUE, --len-dist-max VALUE
len_dist plots will end at this value
+ -una LABEL, --unassigned-class LABEL
+ Use this label in class-related plots for unassigned
+ counts
+ -unk LABEL, --unknown-class LABEL
+ Use this label in class-related plots for counts which
+ were assigned by rules lacking a "Classify as..."
+ value
```
\ No newline at end of file
From 2f4da09775ca600d6b6a3420045eab7f38249480 Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Tue, 18 Oct 2022 15:32:04 -0700
Subject: [PATCH 08/16] Updated the Features Sheet example for the new column
configuration
---
doc/Configuration.md | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/doc/Configuration.md b/doc/Configuration.md
index 4dbf5869..04c5f434 100644
--- a/doc/Configuration.md
+++ b/doc/Configuration.md
@@ -124,9 +124,9 @@ Supported values are:
DESeq2 requires that your experiment design has at least one degree of freedom. If your experiment doesn't include at least one sample group with more than one replicate, tiny-deseq.r will be skipped and DGE related plots will not be produced.
## Features Sheet Details
-| _Column:_ | Select for... | with value... | Alias by... | Tag | Hierarchy | Strand | 5' End Nucleotide | Length | Overlap | Feature Source |
-|------------|---------------|---------------|-------------|-----|-----------|--------|-------------------|--------|-------------|----------------|
-| _Example:_ | Class | miRNA | Name | | 1 | sense | all | all | 5' anchored | ram1.gff3 |
+| _Column:_ | Select for... | with value... | Alias by... | Classify as... | Hierarchy | Strand | 5' End Nucleotide | Length | Overlap | Feature Source |
+|------------|---------------|---------------|-------------|----------------|-----------|--------|-------------------|--------|-------------|----------------|
+| _Example:_ | Class | miRNA | Name | miRNA | 1 | sense | all | all | 5' anchored | ram1.gff3 |
 The Features Sheet allows you to define selection rules that determine how features are chosen when multiple features are found to overlap an alignment locus. Selected features are "assigned" a portion of the reads associated with the alignment.
From 1c142dc35a4cab233e414d5154e5819d38acb136 Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Tue, 18 Oct 2022 15:34:38 -0700
Subject: [PATCH 09/16] Description of class counting via the Class= attribute
has been removed from the file requirements table.
Description of the feature_counts.csv output has also been updated
---
README.md | 27 +++++++++++++++------------
1 file changed, 15 insertions(+), 12 deletions(-)
diff --git a/README.md b/README.md
index f3903cc6..01f4ee53 100644
--- a/README.md
+++ b/README.md
@@ -90,12 +90,12 @@ tiny get-template
### Requirements for User-Provided Input Files
-| Input Type | File Extension | Requirements |
-|----------------------------------------------------------------------------|-------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| Reference annotations
[(example)](START_HERE/reference_data/ram1.gff3) | GFF3 / GFF2 / GTF | Column 9 attributes (defined as "tag=value" or "tag "):