MontgomeryLab · taimontgomery · Dec 18, 2022 · Dec 12, 2022 · Dec 12, 2022 · Dec 12, 2022
diff --git a/START_HERE/run_config.yml b/START_HERE/run_config.yml
@@ -291,6 +291,11 @@ plot_unknown_class: "_UNKNOWN_"
 ##-- Use this label in class plots for unassigned counts --##
 plot_unassigned_class: "_UNASSIGNED_"
 
+##-- Optionally filter the classes in class scatter plots --##
+plot_class_scatter_filter:
+  style: include  # Choose: include or exclude
+  classes: []     # Add classes between [ and ], separated by commas
+
 
 ######----------------------------- OUTPUT DIRECTORIES ------------------------------######
 #
@@ -367,4 +372,17 @@ run_deseq: True
 #
 # The following configuration settings are automatically derived from the Features Sheet
 #
-######-------------------------------------------------------------------------------######
+######-------------------------------------------------------------------------------######
+
+
+
+######--------------------------- DERIVED FROM RUN CONFIG ---------------------------######
+#
+# The following configuration settings are automatically derived from this file
+#
+######-------------------------------------------------------------------------------######
+
+##-- Utilized by tiny-plot --##
+# Filters for class scatter plots
+plot_class_scatter_filter_include: []
+plot_class_scatter_filter_exclude: []
diff --git a/doc/Parameters.md b/doc/Parameters.md
@@ -243,13 +243,22 @@ The min and/or max bounds for plotted lengths can be set with this option. See [
 
 The labels that should be used for special groups in `class_charts` and `sample_avg_scatter_by_dge_class` plots. The "unknown" class group represents counts which were assigned by a Features Sheet rule which lacked a "Classify as..." label. The "unassigned" class group represents counts which weren't assigned to a feature.
 
+### Filtering Classes in DGE Class Scatter Plots
+| Run Config Key             | Commandline Argument |
+|----------------------------|----------------------|
+| plot_class_scatter_filter: | `--classes-include`  |
+|                            | `--classes-exclude`  |
+
+If an inclusive filter is used, then only the classes in the list, if present, are shown. If an exclusive filter is used, then the listed classes are omitted from the plot. This behavior extends to features whose P value is above the threshold. In the Run Config, the filter type can be set with the `style:` sub-key, and the desired list of classes for the filter can be provided between the brackets of the `classes:` sub-key.
+
 ### Full tiny-plot Help String
 ```
 tiny-plot [-rc RAW_COUNTS] [-nc NORM_COUNTS] [-uc RULE_COUNTS]
           [-ss STAT] [-dge COMPARISON [COMPARISON ...]]
           [-len 5P_LEN [5P_LEN ...]] [-o PREFIX] [-pv VALUE]
           [-s MPLSTYLE] [-v] [-ldi VALUE] [-lda VALUE] [-una LABEL]
-          [-unk LABEL] -p PLOT [PLOT ...]
+          [-unk LABEL] [-ic CLASS [CLASS ...] | -ec CLASS [CLASS ...]]
+          -p PLOT [PLOT ...]
 
 This script produces basic static plots for publication as part of the tinyRNA
 workflow.
@@ -267,7 +276,7 @@ Required arguments:
                         • rule_charts: A barchart showing percentages of
                           counts by matched rule.
                         • class_charts: A barchart showing percentages of
-                          counts per Classification.
+                          counts per classification.
                         • replicate_scatter: A scatter plot comparing
                           replicates for all count files given.
                         • sample_avg_scatter_by_dge: A scatter plot comparing
@@ -315,4 +324,10 @@ Optional arguments:
                         Use this label in class-related plots for counts which
                         were assigned by rules lacking a "Classify as..."
                         value
+  -ic CLASS [CLASS ...], --classes-include CLASS [CLASS ...]
+                        Only include these classes, if present, in class
+                        scatter plots (applies regardless of P value)
+  -ec CLASS [CLASS ...], --classes-exclude CLASS [CLASS ...]
+                        Omit these classes, if present, from class scatter
+                        plots (applies regardless of P value)
 ```
diff --git a/doc/tiny-plot.md b/doc/tiny-plot.md
@@ -109,7 +109,7 @@ The P value cutoff can be changed using the [Run Config or commandline arguments
 
 
 ## sample_avg_scatter_by_dge_class
-The previous plot type can be extended to group and color differentially expressed features by class. Classes are sorted by abundance before plotting to maximize representation.
+The previous plot type can be extended to group and color differentially expressed features by class. Classes are sorted by abundance before plotting to maximize representation. You can also filter the classes displayed using [plot_class_scatter_filter](Parameters.md#filtering-classes-in-dge-class-scatter-plots).
 
 <p float="left" align="center">
     <img src="../images/plots/scatter_dge_class.jpg" width="100%" alt="sample_avg_scatter_by_dge_class"/>

diff --git a/tests/unit_tests_plotter.py b/tests/unit_tests_plotter.py
@@ -1,8 +1,6 @@
-import sys
 import unittest
-from unittest.mock import patch, call
+from unittest.mock import patch
 
-import numpy as np
 import pandas as pd
 from pandas.testing import assert_frame_equal
 from pkg_resources import resource_filename
@@ -16,11 +14,32 @@ class MyTestCase(unittest.TestCase):
     def setUpClass(cls):
         cls.stylesheet = resource_filename('tiny', 'templates/tinyrna-light.mplstyle')
 
+    #====== HELPER METHODS ===================================================
+
     def get_label_width_pairs_from_annotations_mock(self, annotations):
         bar_widths = [i[1]['xycoords'].get_width() for i in annotations.call_args_list]
         bar_labels = [i[0][0] for i in annotations.call_args_list]
         return list(zip(bar_labels, bar_widths))
 
+    def aqplt_mock(self):
+        """Return a patcher that swaps tiny.rna.plotter.aqplt for a plotterlib instance
+        built from the stylesheet path that setUpClass stored in cls.stylesheet."""
+
+        return patch('tiny.rna.plotter.aqplt', lib.plotterlib(self.stylesheet))
+
+    def get_empty_scatter_dge_dataframes(self):
+        """Build zero-row counts and DGE tables, both indexed by (Feature ID, Classifier)."""
+
+        index_cols = ['Feature ID', 'Classifier']
+        counts_cols = index_cols + ['ConditionA', 'ConditionB']
+        dge_cols = index_cols + ['ConditionA_vs_ConditionB']
+
+        counts = pd.DataFrame(columns=counts_cols).set_index(index_cols)
+        dge = pd.DataFrame(columns=dge_cols).set_index(index_cols)
+        return counts, dge
+
+    #====== TESTS =============================================================
+
     """Are class counts properly calculated?"""
 
     def test_class_counts(self):
@@ -128,6 +147,37 @@ def test_proportion_chart_percentage_unassigned(self):
 
         assert_frame_equal(actual_below_thresh, expected_below_thresh, check_dtype=False, check_like=True)
 
+    """Does scatter_by_dge do the right thing when DataFrame inputs are empty?"""
+
+    def test_scatter_by_dge_empty_dataframes(self):
+        # Zero-row inputs: scatter_by_dge must never reach save_plot
+        empty_counts, empty_dge = self.get_empty_scatter_dge_dataframes()
+        save_patch = patch('tiny.rna.plotter.save_plot')
+        with save_patch as saver, self.aqplt_mock():
+            plotter.scatter_by_dge(empty_counts, empty_dge, 'dummy_prefix', (0, 0))
+        saver.assert_not_called()
+
+    """Does scatter_by_dge_class do the right thing when DataFrame inputs are empty?"""
+
+    def test_scatter_by_dge_class_empty_dataframes(self):
+        # Zero-row inputs: scatter_by_dge_class must never reach save_plot
+        empty_counts, empty_dge = self.get_empty_scatter_dge_dataframes()
+        save_patch = patch('tiny.rna.plotter.save_plot')
+        with save_patch as saver, self.aqplt_mock():
+            plotter.scatter_by_dge_class(empty_counts, empty_dge, 'dummy_prefix', (0, 0))
+        saver.assert_not_called()
+
+    """Does scatter_by_dge_class properly handle empty inclusive filter lists?"""
+
+    def test_scatter_dge_class_empty_inclusive_filter(self):
+        # Each filter kind gets an empty class list, in order: include, then exclude
+        counts, dge = self.get_empty_scatter_dge_dataframes()
+        grouped_patch = patch('tiny.rna.plotter.plotterlib.scatter_grouped')
+        with grouped_patch as grouped, self.aqplt_mock():
+            for kwargs in ({'include': []}, {'exclude': []}):
+                plotter.scatter_by_dge_class(counts, dge, 'dummy_prefix', (0, 0), **kwargs)
+                grouped.assert_not_called()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tiny/cwl/tools/tiny-plot.cwl b/tiny/cwl/tools/tiny-plot.cwl
@@ -88,6 +88,22 @@ inputs:
       prefix: -una
     doc: 'Use this label in class-related plots for unassigned counts'
 
+  classes_include:
+    type: string[]?
+    inputBinding:
+      prefix: -ic
+    doc: >-
+      Only include these classes, if present, in class scatter
+      plots (applies regardless of P value)
+
+  classes_exclude:
+    type: string[]?
+    inputBinding:
+      prefix: -ec
+    doc: >-
+      Omit these classes, if present, from class scatter plots
+      (applies regardless of P value)
+
   out_prefix:
     type: string?
     inputBinding:

diff --git a/tiny/cwl/workflows/tinyrna_wf.cwl b/tiny/cwl/workflows/tinyrna_wf.cwl
@@ -103,6 +103,8 @@ inputs:
   plot_pval: float?
   plot_unknown_class: string?
   plot_unassigned_class: string?
+  plot_class_scatter_filter_include: string[]?
+  plot_class_scatter_filter_exclude: string[]?
 
   # output directory names
   dir_name_bt_build: string
@@ -258,6 +260,8 @@ steps:
           $(self.length ? self[0] : null)
       unknown_class_label: plot_unknown_class
       unassigned_class_label: plot_unassigned_class
+      classes_include: plot_class_scatter_filter_include
+      classes_exclude: plot_class_scatter_filter_exclude
       dge_pval: plot_pval
       style_sheet: plot_style_sheet
       out_prefix: run_name

diff --git a/tiny/rna/configuration.py b/tiny/rna/configuration.py
@@ -136,6 +136,29 @@ def from_here(self, destination: Union[str,dict,None], origin: Union[str,dict,No
         else:
             return destination
 
+    def setup_step_inputs(self):
+        """Derive workflow step inputs from Run Config keys (for now, only tiny-plot needs this).
+        Called at both startup and resume. Raises AssertionError if the filter style is invalid."""
+
+        def setup_tiny_plot_inputs():  # plot_class_scatter_filter -> ..._include / ..._exclude
+            cs_filter = 'plot_class_scatter_filter'
+            style_req = ['include', 'exclude']
+            filter_conf = self.get(cs_filter, None) or {}  # backward compatibility: absent or empty key
+            classes = filter_conf.get('classes')
+            if not classes: return  # nothing to filter; derived keys are left untouched
+
+            # Validate filter style with an explicit raise: assert statements vanish under `python -O`
+            style = str(filter_conf.get('style')).lower()  # missing/empty style fails validation below
+            if style not in style_req:
+                raise AssertionError(f'{cs_filter} -> style: must be {" or ".join(style_req)}.')
+
+            # Assign the workflow key for the chosen style and reset the other filter(s)
+            self[f"{cs_filter}_{style}"] = classes.copy()
+            for other_style in style_req:
+                if other_style != style: self[f"{cs_filter}_{other_style}"] = []
+
+        setup_tiny_plot_inputs()
+
     def create_run_directory(self) -> str:
         """Create the destination directory for pipeline outputs"""
         run_dir = self["run_directory"]
@@ -192,6 +215,7 @@ def __init__(self, config_file: str, validate_inputs=False):
         self.setup_ebwt_idx()
         self.process_samples_sheet()
         self.process_features_sheet()
+        self.setup_step_inputs()
         if validate_inputs: self.validate_inputs()
 
     def load_paths_config(self):