diff --git a/START_HERE/run_config.yml b/START_HERE/run_config.yml index 9358ee6a..903843a7 100644 --- a/START_HERE/run_config.yml +++ b/START_HERE/run_config.yml @@ -285,6 +285,10 @@ plot_vector_points: False plot_len_dist_min: plot_len_dist_max: +##-- Optionally set the log2 min and/or max view limits for scatter_by_dge plots; auto if unset --## +plot_dge_scatter_min: +plot_dge_scatter_max: + ##-- Use this label in class plots for counts assigned by rules lacking a classifier --## plot_unknown_class: "_UNKNOWN_" @@ -322,7 +326,7 @@ dir_name_plotter: plots # ########################################################################################### -version: 1.2.1 +version: 1.2.2 ######--------------------------- DERIVED FROM PATHS FILE ---------------------------###### # diff --git a/dev/macos_pycharm_r_plugin_patch.py b/dev/macos_pycharm_r_plugin_patch.py new file mode 100644 index 00000000..f3513878 --- /dev/null +++ b/dev/macos_pycharm_r_plugin_patch.py @@ -0,0 +1,46 @@ + +"""Addresses the error 'RWrapper terminated, exit code: 134 ... rpath ...' + +This issue is specific to macOS. The R Language plugin for PyCharm crashes when R has been installed +via Conda. The issue is described here: https://youtrack.jetbrains.com/issue/R-1271 + +The issue is still present in PyCharm 2022.3.1. 
+This patch will need to be applied every time the plugin is updated.""" + +import platform +import sys +import os +import re + +from glob import glob + +if platform.system() != 'Darwin': + sys.exit("This patch is only for macOS") + +# Get PyCharm directory most recently modified (assumed to be the latest version) +home_dir = os.path.expanduser("~") +pycharm_dirs = glob(f"{home_dir}/Library/Application Support/JetBrains/PyCharm*") +latest_pycharm = sorted(pycharm_dirs, key=os.path.getmtime)[-1] + +# The replacement R function +patched_fn = """getLDLibraryPath <- function() { + conda_path <- Sys.getenv("CONDA_EXE", unset = NA) + if (!is.na(conda_path)) conda_path = dirname(dirname(conda_path)) + r_path <- Sys.getenv("R_HOME", unset = NA) + if (!is.na(r_path)) r_path = dirname(dirname(dirname(dirname(r_path)))) + + if (get_os() == "osx" && r_path != conda_path) Sys.getenv("DYLD_FALLBACK_LIBRARY_PATH") + else if (get_os() == "linux") Sys.getenv("LD_LIBRARY_PATH") + else "" +}""" + +# Apply the patch +target_file = f"{latest_pycharm}/plugins/r-plugin/R/GetEnvVars.R" +with open(target_file, "r+") as f: + file_contents = f.read() + patched = re.sub(r"getLDLibraryPath <- function\(\) {.*}", patched_fn, file_contents, flags=re.DOTALL) + + # Overwrite the file + f.seek(0) + f.write(patched) + f.truncate() \ No newline at end of file diff --git a/doc/Parameters.md b/doc/Parameters.md index 6bc2f52a..e1e5997d 100644 --- a/doc/Parameters.md +++ b/doc/Parameters.md @@ -235,6 +235,14 @@ The scatter plots produced by tiny-plot have rasterized points by default. This The min and/or max bounds for plotted lengths can be set with this option. See [tiny-plot's documentation](tiny-plot.md#length-bounds) for more information about how these values are determined if they aren't set. 
+### Bounds for scatter_by_dge Plots +| Run Config Key | Commandline Argument | +|-----------------------|----------------------| +| plot_dge_scatter_min: | `--dge-min VALUE` | +| plot_dge_scatter_max: | `--dge-max VALUE` | + +The min and/or max bounds for DGE scatter plots can be set with these options. The value you provide should be a log2 count value and can be whole or fractional, e.g. `--dge-min 1.9` would produce a plot whose first tick mark is labeled 2 and would include points for feature counts as low as 3.73. Unspecified bounds are automatically calculated to fit the data, and will include the margin specified by the `axes.[x/y]margin` key in the [Plot Stylesheet](Configuration.md#plot-stylesheet-details). + ### Labels for Class-related Plots | Run Config Key | Commandline Argument | |------------------------|----------------------| @@ -256,9 +264,9 @@ If an inclusive filter is used, then only the classes in the list, if present, a tiny-plot [-rc RAW_COUNTS] [-nc NORM_COUNTS] [-uc RULE_COUNTS] [-ss STAT] [-dge COMPARISON [COMPARISON ...]] [-len 5P_LEN [5P_LEN ...]] [-o PREFIX] [-pv VALUE] - [-s MPLSTYLE] [-v] [-ldi VALUE] [-lda VALUE] [-una LABEL] - [-unk LABEL] [-ic CLASS [CLASS ...] | -ec CLASS [CLASS ...]] - -p PLOT [PLOT ...] + [-s MPLSTYLE] [-v] [-ldi VALUE] [-lda VALUE] [-dgi VALUE] + [-dga VALUE] [-una LABEL] [-unk LABEL] [-ic CLASS [CLASS ...] + | -ec CLASS [CLASS ...]] -p PLOT [PLOT ...] This script produces basic static plots for publication as part of the tinyRNA workflow. 
@@ -317,6 +325,10 @@ Optional arguments: len_dist plots will start at this value -lda VALUE, --len-dist-max VALUE len_dist plots will end at this value + -dgi VALUE, --dge-min VALUE + scatter_by_dge plots will start at this log2 value + -dga VALUE, --dge-max VALUE + scatter_by_dge plots will end at this log2 value -una LABEL, --unassigned-class LABEL Use this label in class-related plots for unassigned counts diff --git a/doc/tiny-plot.md b/doc/tiny-plot.md index 62a027fd7..551f52af 100644 --- a/doc/tiny-plot.md +++ b/doc/tiny-plot.md @@ -102,9 +102,14 @@ Differential gene expression between sample groups can be visualized with this p sample_avg_scatter_by_dge

-#### Customization -The P value cutoff can be changed using the [Run Config or commandline arguments](Parameters.md#p-value). The control condition is plotted on the x-axis, but it must be specified in your Samples Sheet prior to running an end-to-end or `tiny recount` analysis. If using `tiny replot`, is not possible to change a no-control experiment to a control experiment and have these changes reflected in these plots. This is because tiny-deseq.r must be aware of the control condition in order to perform the proper directional comparisons. +#### P value Threshold +The P value cutoff [can be changed](Parameters.md#p-value) (default: 0.05). +#### Control Conditions +The control condition is plotted on the x-axis, but it must be specified in your Samples Sheet prior to running an end-to-end or `tiny recount` analysis. If using `tiny replot`, it is not possible to change a no-control experiment to a control experiment and have these changes reflected in these plots. This is because tiny-deseq.r must be aware of the control condition in order to perform the proper directional comparisons. + +#### View Limits +Both the lower and upper bounds of the plot's axes [can be set manually](Parameters.md#bounds-for-scatter_by_dge-plots). Unspecified bounds are automatically calculated to fit the data. 
diff --git a/setup.py b/setup.py index f1bbc1b1..654cc8ff 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ AUTHOR = 'Kristen Brown, Alex Tate' PLATFORM = 'Unix' REQUIRES_PYTHON = '>=3.9.0' -VERSION = '1.2.1' +VERSION = '1.2.2' REQUIRED = [] # Required packages are installed via Conda's environment.yml diff --git a/tests/testdata/config_files/run_config_template.yml b/tests/testdata/config_files/run_config_template.yml index b59dee56..abcf1d4c 100644 --- a/tests/testdata/config_files/run_config_template.yml +++ b/tests/testdata/config_files/run_config_template.yml @@ -285,12 +285,21 @@ plot_vector_points: False plot_len_dist_min: plot_len_dist_max: +##-- Optionally set the log2 min and/or max view limits for scatter_by_dge plots; auto if unset --## +plot_dge_scatter_min: +plot_dge_scatter_max: + ##-- Use this label in class plots for counts assigned by rules lacking a classifier --## plot_unknown_class: "_UNKNOWN_" ##-- Use this label in class plots for unassigned counts --## plot_unassigned_class: "_UNASSIGNED_" +##-- Optionally filter the classes in class scatter plots --## +plot_class_scatter_filter: + style: include # Choose: include or exclude + classes: [] # Add classes between [ and ], separated by comma + ######----------------------------- OUTPUT DIRECTORIES ------------------------------###### # @@ -317,7 +326,7 @@ dir_name_plotter: plots # ########################################################################################### -version: 1.2 +version: 1.2.2 ######--------------------------- DERIVED FROM PATHS FILE ---------------------------###### # @@ -367,4 +376,17 @@ run_deseq: True # # The following configuration settings are automatically derived from the Features Sheet # -######-------------------------------------------------------------------------------###### \ No newline at end of file +######-------------------------------------------------------------------------------###### + + + +######--------------------------- DERIVED FROM RUN 
CONFIG ---------------------------###### +# +# The following configuration settings are automatically derived from this file +# +######-------------------------------------------------------------------------------###### + +##-- Utilized by tiny-plot --## +# Filters for class scatter plots +plot_class_scatter_filter_include: [] +plot_class_scatter_filter_exclude: [] \ No newline at end of file diff --git a/tiny/cwl/tools/tiny-plot.cwl b/tiny/cwl/tools/tiny-plot.cwl index 1edcc7a0..364e2e93 100644 --- a/tiny/cwl/tools/tiny-plot.cwl +++ b/tiny/cwl/tools/tiny-plot.cwl @@ -74,6 +74,18 @@ inputs: prefix: -lda doc: "The last length to plot in the range for len_dist plots" + dge_min: + type: double? + inputBinding: + prefix: -dgi + doc: "The log2 lower view limit in DGE scatter plots" + + dge_max: + type: double? + inputBinding: + prefix: -dga + doc: "The log2 upper view limit in DGE scatter plots" + unknown_class_label: type: string? inputBinding: diff --git a/tiny/cwl/workflows/tinyrna_wf.cwl b/tiny/cwl/workflows/tinyrna_wf.cwl index ae5be1af..ce7ee91d 100644 --- a/tiny/cwl/workflows/tinyrna_wf.cwl +++ b/tiny/cwl/workflows/tinyrna_wf.cwl @@ -99,6 +99,8 @@ inputs: plot_vector_points: boolean? plot_len_dist_min: int? plot_len_dist_max: int? + plot_dge_scatter_min: double? + plot_dge_scatter_max: double? plot_style_sheet: File? plot_pval: float? plot_unknown_class: string? @@ -258,6 +260,8 @@ steps: pickValue: all_non_null valueFrom: | $(self.length ? 
self[0] : null) + dge_min: plot_dge_scatter_min + dge_max: plot_dge_scatter_max unknown_class_label: plot_unknown_class unassigned_class_label: plot_unassigned_class classes_include: plot_class_scatter_filter_include diff --git a/tiny/rna/plotter.py b/tiny/rna/plotter.py index 2ff72db2..e0bd13bd 100644 --- a/tiny/rna/plotter.py +++ b/tiny/rna/plotter.py @@ -65,6 +65,10 @@ def get_args(): help='len_dist plots will start at this value') optional_args.add_argument('-lda', '--len-dist-max', metavar='VALUE', type=int, help='len_dist plots will end at this value') + optional_args.add_argument('-dgi', '--dge-min', metavar='VALUE', type=float, + help='scatter_by_dge plots will start at this log2 value') + optional_args.add_argument('-dga', '--dge-max', metavar='VALUE', type=float, + help='scatter_by_dge plots will end at this log2 value') optional_args.add_argument('-una', '--unassigned-class', metavar='LABEL', default='_UNASSIGNED_', help='Use this label in class-related plots for unassigned counts'), optional_args.add_argument('-unk', '--unknown-class', metavar='LABEL', default='_UNKNOWN_', @@ -163,7 +167,7 @@ def get_len_dist_dict(files_list: list) -> DefaultDict[str, Dict[str, pd.DataFra # File does not appear to have been produced by the pipeline condition_and_rep = basename - subtype = "Assigned" if "assigned" in condition_and_rep else "Mapped" + subtype = "assigned" if "assigned" in condition_and_rep else "mapped" matrices[subtype][condition_and_rep] = pd.read_csv(file, index_col=0) return matrices @@ -186,7 +190,7 @@ def class_charts(raw_class_counts: pd.DataFrame, mapped_reads: pd.Series, out_pr for library in raw_class_counts: chart = aqplt.barh_proportion(class_props[library], max_prop, scale, **kwargs) - chart.set_title("Percentage of Small RNAs by Class") + chart.set_title("Percentage of small RNAs by class") chart.set_ylabel("Class") # Save the plot @@ -214,7 +218,7 @@ def rule_charts(rule_counts: pd.DataFrame, out_prefix: str, scale=2, **kwargs): for 
library, prop_df in rule_props.items(): chart = aqplt.barh_proportion(prop_df, max_prop, scale, **kwargs) - chart.set_title("Percentage of Small RNAs by Matched Rule") + chart.set_title("Percentage of small RNAs by matched rule") chart.set_ylabel("Rule") # Save the plot @@ -405,7 +409,7 @@ def scatter_by_dge_class(counts_avg_df, dges, output_prefix, view_lims, include= aqplt.set_dge_class_legend_style() for pair in dges: - ut, tr = pair.split("_vs_") # untreated, treated + tr, ut = pair.split("_vs_") # treated, untreated dge_classes = dges[dges[pair] < pval].groupby(level=1).groups labels, grp_args = zip(*dge_classes.items()) if dge_classes else ((), ()) @@ -442,7 +446,7 @@ def scatter_by_dge(counts_avg_df, dges, output_prefix, view_lims, pval=0.05): for pair in dges: grp_args = dges.index[dges[pair] < pval] - ut, tr = pair.split("_vs_") # untreated, treated + tr, ut = pair.split("_vs_") # treated, untreated labels = ['p < %g' % pval] if not grp_args.empty else [] colors = aqplt.assign_class_colors(labels) @@ -615,7 +619,7 @@ def setup(args: argparse.Namespace) -> dict: 'sample_rep_dict': lambda: get_sample_rep_dict(fetched["norm_counts_df"]), 'norm_counts_avg_df': lambda: get_sample_averages(fetched["norm_counts_df"], fetched["sample_rep_dict"]), 'class_counts_df': lambda: get_class_counts(fetched["raw_counts_df"]), - 'avg_view_lims': lambda: aqplt.get_scatter_view_lims(fetched["norm_counts_avg_df"]), + 'avg_view_lims': lambda: aqplt.get_scatter_view_lims(fetched["norm_counts_avg_df"], args.dge_min, args.dge_max), 'norm_view_lims': lambda: aqplt.get_scatter_view_lims(fetched["norm_counts_df"].select_dtypes(['number'])) } diff --git a/tiny/rna/plotterlib.py b/tiny/rna/plotterlib.py index e5afb74e..b3def66a 100644 --- a/tiny/rna/plotterlib.py +++ b/tiny/rna/plotterlib.py @@ -77,9 +77,9 @@ def len_dist_bar(self, size_prop: pd.DataFrame, subtype: str, **kwargs) -> plt.A sizeb = size_prop.plot(kind='bar', stacked=True, reuse_plot=True, **kwargs) 
sizeb.tick_params(axis='x', labelsize=font_size, rotation=0) sizeb.set_ylim(0, np.max(np.sum(size_prop, axis=1)) + 0.025) - sizeb.set_title(f'Distribution of {subtype} Reads') - sizeb.set_ylabel('Proportion of Reads') - sizeb.set_xlabel('Length of Sequence') + sizeb.set_title(f'Distribution of {subtype} reads') + sizeb.set_ylabel('Proportion of reads') + sizeb.set_xlabel('Length of sequence') return sizeb @@ -134,7 +134,7 @@ def barh_proportion(self, prop_ds: pd.Series, max_prop=1.0, scale=2, **kwargs) - # Create the plot and set plot attributes cbar = (prop_ds * 100).plot(kind='barh', ax=ax, color=bar_colors, sort_columns=False, **kwargs) cbar.xaxis.set_major_formatter(tix.PercentFormatter()) - cbar.set_xlabel('Percentage of Reads') + cbar.set_xlabel('Percentage of reads') cbar.set_xlim(0, min([(max_prop * 100) + 10, 100])) # Remove irrelevant plot attributes @@ -321,28 +321,54 @@ def set_dge_class_legend_style(self): scatter.set_position(orig_axes_pos.transformed(transFigure.inverted())) @staticmethod - def get_scatter_view_lims(counts_df: pd.DataFrame) -> Tuple[float, float]: - """Calculates scatter view limits for the counts dataframe""" + def get_scatter_view_lims(counts_df: pd.DataFrame, vmin: int = None, vmax: int = None) -> Tuple[float, float]: + """Calculates scatter view limits for the counts dataframe - x0 = counts_df.min(axis='columns').where(lambda x: x != 0).dropna().min() + Args: + counts_df: A pandas dataframe of counts per feature + vmin: Optional log2 minimum view limit + vmax: Optional log2 maximum view limit + """ + + # For transforming values to/from log2 scale + transform = LogTransform(base=2) + inverse_trans = transform.inverted() + + # User-specified min & max, no calculation necessary + if (vmin, vmax) != (None, None): + return inverse_trans.transform([vmin, vmax]) + + # Obtain the minimum and maximum counts from the counts dataframe + x0 = counts_df.replace(0, pd.NA).min(axis="columns").dropna().min() x1 = counts_df.max().max() minpos = 
1e-300 if not np.isfinite([x0, x1]).all() or not isinstance(x0, np.float) or x1 <= 0: - print("The provided dataset contains invalid values.") + print("The provided dataset contains invalid values.", file=sys.stderr) return (minpos, minpos) + # Avoid log2(0) errors x0, x1 = (minpos if x0 <= 0 else x0, minpos if x1 <= 0 else x1) - transform = LogTransform(base=2) - inverse_trans = transform.inverted() + # Get axes margin preferences from stylesheet + rc_mar = {mpl.rcParams.get(f"axes.{m}", 0) + for m in ('xmargin', 'ymargin')} + + margin = max(rc_mar) + if len(rc_mar) != 1: + print("Stylesheet values for axes.xmargin and axes.ymargin differ. " + "The larger value will be chosen for the scatter plot margin.", + file=sys.stderr) + # Calculate plot margins x0t, x1t = transform.transform([x0, x1]) - delta = (x1t - x0t) * mpl.rcParams.get('axes.xmargin', 0) + delta = (x1t - x0t) * margin if not np.isfinite(delta): delta = 0 - return inverse_trans.transform([x0t - delta, x1t + delta]) + if vmin is None: vmin = x0t - delta + if vmax is None: vmax = x1t + delta + return inverse_trans.transform([vmin, vmax]) @staticmethod def set_square_scatter_view_lims(ax: plt.Axes, min_max=None): @@ -392,7 +418,7 @@ def set_scatter_ticks(self, ax: plt.Axes, minor_ticks=False): """Intelligently creates major and minor ticks for a square scatter plot while avoiding crowding""" # Get tick locations corresponding to the current view limits - major_locs, ax_min, ax_max = self.get_fixed_majorticklocs(ax.viewLim.bounds) + major_locs, ax_min, ax_max = self.get_fixed_majorticklocs(ax.viewLim.extents) ax.xaxis.set_major_locator(tix.FixedLocator(major_locs)) ax.yaxis.set_major_locator(tix.FixedLocator(major_locs)) @@ -403,7 +429,7 @@ def set_scatter_ticks(self, ax: plt.Axes, minor_ticks=False): for axis in [ax.xaxis, ax.yaxis]: # Only display every nth major tick label - n = int(np.log2(len(major_locs)) - 1) + n = int(np.log2(len(major_locs)) - 1) or 1 ticks_displayed, last_idx = 
self.every_nth_label(axis, n) if minor_ticks: diff --git a/tiny/rna/tiny-deseq.r b/tiny/rna/tiny-deseq.r index 2960239a..0f98aa3e 100755 --- a/tiny/rna/tiny-deseq.r +++ b/tiny/rna/tiny-deseq.r @@ -211,8 +211,8 @@ for (i in seq_len(nrow(all_comparisons))){ result_df <- df_with_metadata(data.frame(deseq_res[order(deseq_res$padj),])) # Resolve original condition names for use in output filename - cond1 <- sampleConditions[[comparison[1]]] - cond2 <- sampleConditions[[comparison[2]]] + cond1 <- sampleConditions[[comparison[2]]] + cond2 <- sampleConditions[[comparison[1]]] write_dge_table(result_df, cond1, cond2) if (!has_control){ diff --git a/tiny/templates/run_config_template.yml b/tiny/templates/run_config_template.yml index 272c0551..c1e239d5 100644 --- a/tiny/templates/run_config_template.yml +++ b/tiny/templates/run_config_template.yml @@ -285,6 +285,10 @@ plot_vector_points: False plot_len_dist_min: plot_len_dist_max: +##-- Optionally set the log2 min and/or max view limits for scatter_by_dge plots; auto if unset --## +plot_dge_scatter_min: +plot_dge_scatter_max: + ##-- Use this label in class plots for counts assigned by rules lacking a classifier --## plot_unknown_class: "_UNKNOWN_" @@ -322,7 +326,7 @@ dir_name_plotter: plots # ########################################################################################### -version: 1.2.1 +version: 1.2.2 ######--------------------------- DERIVED FROM PATHS FILE ---------------------------###### #