From 859bd1126ea1e507fb1e3ed5494767912221e7b6 Mon Sep 17 00:00:00 2001 From: YoungKrug <31873199+YoungKrug@users.noreply.github.com> Date: Thu, 8 Aug 2024 13:21:36 -0400 Subject: [PATCH 1/5] Squashed commit of the following: commit cb2fed761b2b92336b585aec7a8a984249a9cbc9 Author: Allison R Mason <31289802+amason30@users.noreply.github.com> Date: Thu Aug 8 13:13:46 2024 -0400 Release articles (#19) * Add functionality from PR 16 (#17) * Created RMD check full for our R package. * Create test-coverage.yml * Create lintr.yml * Release articles (#10) * set up filter article * add basic readme - consider adding background from original program manual * design logo? * switch article to quarto * add functionality for extracting a single filter summary * add check for technical replicates * add documentation for filter_summary * convert prints to cli and add checks for filter functions * add functionality to return a data.table of similar ions for mispicked workflow * add full draft filter article * update vignette builder * add usethis use_articles created .gitignore * initalize website with pkgdown * Relase bug fixes (#8) (#9) * Added functionality for multiple group filters. * Copy functionality added! * Custom print statement added! * RCMD check works! * Fixed incorrect summary after differnt filter runs Co-authored-by: GregJohnson <31873199+YoungKrug@users.noreply.github.com> * fix typo to pass RCMD * fix partial group match warning in tests * remove non-exported function documentation * remove non-exported function documentation * add formatted website * demostrate copy and print functionality * add functionality to extract group averages table) * show step by step tree map * update text * return peak table, not a print of peak table * add accessor for cv values, error if cv filter has not been run failing * update accessor names * fix cv accessor test fail by changing name * fix typos * store raw peak table * remove man pages after changing names * set print inside class * Modified the constructor to allow for data.frames and data.tables to be passed through. * get_group_averages should return group averages of the filtered table at any stage * update variable names to be inclusive of data frame as input * add cv plot * show cv plot of cv filtered data only * format cv threshold plot for the article * add option for merging method, all tests pass * Add new method parameter to documentation * Require users to choose a cv_threshold * add new accessors to website * update roxygen version * create an example dataset that has both solvent and media blanks to show filtering both * Add in new example dataset, update filters, and show mutliple group filters * add downstream analysis article * add get_cv_data reference * add articles ignore * figure out how to order articles - does not work * update todo --------- Co-authored-by: GregJohnson <31873199+YoungKrug@users.noreply.github.com> Co-authored-by: GregJohnson * fix documentation errors in R CMD - check and all tests pass * ensure articles build * add vignette builder * Revert "fix documentation errors in R CMD - check and all tests pass" This reverts commit f40cd276f43874162d8cbad9d251ab7221afd422. * changes to pass check * fix documentation warnings * no vignettes to build * add line ending to fix incomplete line warning * explain reference semantics sooner to introduce copy_object * run site through github actions * add in website formatting * see if quarto dependency fixes pkgdown namespace error * add quarto depends to website only * removing quarto from suggets cuases error in assigning vignetter builder * Update pkgdown.yaml add quarto to actions * add lines to solve endline error * Update DESCRIPTION * Update pkgdown.yaml, removed white spaces * load library, not load_all * Update _pkgdown.yml * article packages to website needs * Update pkgdown.yaml * dont use vignette builder - render as html * Create pkgdown_quarto.yml * run with default build_site * Update pkgdown_quarto.yml * Update pkgdown_quarto.yml * Update pkgdown_quarto.yml * Update pkgdown_quarto.yml * Update pkgdown_quarto.yml * go back to quarto vignette * Pkgdown Fixes (#12) * Created RMD check full for our R package. * Create test-coverage.yml * Create lintr.yml * explain reference semantics sooner to introduce copy_object * run site through github actions * add in website formatting * see if quarto dependency fixes pkgdown namespace error * add quarto depends to website only * removing quarto from suggets cuases error in assigning vignetter builder * Update pkgdown.yaml add quarto to actions * add lines to solve endline error * Update DESCRIPTION * Update pkgdown.yaml, removed white spaces * load library, not load_all * Update _pkgdown.yml * article packages to website needs * Update pkgdown.yaml * dont use vignette builder - render as html * Create pkgdown_quarto.yml * run with default build_site * Update pkgdown_quarto.yml * Update pkgdown_quarto.yml * Update pkgdown_quarto.yml * Update pkgdown_quarto.yml * Update pkgdown_quarto.yml * go back to quarto vignette --------- Co-authored-by: Allison R Mason <31289802+amason30@users.noreply.github.com> Co-authored-by: amason30 * Prep for release (#14) * Created RMD check full for our R package. * Create test-coverage.yml * Create lintr.yml * explain reference semantics sooner to introduce copy_object * run site through github actions * add in website formatting * see if quarto dependency fixes pkgdown namespace error * add quarto depends to website only * removing quarto from suggets cuases error in assigning vignetter builder * Update pkgdown.yaml add quarto to actions * add lines to solve endline error * Update DESCRIPTION * Update pkgdown.yaml, removed white spaces * load library, not load_all * Update _pkgdown.yml * article packages to website needs * Update pkgdown.yaml * dont use vignette builder - render as html * Create pkgdown_quarto.yml * run with default build_site * Update pkgdown_quarto.yml * Update pkgdown_quarto.yml * Update pkgdown_quarto.yml * Update pkgdown_quarto.yml * Update pkgdown_quarto.yml * go back to quarto vignette * update todo * start formatter functions for mpactr class construction - tests pass, need to add to the import and constructor functions and test * Allow users to submit feature tables exported from different tools - need to fix global bindings warning for formatter script Co-authored-by: GregJohnson * update TODO * Fix for formatter issues! * Alpha numeric fix * add checks for expected meta_data columns Co-authored-by: GregJohnson * reorganize article to explain the example function sooner * remove notes * test coverage is 100 % for all main scripts * reduce size of example metabpscape df - no more CMD note * fix order of articles on website --------- Co-authored-by: Allison R Mason <31289802+amason30@users.noreply.github.com> Co-authored-by: amason30 Co-authored-by: GregJohnson * Fix for Ubuntu failing test. * does adding project help? * Update pkgdown.yaml most recent example with quarto added * add project file * change qmd to rmd for pkgdown github actions website * Fix for r-cmd checks * Set mimimum R depend for base pipe * Update r.yml * Added a new lintr workflow * Update lintr.yml * Linted, styled the package! * Lintr (#16) * lint and style R dir - need to fix complexity error * lint and style raw data scripts - no errors * update name to snakecase * update manual pages after lint and styling * Fix small typos in filters documentation * Test have been linted! * Revert "Merge branch 'lintr' of https://github.com/SchlossLab/mpactR into lintr" This reverts commit 83e6f3e7c636c9e6254c396842d7e724958662a9, reversing changes made to 318927f66ed22048a57b3555c26b3a0a8a4073f5. * fix typos in articles an lint/style * add back typo fixes in filters documentation * fix character limit lintr issue * add an informative error message a filter has already been applied to input data object * add a designated reference semantics article * add copy_object explaination in filters details * fixed code to resolve knitting error * Fix for cyclomatic complexity. * Add in chemistry documentation suggestions * We don't recommend setting bounds to account for 0's in the table * add expanded section for R6 specifics * Link filter article on main page * be consistent with '.' in documentation * Fix typos and add in suggestions * allow https url for data input --------- Co-authored-by: YoungKrug <31873199+YoungKrug@users.noreply.github.com> --------- Co-authored-by: GregJohnson <31873199+YoungKrug@users.noreply.github.com> Co-authored-by: GregJohnson * remove quarto articles * add filter article with original mpact dataset * remove numeric name requirement and add informative message for group summary * check passed_ions as well, not failed_ions twice * use original mpact data for filter article * update readme with me article link --------- Co-authored-by: GregJohnson <31873199+YoungKrug@users.noreply.github.com> Co-authored-by: GregJohnson --- R/filter_pactr-methods.R | 11 +- R/summary-class.R | 3 +- README.Rmd | 5 +- README.md | 5 +- _pkgdown.yml | 2 +- vignettes/articles/downstream_analyses.qmd | 780 --------------------- vignettes/articles/filter.qmd | 392 ----------- vignettes/articles/filter2.Rmd | 464 ++++++++++++ 8 files changed, 481 insertions(+), 1181 deletions(-) delete mode 100644 vignettes/articles/downstream_analyses.qmd delete mode 100644 vignettes/articles/filter.qmd create mode 100644 vignettes/articles/filter2.Rmd diff --git a/R/filter_pactr-methods.R b/R/filter_pactr-methods.R index 780de8c..8801c14 100644 --- a/R/filter_pactr-methods.R +++ b/R/filter_pactr-methods.R @@ -215,10 +215,17 @@ filter_pactr$set( } ) + # Given a group name, removes flagged ions from the peak table. filter_pactr$set( "public", "apply_group_filter", function(group, remove_ions = TRUE) { + groups <- unique(self$mpactr_data$get_meta_data()$Biological_Group) + if (isFALSE(group %in% groups)) { + cli::cli_abort("{.var group} {group} is not in {.var Biological_Group}. + Options are: {groups}") + } + cli::cli_alert_info("Parsing {nrow(self$mpactr_data$get_peak_table())} peaks based on the following sample group: {group}.") @@ -239,7 +246,7 @@ filter_pactr$set( self$logger$list_of_summaries[[paste0("group-", group)]] <- summary$new( filter = group, - failed_ions = as.numeric(ions), + failed_ions = ions, passed_ions = self$mpactr_data$get_peak_table()$Compound ) @@ -362,8 +369,6 @@ filter_pactr$set( cut_tree <- stats::cutree(cluster, h = 1 - cluster_threshold) x <- as.data.table(cut_tree, keep.rownames = "Compound")[ - , Compound := as.numeric(Compound) - ][ group_1, on = .(Compound = Compound) ][ diff --git a/R/summary-class.R b/R/summary-class.R index d8341df..b53a163 100644 --- a/R/summary-class.R +++ b/R/summary-class.R @@ -3,7 +3,8 @@ summary <- R6::R6Class("summary", initialize = function(filter, failed_ions, passed_ions) { stopifnot(any(class(filter) == "character")) stopifnot(any(class(failed_ions) == c("numeric", "character"))) - stopifnot(any(class(failed_ions) == c("numeric", "character"))) + stopifnot(any(class(passed_ions) == c("numeric", "character"))) + private$filter <- filter private$failed_ions <- failed_ions private$passed_ions <- passed_ions diff --git a/README.Rmd b/README.Rmd index 781a871..584bca4 100644 --- a/README.Rmd +++ b/README.Rmd @@ -29,7 +29,7 @@ Filters in this package address the following issues: - `filter_cv()`: removal of non-reproducible features, or those that are inconsistent between technical replicates. - `filter_insource_ions()`: removal of fragment ions created during the first ionization in the tandem MS/MS workflow. -All filters are independent, meaning they can be used to create a project-specific workflow, or you can learn more in **cite vignette**. +All filters are independent, meaning they can be used to create a project-specific workflow, or you can learn more in [the Filter vignette](articles/filter2.html). ## Installation @@ -42,7 +42,8 @@ devtools::install_github("SchlossLab/mpactR") ## Get started -See [the Filter vignette](articles/filter.html) to get started. +See [the Filter vignette](articles/filter2.html) to get started. + ## Getting help diff --git a/README.md b/README.md index b00ff5c..01050fc 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,8 @@ Filters in this package address the following issues: first ionization in the tandem MS/MS workflow. All filters are independent, meaning they can be used to create a -project-specific workflow, or you can learn more in **cite vignette**. +project-specific workflow, or you can learn more in [the Filter +vignette](articles/filter2.html). ## Installation @@ -39,7 +40,7 @@ devtools::install_github("SchlossLab/mpactR") ## Get started -See [the Filter vignette](articles/filter.html) to get started. +See [the Filter vignette](articles/filter2.html) to get started. ## Getting help diff --git a/_pkgdown.yml b/_pkgdown.yml index 1f66694..5695068 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -49,7 +49,7 @@ navbar: - text: Reference Semantics href: articles/reference_semantics.html - text: Filter - href: articles/filter.html + href: articles/filter2.html - text: Additional Analyses href: articles/downstream_analyses.html right: diff --git a/vignettes/articles/downstream_analyses.qmd b/vignettes/articles/downstream_analyses.qmd deleted file mode 100644 index 71b3d5a..0000000 --- a/vignettes/articles/downstream_analyses.qmd +++ /dev/null @@ -1,780 +0,0 @@ ---- -title: "Downstream Analyses" -vignette: > - %\VignetteIndexEntry{Downstream Analyses} - %\VignetteEngine{quarto::html} - %\VignetteEncoding{UTF-8} ---- - -The goal of mpactR is to correct for errors that occur during the pre-processing of raw tandem MS/MS data. Once filtering in complete, you should have a feature table containing high quality ms1 features. This table can be used for a variety of analyses that can be conducted in R to better understand if and how samples differ from one another. - -In this article, we will highlight just a few analyses. This is not an exhaustive list, and there are certainly other ways to conduct the analyses show below - the beauty of R! We will walk you through how to do the following: - -- creating an interactive plot of input features and the filters they failed, if any, using `ggplot` and `plotly` -- correlating sample profiles at mutliple levels (sample, technical replicates, and biological replicates) using `Hmisc` and `corrplot` -- visualizing sample clustering as a dendrogram with `ggdendro` -- differential abundance analysis: - * calculating fold change between two biological groups - * visualizing fold change, m/z, and retention time as a 3D scatter plot - * conducting t-tests for fold change differences, calculating p-values, and calculating q-values for false discovery rate (FDR) correction - * visualizing significant fold change differences in a volcano plot - -```{r} -#| include: false -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>" -) -``` - -# Load required libraries - -We will be using multiple libraries for data analysis and visualization, which can be loaded below. If you do not have some or all of the packages listed, you can install them with `install.packages("packagename")`. - -```{r setup} -#| output: false -# library(mpactR) -devtools::load_all() -library(viridis) -library(plotly) -library(data.table) -library(tidyverse) -library(Hmisc) -library(corrplot) -library(ggdendro) -library(ggtext) -``` - -# Filter ms1 feature table with `mpactR` - -```{r} -data <- import_data(example("cultures_peak_table.csv"), - example("cultures_metadata.csv")) -``` - -We will be using example data in the `mpactR` package, which has `r nrow(get_meta_data(data))` samples and `r length(unique(get_meta_data(data)$Biological_Group))` biological groups. Biological groups are: `r unique(get_meta_data(data)$Biological_Group)`. - -We will check our feature table for mispicked ions, remove solvent blank and media blanks features with an abundance > 0.01, relative to other groups, and check ions for replicability. - -```{r} -data_filtered <- data |> - filter_mispicked_ions(merge_peaks = TRUE, merge_method = "sum") |> - filter_group(group_to_remove = "Solvent_Blank") |> - filter_group(group_to_remove = "Media") |> - filter_cv(cv_threshold = 0.2, cv_param = "median") -``` - -Overall, `r nrow(get_peak_table(data_filtered))` ions remain in the feature table. A summary of the filtering, as a tree map is below: - -```{r} -plot_qc_tree(data_filtered) -``` - -# Interactive scatter plot of all input ions an their fate during filtering - -Visualizing each compound by m/z and retention time, and their fate during filtering may be useful to see if filters are removing features at certain retention time or m/z ranges. To create this plot, we first need to extract the raw data table, which has all pre-filtered ions (including m/z and retention time). We can do this with the mpactR function `get_raw_data()`, and then select the Compound, mz, and rt columns. - -```{r} -get_raw_data(data_filtered) %>% - select(Compound, mz, rt) %>% - head() -``` - -Next, we need to extract the ion status with `mpactR` function `qc_summary()`. This function returns a `data.table` reporting the ion status for each input ion. This includes which filter each ion failed, or passing, if the ion passed all applied filters. - -```{r} -qc_summary(data_filtered) %>% - head() -``` - -We can join these two `data.table`s for plotting data set: - -```{r} -get_raw_data(data_filtered) %>% - select(Compound, mz, rt) %>% - left_join(qc_summary(data_filtered), - by = join_by("Compound" == "compounds")) %>% - head() -``` - -Now we can create a scatter plot to show the input features (m/z ~ retention time) and their fate (status) using `ggolot` and `geom_point`. - -```{r} -get_raw_data(data_filtered) %>% - select(Compound, mz, rt) %>% - left_join(qc_summary(data_filtered), - by = join_by("Compound" == "compounds")) %>% - ggplot() + - aes(x = rt, y = mz, color = status) + - geom_point() + - viridis::scale_color_viridis(discrete = TRUE) + - labs(x = "Retention time (min)", - y = "m/z", - color = "Ion Status")+ - theme_bw() + - theme(legend.position = "top") -``` - -We can also make the plot interactive with the `plotly` pacakge function `ggplotly`. - -```{r} -feature_plot <- get_raw_data(data_filtered) %>% - select(Compound, mz, rt) %>% - left_join(qc_summary(data_filtered), - by = join_by("Compound" == "compounds")) %>% - ggplot() + - aes(x = rt, y = mz, color = status, - text = paste0("Compound: ", Compound)) + - geom_point() + - viridis::scale_color_viridis(discrete = TRUE) + - labs(x = "Retention time (min)", - y = "m/z", - color = "Ion Status")+ - theme_bw() + - theme(legend.position = "top") - -ggplotly(feature_plot) -``` - - -# Correlation between samples at different levels - -We may be interested in how individual samples, or biological group sample correlate to each other. For this analysis, we will need to extract the filtered feature table with `mpactR` function `get_peak_table()`. This function will return the filtered `data.table` where rows are features and columns are either compound metadata or intensity values for samples. - -```{r} -ft <- get_peak_table(data_filtered) - -ft[1:5, 1:7] -``` - -### Correlation between each individual samples - -First, we can look at the correlation between each individual sample, regadless of technical replicates and biological groups. This will show how well technical and biological replicates correlate, where we are expecting to see near perfect correlation replicates. - -We will use `Hmisc::rcorr()` to perform spearman correlations. This function is expecting a `matrix` where rows are samples to correlate and rows are features. Since the feature table has columns for both feature metadata and samples, we need to remove feature metadata for `rcorr()`. We can do this by extracting sample names from the metadata `Injection` column, which match sample column names in the feature table. We also need to remove any samples that no longer have any features post-filtering (likely solvent blanks). - -```{r} -counts <- ft %>% - select(Compound, all_of(get_meta_data(data_filtered)$Injection)) %>% - column_to_rownames(var = "Compound") %>% - select(where( ~ sum(.x) != 0)) - -counts[1:5, 1:2] -``` - -Next we can run the correlation: - -```{r} -counts_cor <- rcorr(as.matrix(counts), type = "spearman") -``` - -The `counts_cor` object is a list of data for the correlations. The correlation coefficients are stored in the `r` slot, and p-values in the `p` slot. We can see the correlations for first sample below - -```{r} -counts_cor$r[ , 1] -``` - -Finally, we can plot the correlations map with `corrplot`. Style options can be customized to your liking (see [corrplot](https://cran.r-project.org/web/packages/corrplot/vignettes/corrplot-intro.html)). - -```{r} -#| warning: false - -corrplot(counts_cor$r, - type = "lower", - method = "square", - order = "hclust", - col = COL2('BrBG', 10), - tl.col = 'black', - tl.cex = .5) -``` - -### Correlation between technical replicates - -Next, we can look at the correlation between technical replicates to see how they compare across the dataset. For this, we will calculate the average intensity for each compound across technical replicates and run a correlation in the same manner shown above. - -In the original `counts` table, we set the row names from the column `Compound`, which holds the compound id. Here we need to reset out `Compound` column and pivot the data table for calculating averages. - -```{r} -meta <- get_meta_data(data_filtered) - -counts %>% - rownames_to_column(var = "Compound") %>% - pivot_longer(cols = starts_with("102623"), - names_to = "Injection", - values_to = "Intensity") %>% - head() -``` - -Next, we will join with sample meta data so we can cacluate averages by `Sample_Code`. - -```{r} -counts %>% - rownames_to_column(var = "Compound") %>% - pivot_longer(cols = starts_with("102623"), - names_to = "Injection", - values_to = "Intensity") %>% - left_join(meta, by = "Injection") %>% - head() -``` - -Now we can calculate mean intensity for each `Compound` and `Sample_Code`. - -```{r} -counts %>% - rownames_to_column(var = "Compound") %>% - pivot_longer(cols = starts_with("102623"), - names_to = "Injection", - values_to = "Intensity") %>% - left_join(meta, by = "Injection") %>% - summarise(mean_intensity = mean(Intensity), - .by = c(Compound, Sample_Code)) %>% - head() -``` - -Finally, we need to reformat the table so columns are sample codes for the correlation. - -```{r} -#| warning: false - -sample_counts <- counts %>% - rownames_to_column(var = "Compound") %>% - pivot_longer(cols = starts_with("102623"), - names_to = "Injection", - values_to = "Intensity") %>% - left_join(meta, by = "Injection") %>% - summarise(mean_intensity = mean(Intensity), - .by = c(Compound, Sample_Code)) %>% - pivot_wider(names_from = Sample_Code, - values_from = mean_intensity) %>% - column_to_rownames(var = "Compound") - -sample_counts[1:5, 1:5] -``` - -Run the correlation: - -```{r} -sample_counts_cor <- rcorr(as.matrix(sample_counts), type = "spearman") -``` - -And finally visualize: - -```{r} -corrplot(sample_counts_cor$r, - type = "lower", - method = "square", - order = "alphabet", - col = COL2('BrBG', 10), - tl.col = 'black') -``` - - -### Correlation between biological replicates - -We can also look at the correlation between biological groups. In this dataset, technical replicates and biological groups are the same, however if you have multiple technical replicates you may also be interested in the correlation between biological groups. We will calculate mean intensity values for `Biological_Group` in the same manner as we did for `Sample_Code`. - -```{r} -#| warning: false - -group_counts <- counts %>% - rownames_to_column(var = "Compound") %>% - pivot_longer(cols = starts_with("102623"), - names_to = "Injection", - values_to = "Intensity") %>% - left_join(meta, by = "Injection") %>% - summarise(mean_intensity = mean(Intensity), - .by = c(Compound, Biological_Group)) %>% - pivot_wider(names_from = Biological_Group, - values_from = mean_intensity) %>% - column_to_rownames(var = "Compound") -``` - - -Run the correlation analysis with `rorr()`: - -```{r} -group_counts_cor <- rcorr(as.matrix(group_counts), type = "spearman") -``` - -Visualize correlation: - -```{r} -corrplot(group_counts_cor$r, - type = "lower", - method = "square", - order = "alphabet", - col = COL2('BrBG', 10), - tl.col = 'black') -``` - -As expected, the biological group correlation matrix matches the tehcnical replicates correlation. - -# Dendrogram - -You may be interested in visualizing how similar samples are with a dendrogram. For this we will need to calculate the distance between samples with `stats::dist()`, cluster with `hclust()`, and visualize with `ggplot` and `ggdendro` packages. - -First, calculate distance and and cluster: - -```{r} -dist <- stats::dist(t(counts), method = "euclidian") -cluster <- stats::hclust(dist, method = "complete") -``` - -Calculate dendrogram compoents with `stats::as.dendrogram()`: - -```{r} -dendro <- as.dendrogram(cluster) -``` - -Extract plotting elements with `ggdendrogram::dendro_data()` - -```{r} -den_data <- dendro_data(dendro, type = "rectangle") -``` - -Use `ggplot` and `ggdendrogram` to plot the dendrogram. For more customizations, see [ggdendro documentation](https://cran.r-project.org/web/packages/ggdendro/vignettes/ggdendro.html). - -```{r} -ggplot(segment(den_data)) + - geom_segment(aes(x = x, y = y, xend = xend, yend = yend)) + - geom_text(data = den_data$labels, - aes(x = x, y = y, label = label), - size = 3, - hjust = "outward") + - coord_cartesian(ylim = c((min(segment(den_data)$y) + 10),(max(segment(den_data)$y)))) + - coord_flip() + - scale_y_reverse(expand = c(.5, 0)) + - theme_dendro() -``` - -# Fold change analysis - -Say we wanted to compare metabolite profiles between our Coculture and ANG18 groups. We can extract group means from the filtered compounds using the mpactR function `get_group_averages()`, isolate the groups of interest, here Coculture and ANG18 monocoluture, and calculate compound fold change. - -### Calculate fold change - -```{r} -get_group_averages(data_filtered) %>% - filter(Biological_Group == "Coculture" | - Biological_Group == "ANG18") %>% - select(Compound, Biological_Group, average) %>% - pivot_wider(names_from = Biological_Group, values_from = average) %>% - mutate(fc = Coculture / ANG18) %>% - head() -``` - -Inherently, tandem MS/MS datasets can be filled with many zeros. In some instances a coumpound is found in the experimental group but not in the control group, or vice versa. In these cases, the fold change calculation yields an infinite number as there is a zero either in the numerator or denominator ($fold chage = experimental / control$). There is also the chance that the compound is not found in either group, yielding a fold change on NaN (0/0). - -Compounds that are not in either group are of no interest in this comparison and can therefore be removed from the analysis. - -```{r} -get_group_averages(data_filtered) %>% - filter(Biological_Group == "Coculture" | - Biological_Group == "ANG18") %>% - select(Compound, Biological_Group, average) %>% - pivot_wider(names_from = Biological_Group, values_from = average) %>% - mutate(nonzero_compound = if_else(Coculture == 0 & ANG18 == 0, FALSE, TRUE)) %>% - filter(nonzero_compound == TRUE) %>% - mutate(fc = Coculture / ANG18) %>% - head() -``` - -There mutliple ways to handle compounds that are found in the one group but not the other. Here we will set an upper and lower bounds. - -First, extract group averages from the `mpactr` object and select the two groups of interest. Here we are interested in the columns Coculture and ANG18: - -```{r} -get_group_averages(data_filtered) %>% - filter(Biological_Group == "Coculture" | - Biological_Group == "ANG18") %>% - select(Compound, Biological_Group, average) %>% - head() -``` - -To calculate fold change, we need to have two columns: one for the control group, here ANG18, and one for the experimental group, Coculture: - -```{r} -get_group_averages(data_filtered) %>% - filter(Biological_Group == "Coculture" | - Biological_Group == "ANG18") %>% - select(Compound, Biological_Group, average) %>% - pivot_wider(names_from = Biological_Group, values_from = average) %>% - head() -``` - -Now we can remove compounds with an intensity of 0 in both groups: - -```{r} -get_group_averages(data_filtered) %>% - filter(Biological_Group == "Coculture" | - Biological_Group == "ANG18") %>% - select(Compound, Biological_Group, average) %>% - pivot_wider(names_from = Biological_Group, values_from = average) %>% - mutate(nonzero_compound = if_else(Coculture == 0 & ANG18 == 0, FALSE, TRUE)) %>% - filter(nonzero_compound == TRUE) %>% - head() -``` - -Next, we calculate fold change for all remaining compounds: - -```{r} -get_group_averages(data_filtered) %>% - filter(Biological_Group == "Coculture" | - Biological_Group == "ANG18") %>% - select(Compound, Biological_Group, average) %>% - pivot_wider(names_from = Biological_Group, values_from = average) %>% - mutate(nonzero_compound = if_else(Coculture == 0 & ANG18 == 0, FALSE, TRUE)) %>% - filter(nonzero_compound == TRUE) %>% - mutate(fc = Coculture / ANG18) %>% - head() -``` - -Now we correct the bounds: for compounds that have 0 intenisty in the control group we will set fold change to 100, and for those with 0 intensity in the experimental group we set the fold change to 0.01. - -```{r} -get_group_averages(data_filtered) %>% - filter(Biological_Group == "Coculture" | - Biological_Group == "ANG18") %>% - select(Compound, Biological_Group, average) %>% - pivot_wider(names_from = Biological_Group, values_from = average) %>% - mutate(nonzero_compound = if_else(Coculture == 0 & ANG18 == 0, FALSE, TRUE)) %>% - filter(nonzero_compound == TRUE) %>% - mutate(fc = Coculture / ANG18, - fc = case_when(fc >= 100 ~ 100, - fc <= 0.01 ~ 0.01, - .default = fc)) %>% - head() -``` - -Finally, transform fold change to log2: - -```{r} -foldchanges <- get_group_averages(data_filtered) %>% - filter(Biological_Group == "Coculture" | - Biological_Group == "ANG18") %>% - select(Compound, Biological_Group, average) %>% - pivot_wider(names_from = Biological_Group, values_from = average) %>% - mutate(nonzero_compound = if_else(Coculture == 0 & ANG18 == 0, FALSE, TRUE)) %>% - filter(nonzero_compound == TRUE) %>% - mutate(fc = Coculture / ANG18, - fc = case_when(fc >= 100 ~ 100, - fc <= 0.01 ~ 0.01, - .default = fc), - logfc = log2(fc)) - -head(foldchanges) -``` - -### Visualize fold change for compounds in a 3D scatter plot - -We can then probe compounds by plotting a 3D scatter plot of log2 fold changes as a function of m/z and retention time: - -```{r} -fc_plotting <- foldchanges %>% - left_join(select(ft, Compound, mz, rt), by = "Compound") - -plot_ly(fc_plotting, x = ~logfc, y = ~rt, z = ~mz, - marker = list(color = ~logfc, showscale = TRUE)) %>% - add_markers() %>% - layout(scene = list(xaxis = list(title = 'log2 Fold Change'), - yaxis = list(title = 'Retention Time (min)'), - zaxis = list(title = 'm/z')), - annotations = list( - x = 1.13, - y = 1.05, - text = 'log2 Fold Change', - xref = 'paper', - yref = 'paper', - showarrow = FALSE - )) -``` - -### Test for compounds with significant fold changes - -For the t-tests, we have already calculated variation for compounds within biological groups as well as within techincal replicates in biological groups. We can calculate the t statisic and degrees of freedom and use the t distribution (`pt()`) to calculate the p-value. - -*combine biological and technical variaition and calcualte sample size* - -```{r} -# Satterwaite -calc_samplesize_ws <- function(sd1, n1, sd2, n2) { - s1 <- sd1 / (n1^0.5) - s2 <- sd2 / (n2^0.5) - - n <- (s1^2/n1 + s2^2/n2)^2 - d1 <- s1^4 / ((n1^2) * (n1 - 1)) - d2 <- s2^4 / ((n2^2) * (n2 - 1)) - - d1[which(!is.finite(d1))] <- 0 - d2[which(!is.finite(d2))] <- 0 - - d <- d1 + d2 - - return(n / d) -} - -my_comp <- c("Coculture", "ANG18") - -stats <- get_group_averages(data_filtered) %>% - mutate(combRSD = sqrt(techRSD^2 + BiolRSD^2), - combASD = combRSD * average, - combASD = if_else(is.na(combASD), 0, combASD), - BiolRSD = if_else(is.na(BiolRSD), 0, BiolRSD), - techRSD = if_else(is.na(techRSD), 0, techRSD), - neff = calc_samplesize_ws(techRSD, techn, BiolRSD, Bioln) + 1) %>% - filter(Biological_Group %in% my_comp) - -head(stats) -``` - -*calculate the t statitstic* - -```{r} -denom <- stats %>% - summarise(den = combASD^2 / (neff), .by = c("Compound", "Biological_Group")) %>% - mutate(den= if_else(!is.finite(den), 0, den)) %>% - summarise(denom = sqrt(sum(den)), .by = c("Compound")) - -t_test <- stats %>% - select(Compound, Biological_Group, average) %>% - pivot_wider(names_from = Biological_Group, values_from = average) %>% - mutate(numerator = (Coculture - ANG18)) %>% # experimental - control - left_join(denom, by = "Compound") %>% - mutate(t = abs(numerator / denom)) - -head(t_test) -``` - -*calculate degrees of freedom* - -```{r} -df <- stats %>% - select(Compound, Biological_Group, neff) %>% - mutate(neff = if_else(!is.finite(neff), 0, neff)) %>% - pivot_wider(names_from = Biological_Group, values_from = neff) %>% - mutate(deg = Coculture + ANG18 - 2) %>% - select(Compound, deg) - -head(df) -``` - -*calculate p-value* - -```{r} -t <- t_test %>% - left_join(df, by = "Compound") %>% - mutate(p = (1 - pt(t, deg)) * 2, - logp = log10(p), - neg_logp = -logp) %>% - select(Compound, t, deg, p, logp, neg_logp) - -head(t) -``` - -*combine t-test results with fold changes we calculated* - -```{r} -num_ions <- t %>% - filter(p <= 1) %>% - count() %>% - pull() - -fc <- foldchanges %>% - left_join(t, by = "Compound") %>% - arrange(p) %>% - mutate(qval = 1:(length(p)), - qval = p * num_ions / qval) %>% - arrange(desc(p)) - -min_qval <- 1 - -for (i in 1:nrow(fc)) { - if (!is.finite(fc$qval[i])) { - next - } - - if (fc$qval[i] < min_qval) { - min_qval <- fc$qval[i] - } else { - fc$qval[i] <- min_qval - } -} - -fc2 <- fc %>% - mutate(neg_logq = -log10(qval)) - -``` - -### Volcano plot - -Now we can create a volcano plot to see the magnitude of metabolite abundance differences between the Cocoulture and ANG18 groups. - -We can plot log2 fold change values against log~10~ p-values or log~10~ q-values, both of which we calcuated in out t-tests above. - -*log2 fold change ~ log~10~ p-values* - -The base of a volcano plot is a scatter plot with log2 fold changes on the x axis and -log~10~ p-values on the y axis: - -```{r} -fc2 %>% -ggplot() + - aes(x = logfc, y = neg_logp) + - geom_point() + - labs(x = "log2 Fold Change", y = "-Log~10~ *P*", color = "Differential Abundance") + - theme_bw() -``` - -This is good, but we probably want know which compounds are above the p-value threshold and outside the log2 fold change threshold. Fold change threshold commonly ranges from 1.5 - 2. For this plot, we will show a p-value threshold of 0.05 and fold change threshold of 1.5. - -First add add a horizontal line to denote the cutoff of -log~10~ 0.05: - -```{r} -fc2 %>% -ggplot() + - aes(x = logfc, y = neg_logp) + - geom_point() + - geom_hline(yintercept = -log10(0.05), linetype = "dashed") + - labs(x = "log2 Fold Change", y = "-Log~10~ *P*", color = "Differential Abundance") + - theme_bw() -``` - -Now we can add vertical lines showing the positive and negative cutoffs for log2 fold change: - -```{r} -fc2 %>% -ggplot() + - aes(x = logfc, y = neg_logp) + - geom_point() + - geom_hline(yintercept = -log10(0.05), linetype = "dashed") + - labs(x = "log2 Fold Change", y = "-Log~10~ *P*", color = "Differential Abundance") + - geom_vline(xintercept = log2(1.5), linetype = "dashed") + - geom_vline(xintercept = -log2(1.5), linetype = "dashed") + - theme_bw() -``` - -So now we can see the thresholds, but its difficult to make out which compounds have significant fold change values. It would be nice to color the compounds by their significance. Specifically, compounds whose intensity are signigicantly higher in the experimental group, compounds whose intensity are significantly lower in the experimental group, compounds with significant p-values but the fold change is below the fold change threshold, and compounds whose fold change are not significant. - -We can do this by adding a new variable to our dataset and using this variable to color our points. - -First, add a new variable `sig` which with values `Increased` (fold change > 1.5 & pvalue < 0.05), `Decreased` (fold change < -1.5 & pvalue < 0.05), `Inconclusive` (fold change -1.5 - 1.5 & pvalue < 0.05), and `not_sig` (pvalue > 0.05). - -```{r} -fc2 %>% - mutate(sig = case_when(p > 0.05 ~ "not_sig", - p <= 0.05 & logfc >= log2(1.5) ~ "Increased", - p <= 0.05 & logfc < -log2(1.5) ~ "Decreased", - TRUE ~ "Inconclusive"), - sig = factor(sig, - levels = c("Increased", "Decreased", "Inconclusive", "not_sig"), labels = c("Increased", "Decreased", "Inconclusive", "Not sigificant"))) %>% - select(Compound, ANG18, Coculture, fc, logfc, p, sig) %>% - head() -``` - -Now we can set the color aestheic to the variable `sig` and select custom colors with `scale_color_manual()`: - -```{r} -fc2 %>% - mutate(sig = case_when(p > 0.05 ~ "not_sig", - p <= 0.05 & logfc >= log2(1.5) ~ "Increased", - p <= 0.05 & logfc < -log2(1.5) ~ "Decreased", - TRUE ~ "Inconclusive"), - sig = factor(sig, - levels = c("Increased", "Decreased", "Inconclusive", "not_sig"), labels = c("Increased", "Decreased", "Inconclusive", "Not sigificant"))) %>% - ggplot() + - aes(x = logfc, y = neg_logp, color = sig) + - geom_point() + - geom_hline(yintercept = -log10(0.05), linetype = "dashed") + - geom_vline(xintercept = log2(1.5), linetype = "dashed") + - geom_vline(xintercept = -log2(1.5), linetype = "dashed") + - scale_color_manual(values = c("red", "blue", "grey", "black")) + - labs(x = "log2 Fold Change", - y = "-Log~10~ *P*", - color = "Differential Abundance") + - theme_bw() -``` - -Finally, adjust axis text and titles to your liking: - -```{r} -fc2 %>% - mutate(sig = case_when(p > 0.05 ~ "not_sig", - p <= 0.05 & logfc >= log2(1.5) ~ "Increased", - p <= 0.05 & logfc < -log2(1.5) ~ "Decreased", - TRUE ~ "Inconclusive"), - sig = factor(sig, - levels = c("Increased", "Decreased", "Inconclusive", "not_sig"), labels = c("Increased", "Decreased", "Inconclusive", "Not sigificant"))) %>% - ggplot() + - aes(x = logfc, y = neg_logp, color = sig) + - geom_point() + - geom_hline(yintercept = -log10(0.05), linetype = "dashed") + - geom_vline(xintercept = log2(1.5), linetype = "dashed") + - geom_vline(xintercept = -log2(1.5), linetype = "dashed") + - scale_color_manual(values = c("red", "blue", "grey", "black")) + - labs(x = "log2 Fold Change", - y = "-Log~10~ *P*", - color = "Differential Abundance") + - theme_bw() + - theme( - axis.title = element_markdown(size = 20), - axis.text = element_text(size = 15) - ) -``` - -Use can also make the plot interactive, showing the compound id for each point: - -```{r} -volcano <- fc2 %>% - mutate(sig = case_when(p > 0.05 ~ "not_sig", - p <= 0.05 & logfc >= log2(1.5) ~ "Increased", - p <= 0.05 & logfc < -log2(1.5) ~ "Decreased", - TRUE ~ "Inconclusive"), - sig = factor(sig, - levels = c("Increased", "Decreased", "Inconclusive", "not_sig"), labels = c("Increased", "Decreased", "Inconclusive", "Not sigificant"))) %>% - ggplot() + - aes(x = logfc, y = neg_logp, color = sig, - text = paste0("Compound: ", Compound)) + - geom_point() + - geom_hline(yintercept = -log10(0.05), linetype = "dashed") + - geom_vline(xintercept = log2(1.5), linetype = "dashed") + - geom_vline(xintercept = -log2(1.5), linetype = "dashed") + - scale_color_manual(values = c("red", "blue", "grey", "black")) + - labs(x = "log2 Fold Change", - y = "-Log~10~ *P*", - color = "Differential Abundance") + - theme_bw() + - theme( - axis.title = element_markdown(size = 20), - axis.text = element_text(size = 15) - ) - -ggplotly(volcano) -``` - -*log2 fold change ~ log~10~ q-values* - -You can also look at the volcano plot with the q value to control for false discovery rate (FDR) using the same approch shown for p values: - -```{r} -fc2 %>% - mutate(sig = case_when(qval > 0.05 ~ "not_sig", - qval <= 0.05 & logfc >= log2(1.5) ~ "Increased", - qval <= 0.05 & logfc < -log2(1.5) ~ "Decreased", - TRUE ~ "Inconclusive"), - sig = factor(sig, - levels = c("Increased", "Decreased", "Inconclusive", "not_sig"), labels = c("Increased", "Decreased", "Inconclusive", "Not sigificant"))) %>% - ggplot() + - aes(x = logfc, y = neg_logq, color = sig) + - geom_point() + - geom_hline(yintercept = -log10(0.05), linetype = "dashed") + - geom_vline(xintercept = log2(1.5), linetype = "dashed") + - geom_vline(xintercept = -log2(1.5), linetype = "dashed") + - scale_color_manual(values = c("red", "blue", "grey", "black")) + - labs(x = "log2 Fold Change", - y = "-Log~10~ q-value", - color = "Differential Abundance") + - theme_bw() + - theme( - axis.title = element_markdown(size = 20), - axis.text = element_text(size = 15) - ) -``` diff --git a/vignettes/articles/filter.qmd b/vignettes/articles/filter.qmd deleted file mode 100644 index db9b4f0..0000000 --- a/vignettes/articles/filter.qmd +++ /dev/null @@ -1,392 +0,0 @@ ---- -title: "Filter" -vignette: > - %\VignetteIndexEntry{Filter} - %\VignetteEngine{quarto::html} - %\VignetteEncoding{UTF-8} ---- - -```{r} -#| include: false -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>" -) -``` - -```{r setup} -#| output: false -# library(mpactR) -devtools::load_all() -library(tidyverse) -``` - -## Load data into R - -mpactR requires 2 files as imput: a feature table and metadata file. Both are expected to be comma separated files (*.csv*). - -1. peak_table: a peak table in Progenesis format is expected. To export a compatable peak table in Progensis, navigate to the *Review Compounds* tab then File -> Export Compound Measurements. Select the following properties: Compound, m/z, Retention time (min), and Raw abundance and click ok. -2. metadata: a table with sample information. At minimum the following columns are expected: Injection, Sample_Code, Biological_Group. Injection is the sample name and is expected to match sample column names in the peak_table. Sample_Code is the id for technical replicate groups. Biological Group is the id for biological replicate groups. Other sample metadata can be added, and is encouraged for downstream analysis following filtering with mpactR. - -To import these data into R, use the mpactR function `import_data()`, which has the arguments: `peak_table_file_path` and `meta_data_file_path` - -```{r} -data <- import_data(example("cultures_peak_table.csv"), - example("cultures_metadata.csv")) -``` - -This will create an R6 class object, which will store both the peak table and metadata. Note: you do not need to use `example()` if you are using your own files. This is a package function that allows access to internal package data. If you simply want to play around with the packge using example data, you can follow run the `import_data` function as shown above. - -Calling the new mpactr object will print the current peak table in the terminal: - -```{r} -data -``` - -## Accessing data in mpactr object - -You can extract the peak table or metadata at any point with `get_raw_data()`, `get_peak_table()` and `get_meta_data()` functions. Both functions will return a `data.table` object with the corresponding information. - -### Extract peak table - -To extract the raw input peak table with `get_raw_data()`: - -```{r} -get_raw_data(data)[1:5, 1:8] -``` - -The raw peak table will not change as filters are applied to the data. If you want to extract the filtered peak table, use `get_peak_table()`: - -```{r} -get_peak_table(data)[1:5, 1:8] -``` - - -### Extract metadata - - -```{r} -get_meta_data(data)[1:5, ] -``` - -## Filtering - -mpactR provides filters to correct for the following issues observed during preprocessing of tandem MS/MS data: - -- mispicked ions: isotopic patterns that are incorrectly split during preprocessing. -- solvent blank impurities: features overrepresented in a specific group of samples; for example removal of features present is solvent blanks due to carryover between samples. -- non-reproducible ions: those that are inconsistent between technical replicates. -- insource ions: fragment ions created during the first ionization in the tandem MS/MS workflow. - -#### Mispicked ions filter - -To check for mispicked ions, use mpactR function `filter_mispicked_ions()`. This function takes an `mpactr object` as input, and checks for similar ions with the arguments `ringwin`, `isowin`, `trwin` and `max_iso_shift`. - -Ions in the feature table are flagged as similar based on retention time and mass. Flagged ion groups are suggested to be the result of incorrect splitting of isotopic patterns during peak picking, detector saturation artifacts, or incorrect identification of multiply charged oligomers. - -```{r} -data_mispicked <- filter_mispicked_ions(data, ringwin = 0.5, - isowin = 0.01, trwin = 0.005, - max_iso_shift = 3, merge_peaks = TRUE, - merge_method = "sum", - copy_object = TRUE) -``` - -Each filter reports progress of filtering, here we can see that `r length(filter_summary(data_mispicked, "mispicked")$failed_ions) + length(filter_summary(data_mispicked, "mispicked")$passed_ions)`ions were present prior to checking for mispicked ions. `r length(filter_summary(data_mispicked, "mispicked")$failed_ions)` ions were found to be similar to another ion and following merging, `r (length(filter_summary(data_mispicked, "mispicked")$failed_ions) + length(filter_summary(data_mispicked, "mispicked")$passed_ions)) - length(filter_summary(data_mispicked, "mispicked")$failed_ions)` ions remain. - -You can look at which ions were found to similar to one another with the `get_similar_ions()` function. - -```{r} -head(get_similar_ions(data_mispicked)) -``` - -#### Remove ions that are above a threhold in one biological sample group - -Removing solvent blank impurities is important for correcting for between-sample carryover and contamination in experimental samples. You can identify and remove these ions with mpactR's `filter_group()` function. `filter_group()` identifies ions above a relative abundance threshold (`group_threshold`) a specific group (`group_to_remove`). To remove solvent blank impurities set `group_to_remove` to the `Biological_Group` in your metadata file which corresponds to your solvent blank samples, here "Solvent_Blank". - -```{r} -data_blank <- filter_group(data, group_threshold = 0.01, - group_to_remove = "Solvent_Blank", remove_ions = TRUE, - copy_object = TRUE) -``` - -In this example, `r length(filter_summary(data_blank, "group", "Solvent_Blank")$failed_ions) + length(filter_summary(data_blank, "group", "Solvent_Blank")$passed_ions)` ions were present prior to the group filter. `r length(filter_summary(data_blank, "group", "Solvent_Blank")$failed_ions)` ions were found to be above the relative abundance threshold of 0.01 in "Solvent_Blank" samples, leaving `r (length(filter_summary(data_blank, "group", "Solvent_Blank")$failed_ions) + length(filter_summary(data_blank, "group", "Solvent_Blank")$passed_ions)) - length(filter_summary(data_blank, "group", "Solvent_Blank")$failed_ions)` ions in the peak table. - -#### Remove non-reproducible ions - -Ions whose abundance are not consisent between technical replicates (*i.e.*, nonreproducible) may not be reliable for analysis and therefore should be removed from the feature table. Nonreproducible ions are identified by mean or median coefficient of variation (cv) with `filter_cv()`. - -```{r} -data_rep <- filter_cv(data, cv_threshold = 0.2, cv_param = "median", - copy_object = TRUE) -``` - -In our example dataset, `r length(filter_summary(data_rep, "replicability")$failed_ions)` ions were flagged as nonreproducible. These ions were removed, leaving `r (length(filter_summary(data_rep, "replicability")$failed_ions) + length(filter_summary(data_rep, "replicability")$passed_ions)) - length(filter_summary(data_rep, "replicability")$failed_ions)` ions in the feature table. - -If you would like to visualize how the CV threshold performed on your dataset, you can extract the CV calculations using mpactR's `get_cv_data()` function and caclulate the percentage of features for plotting. You can look at both mean and median cv as shown in the example below, or you can filter the data by the parameter of choice. - -```{r} -cv <- get_cv_data(data_rep) %>% - pivot_longer(cols = c("mean_cv", "median_cv"), names_to = "param", values_to = "cv") %>% - nest(.by = param) %>% - mutate(data = map(data, arrange, cv), - data = map(data, mutate, index = 0:(length(cv)-1)), - data = map(data, mutate, index_scale = index * 100 / length(cv))) - -head(cv) -``` - -The nested data are tibbles with the columns `r colnames(cv$data[[1]])`: - -```{r} -#| echo: false - -head(cv$data[[1]]) -``` - -There is one tibble for each parameter (either mean or median). We also want to calculate the percentage of features represented by the CV threshold. - -```{r} -cv_thresh_percent <- cv %>% - filter(param == "median_cv") %>% - unnest(cols = data) %>% - mutate(diff_cv_thresh = abs(cv - 0.2)) %>% - slice_min(diff_cv_thresh, n = 1) %>% - pull(index_scale) - -cv_thresh_percent -``` - - -Then we can plot percentage of features by CV: - -```{r} -#| warning: false -cv %>% - unnest(cols = data) %>% - mutate(param = factor(param, levels = c("mean_cv", "median_cv"), labels = c("mean", "median"))) %>% - ggplot() + - aes(x = cv, y = index_scale, group = param, color = param) + - geom_line(size = 2) + - geom_vline(xintercept = 0.2, color = "darkgrey", linetype = "dashed", size = 1) + - geom_hline(yintercept = cv_thresh_percent, color = "darkgrey", size = 1) + - labs(x = "CV", y = "Percentage of Features", param = "Statistic") + - theme_bw() + - theme( - axis.title = element_text(size = 20), - axis.text = element_text(size = 15), - legend.position = "inside", - legend.position.inside = c(.90, .08), - legend.title = element_blank(), - legend.text = element_text(size = 15) - ) - -``` - -Here we can see that roughly 80% of features were below the CV threshold meaning 20% were removed at a CV theshold of 0.2. - -```{r} -#| include: false -#| eval: false -cv <- cv %>% - mutate(data = map(data, mutate, dist = cv - lag(cv)), - data = map(data, mutate, auc = dist * index_scale)) - -# Replicability statistics -mean_n <- get_meta_data(data_rep) %>% - summarise(count = n_distinct(Injection) / n_distinct(Sample_Code)) %>% - pull() - -stdev <- c(1, 0, 0 * (as.integer(mean_n) - 1)) -mod_stdev <- sd(stdev) / mean(stdev) - -mean_ord_med <- t$data[[2]] %>% - left_join(select(t$data[[1]], Compound, cv), by = "Compound") %>% - rename(cv_median = cv.x, - cv = cv.y) %>% - arrange(cv_median) %>% - mutate(param = "mead_ord_med") %>% - select(param, cv, everything()) - -cv2 <- t %>% - unnest(cols = data) %>% - bind_rows(mean_ord_med) %>% - nest(.by = param) - -index_cv_mean <- function(x, df, sd) { - mean(df$index_scale[which(abs(df$cv - x - sd/200) < sd/200)]) -} - -s <- data.frame(val = 1:(mod_stdev*100)) %>% - mutate(pos = val/100, - meanavg = map_dbl(pos, index_cv_mean, df = cv2$data[[1]], sd = mod_stdev), - meanmed = map_dbl(pos, index_cv_mean, df = cv2$data[[3]], sd = mod_stdev), - skew = abs(meanmed - meanavg), - skew_sd = skew * mod_stdev/100 - ) - -sumskew <- s %>% - summarise(sum = sum(skew_sd, na.rm = TRUE)) %>% - pull(sum) - -auc_groups <- t %>% - unnest(cols = data) %>% - summarise(auc = sum(auc, na.rm = TRUE), .by = param) - -filter(auc_groups, param == "median_cv") %>% pull(auc) - -sumskew_n <- sumskew / ((filter(auc_groups, param == "median_cv") %>% pull(auc) + filter(auc_groups, param == "mean_cv") %>% pull(auc))/2) -rep <- ((filter(auc_groups, param == "median_cv") %>% pull(auc) + filter(auc_groups, param == "mean_cv") %>% pull(auc))/2) / (mod_stdev*100) -qualscore <- (1 - sumskew_n) * rep * 100 - -paste0("Reproducibility: ", round(100*rep, 1), "% ", - "Skewness: ", round(100*sumskew_n, 1), "% ", - "Overall: ", round(qualscore, 1), "%") -``` - -#### Remove insouce fragment ions - -Some ions produce fragments during the first ionization of tandem MS/MS, called insouce ions. This can result in ions from one compound being represented more than once in the feature table. If you would like to remove insource ions framgments, you can do so with mpactR's `filter_insource_ions()`. `filter_insource_ions()` conducts ion decovolution via retention time correlation matrices within MS1 scans. Highly correlated ion groups are determined by the `cluster_threshold` parameter and filtered to remvome the low mass features. The highest mass feature is identified as the likely parent ion and retained in the feature table. - -```{r} -data_insource <- filter_insource_ions(data, cluster_threshold = 0.95, - copy_object = TRUE) -``` - -`r length(filter_summary(data_insource, "insource")$failed_ions)` ions were identified and removed during deconvolution of this dataset, leaving `r (length(filter_summary(data_insource, "insource")$failed_ions) + length(filter_summary(data_insource, "insource")$passed_ions)) - length(filter_summary(data_insource, "insource")$failed_ions)` ions in the feature table. - -#### Chaining filters together - -Filters can be chained in a customizable workflow, shown below. While filters can be chained in any order, we recommend filtering mispicked ions, then solvent blanks, prior to filtering nonrepoducible or insouce ions. This will allow for incorrectly picked peaks to be merged and any contamination/carryover removed prior to identifying nonreproducible and insource fragment ions. Here we also demonstrate the removal of media blank components with the `filter_group()` function. - -```{r} -data <- import_data(example("cultures_peak_table.csv"), - example("cultures_metadata.csv")) - -data_filtered <- filter_mispicked_ions(data, merge_method = "sum") |> - filter_group(group_to_remove = "Solvent_Blank") |> - filter_group(group_to_remove = "Media") |> - filter_cv(cv_threshold = 0.2, cv_param = "median") - -``` - - -## Reference semantics - -Note: mpactR is built on an R6 class-system, meaning it opperates on reference semantics in which data is updated *in-place*. Compared to a shallow copy, where only data pointers are copied, or a deep copy, where the entire data object is copied in memory, any changes to the original data object, regardless if they are assigned to a new object, result in changes to the original data object. We can see this below. - -Where the raw data object has 1303 ions in the feature table: - -```{r} -data2 <- import_data(example("cultures_peak_table.csv"), - example("cultures_metadata.csv")) - -get_peak_table(data2)[ , 1:5] -``` - -Running the `filter_mispicked_ions` filter, with default setting `copy_object = FALSE` (operates on reference semtanics) results in 1233 ions in the feature table: - -```{r} -data2_mispicked <- filter_mispicked_ions(data2, ringwin = 0.5, - isowin = 0.01, trwin = 0.005, - max_iso_shift = 3, merge_peaks = TRUE, - merge_method = "sum", - copy_object = FALSE) - -get_peak_table(data2_mispicked)[ , 1:5] -``` - -Even though we created an object called `data2_mispicked`, the original `data2` object was also updated and now has 1233 ions in the feature table: - -```{r} -get_peak_table(data2)[ , 1:5] -``` - -We recommend using the default `copy_object = FALSE` as this makes for an extremely fast and memory-efficient way to chain mpactR filters together (see **Chaining filters together** section); however, if you would like to run the filters individually with traditional R style objects, you can set `copy_object` to `TRUE` as shown in the filter examples. - -If you are interested in the groups of similar ions flagged in this filter, you can use `get_similar_ions()`. This function returns a `data.table` report the main ion (the ion retained post-merging) and the ions similar to it. - -## Summary - -mpactR offers mutliple ways to view a summary of data filtering. - -#### View passing and failed ions for a single filter - -If you are interested in viewing the passing and failing ions for a single filter, use the `filter_summary()` function. You must specify which filter you are intested in, either "mispicked", "group", "replicability", or "insouce". - -```{r} -mispicked_summary <- filter_summary(data_filtered, filter = "mispicked") -``` - -Failed ions: -```{r} -mispicked_summary$failed_ions -``` - -Passing ions: -```{r} -head(mispicked_summary$passed_ions, 100) -``` - -If you set `filter` to a filter name that you did not apply to your data, a warning message will be returned. - -```{r} -#| error: true -filter_summary(data_filtered, filter = "insource") -``` - -If you want to retrieve the filter summary for the group filter, you must also supply the group name with the `group` argument: - -```{r} -filter_summary(data_filtered, filter = "group", group = "Solvent_Blank") -``` - -`group` must always be supplied for this filter, even if only one group filter has been applied. - -#### View passing and failed ions for all input ions - -You can view the fate of all input ions with the `qc_summary()` function. A data.table reporting the compound id (`compounds`) and if it failed or passed filtering. If the ion failed filtering, its status will report the name of the filter it failed. - -```{r} -head(qc_summary(data_filtered)[order(compounds), ]) -``` - -#### Visualize filtering QC with tree map plot - -You can visualize filtering efforts with a tree map using the filtering summary obtained from `qc_summary()` and the packages `ggplot2` and `treemapify`. - -First we need to determine the number of ions for each ion status in the summary table. You can report the count, however we need to calculate the percent of ions in each group. We have done this in the code chunk below, where we have used `data.table` syntax as the `qc_summary()` returns a `data.table` object. If you are not familar with the package data.table, check out their resources on [gitlab](https://rdatatable.gitlab.io/data.table/). - -```{r} -library(ggplot2) -library(treemapify) - -ion_counts <- qc_summary(data_filtered)[ , .(count = .N), by = status][ - , percent := (count / sum(count) * 100)] -``` - -Finally, we plot the treemap: - -```{r} -tm <- ggplot(ion_counts) + - aes(area = percent, fill = status) + - geom_treemap() + - geom_treemap_text(aes(label = paste(status, paste0(round(percent, 2), "%"), sep = "\n"), - fontface = c("bold"))) - -tm -``` - -This plot can be customized with ggplot2, for example we no longer need the legend and maybe we want custom colors: - -```{r} -tm + - scale_fill_brewer(palette = "Greens") + - theme(legend.position = "none") -``` - -If you want a fast visualization of the treemap, you can pass the mpactr object to the function `plot_qc_tree()`. - -```{r} -plot_qc_tree(data_filtered) -``` diff --git a/vignettes/articles/filter2.Rmd b/vignettes/articles/filter2.Rmd new file mode 100644 index 0000000..5afcb5c --- /dev/null +++ b/vignettes/articles/filter2.Rmd @@ -0,0 +1,464 @@ +--- +title: "Filter with mpact original data" +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +```{r setup, message=FALSE} +# library(mpactR) +devtools::load_all() +library(tidyverse) +``` + +## Load data into R + +mpactR requires 2 files as imput: a feature table and metadata file. Both are expected to be comma separated files (*.csv*). + +1. peak_table: a peak table in Progenesis format is expected. To export a compatable peak table in Progenesis, navigate to the *Review Compounds* tab then File -> Export Compound Measurements. Select the following properties: Compound, m/z, Retention time (min), and Raw abundance and click ok. +2. metadata: a table with sample information. At minimum the following columns are expected: Injection, Sample_Code, Biological_Group. Injection is the sample name and is expected to match sample column names in the peak_table. Sample_Code is the id for technical replicate groups. Biological_Group is the id for biological replicate groups. Other sample metadata can be added, and is encouraged for downstream analysis following filtering with mpactR. + +To import these data into R, use the mpactR function `import_data()`, which has the arguments: `peak_table_file_path` and `meta_data_file_path`. This tutorial will show you examples with data from the original mpact program, found on [GitHub](https://github.com/BalunasLab/mpact/tree/main/rawdata/PTY087I2). This dataset contain 38 samples for biological groups solvent blanks, media blanks, Streptomyces sp. PTY08712 grown with (250um_Ce) and without (0um_Ce) rare earth element cerium. For more information about the experiments conducted see [Samples, Puckett, and Balunas 2023](https://pubs.acs.org/doi/abs/10.1021/acs.analchem.2c04632). + +The original program has sample metadata split across two files: the sample list and metadata. mpactR accepts a single file, so we need to combine these in one prior to import with `import_data()`. + +Load the sample list and metadata files. + +```{r} +samplelist <- read_csv("https://raw.githubusercontent.com/BalunasLab/mpact/main/rawdata/PTY087I2/samplelist.csv") +metadata <- read_csv("https://raw.githubusercontent.com/BalunasLab/mpact/main/rawdata/PTY087I2/extractmetadata.csv") +``` + + +Our sample list contains additional blank samples that are not in the feature table, and therefore should be removed prior to import. + +```{r} +samples <- colnames(read_csv("https://raw.githubusercontent.com/BalunasLab/mpact/main/rawdata/PTY087I2/200826_PTY087I2codingdataset.csv", skip = 2)) %>% str_subset(., "200826") + +meta_data <- samplelist %>% + left_join(metadata, by = "Sample_Code") %>% + filter(Injection %in% samples) +``` + +Now we can import the data. We will provide the url for the `peak_table`, and our reformatted meta_data object. This peak table was exported from Progenesis, so we will set the `format` argument to Progenesis. + +```{r} +data <- import_data(peak_table = "https://raw.githubusercontent.com/BalunasLab/mpact/main/rawdata/PTY087I2/200826_PTY087I2codingdataset.csv", + meta_data = meta_data, + format = "Progenesis" +) +``` + +This will create an R6 class object, which will store both the peak table and metadata. + +Calling the new mpactr object will print the current peak table in the terminal: + +```{r} +data +``` + +## Accessing data in mpactr object + +You can extract the peak table or metadata at any point with `get_raw_data()`, `get_peak_table()` and `get_meta_data()` functions. Both functions will return a `data.table` object with the corresponding information. + +### Extract peak table + +The raw peak table is the unfiltered peak table used as input to mpactR. To extract the raw input peak table, use the function `get_raw_data()`. + +```{r} +get_raw_data(data)[1:5, 1:8] +``` + +The raw peak table will not change as filters are applied to the data. If you want to extract the filtered peak table, with filters that have been applied, use `get_peak_table()`: + +```{r} +get_peak_table(data)[1:5, 1:8] +``` + +### Extract metadata + +Metadata can be accessed with `get_meta_data()`: + +```{r} +get_meta_data(data)[1:5, ] +``` + +## Reference semantics + +mpactR is built on an R6 class-system, meaning it operates on reference semantics in which data is updated *in-place*. Compared to a shallow copy, where only data pointers are copied, or a deep copy, where the entire data object is copied in memory, any changes to the original data object, regardless if they are assigned to a new object, result in changes to the original data object. We can see this below. + +Where the raw data object has `r nrow(get_peak_table(data))` ions in the feature table: + +```{r} +data2 <- import_data(peak_table = "https://raw.githubusercontent.com/BalunasLab/mpact/main/rawdata/PTY087I2/200826_PTY087I2codingdataset.csv", + meta_data = meta_data, + format = "Progenesis" +) + +get_peak_table(data2)[, 1:5] +``` + +We can run the `filter_mispicked_ions` filter, with default setting `copy_object = FALSE` (operates on reference semantics). + +```{r} +data2_mispicked <- filter_mispicked_ions(data2, + ringwin = 0.5, + isowin = 0.01, trwin = 0.005, + max_iso_shift = 3, merge_peaks = TRUE, + merge_method = "sum", + copy_object = FALSE +) + +get_peak_table(data2_mispicked)[, 1:5] +``` + +This results in `r nrow(get_peak_table(data2_mispicked))` ions in the feature table (above). Even though we created an object called `data2_mispicked`, the original `data2` object was also updated and now has `r nrow(get_peak_table(data2))` ions in the feature table: + +```{r} +get_peak_table(data2)[, 1:5] +``` + +We recommend using the default `copy_object = FALSE` as this makes for an extremely fast and memory-efficient way to chain mpactR filters together (see **Chaining filters together** section and [Reference Semantics](articles/reference_semantics.html)); however, if you would like to run the filters individually with traditional R style objects, you can set `copy_object` to `TRUE` as shown in the filter examples. + +## Filtering + +mpactR provides filters to correct for the following issues observed during preprocessing of tandem MS/MS data: + +- mispicked ions: isotopic patterns that are incorrectly split during preprocessing. +- solvent blank contamination: removal of features present in solvent blanks due to carryover between samples. +- background components: features whose abundance is greater than user-defined abundance threshold in a specific group of samples, for example media blanks. +- non-reproducible ions: those that are inconsistent between technical replicates. +- insource ions: fragment ions created during ionization before fragmentation in the tandem MS/MS workflow. + +#### Mispicked ions filter + +To check for mispicked ions, use mpactR function `filter_mispicked_ions()`. This function takes an `mpactr object` as input, and checks for similar ions with the arguments `ringwin`, `isowin`, `trwin` and `max_iso_shift`. + +Ions in the feature table are flagged as similar based on retention time and mass. Flagged ion groups are suggested to be the result of incorrect splitting of isotopic patterns during peak picking, detector saturation artifacts, or incorrect identification of multiply charged oligomers. + +```{r} +data_mispicked <- filter_mispicked_ions(data, + ringwin = 0.5, + isowin = 0.01, trwin = 0.005, + max_iso_shift = 3, merge_peaks = TRUE, + merge_method = "sum", + copy_object = TRUE +) +``` + +Each filter reports progress of filtering, here we can see that `r length(filter_summary(data_mispicked, "mispicked")$failed_ions) + length(filter_summary(data_mispicked, "mispicked")$passed_ions)` ions were present prior to checking for mispicked ions. `r length(filter_summary(data_mispicked, "mispicked")$failed_ions)` ions were found to be similar to another ion and following merging, `r (length(filter_summary(data_mispicked, "mispicked")$failed_ions) + length(filter_summary(data_mispicked, "mispicked")$passed_ions)) - length(filter_summary(data_mispicked, "mispicked")$failed_ions)` ions remain. + +If you are interested in the groups of similar ions flagged in this filter, you can use `get_similar_ions()`. This function returns a `data.table` reporting the main ion (the ion retained post-merging) and the ions similar to it. + +```{r} +head(get_similar_ions(data_mispicked)) +``` + +#### Remove ions that are above a threshold in one biological group + +Removing solvent blank impurities is important for correcting for between-sample carryover and contamination in experimental samples. You can identify and remove these ions with mpactR's `filter_group()` function. `filter_group()` identifies ions above a relative abundance threshold (`group_threshold`) in a specific group (`group_to_remove`). To remove solvent blank impurities, set `group_to_remove` to the `Biological_Group` in your metadata file which corresponds to your solvent blank samples, here "Blanks". + +```{r} +data_blank <- filter_group(data, + group_threshold = 0.01, + group_to_remove = "Blanks", remove_ions = TRUE, + copy_object = TRUE +) +``` + +In this example, `r length(filter_summary(data_blank, "group", "Blanks")$failed_ions) + length(filter_summary(data_blank, "group", "Blanks")$passed_ions)` ions were present prior to the group filter. `r length(filter_summary(data_blank, "group", "Blanks")$failed_ions)` ions were found to be above the relative abundance threshold of 0.01 in "Solvent_Blank" samples, leaving `r (length(filter_summary(data_blank, "group", "Blanks")$failed_ions) + length(filter_summary(data_blank, "group", "Blanks")$passed_ions)) - length(filter_summary(data_blank, "group", "Blanks")$failed_ions)` ions in the peak table. + +We can also use this filter to remove ions from other groups, such as media blanks. This can be useful for experiments on cell cultures. The example data contains samples belonging to the `Biological_Group` "Media". These samples are from media blanks, which are negative controls from the growth experiments conducted in this study. We can remove features whose abundance is greater than 1% of the largest group in media blank samples by specifying `group_to_remove` = "Media". We recommend removing media blank ions following all other filters so all high-quality ions are identified (see Chaining filters together below). + +```{r} +data_media_blank <- filter_group(data, + group_threshold = 0.01, + group_to_remove = "Media", remove_ions = TRUE, + copy_object = TRUE +) +``` + +#### Remove non-reproducible ions + +Ions whose abundance are not consisent between technical replicates (*i.e.*, non-reproducible) may not be reliable for analysis and therefore should be removed from the feature table. Non-reproducible ions are identified by mean or median coefficient of variation (CV) between technical replicates with `filter_cv()`. Note - this filter cannot be applied to data that does not contain technical replicates. + +```{r} +data_rep <- filter_cv(data, + cv_threshold = 0.2, cv_param = "median", + copy_object = TRUE +) +``` + +In our example dataset, `r length(filter_summary(data_rep, "replicability")$failed_ions)` ions were flagged as non-reproducible. These ions were removed, leaving `r (length(filter_summary(data_rep, "replicability")$failed_ions) + length(filter_summary(data_rep, "replicability")$passed_ions)) - length(filter_summary(data_rep, "replicability")$failed_ions)` ions in the feature table. + +If you would like to visualize how the CV threshold performed on your dataset, you can extract the CV calculated during `filer_cv()` using mpactR's `get_cv_data()` function, and calculate the percentage of features for plotting. You can look at both mean and median CV as shown in the example below, or you can filter the data by the parameter of choice. + +```{r} +cv <- get_cv_data(data_rep) %>% + pivot_longer(cols = c("mean_cv", "median_cv"), + names_to = "param", + values_to = "cv") %>% + nest(.by = param) %>% + mutate( + data = map(data, arrange, cv), + data = map(data, mutate, index = 0:(length(cv) - 1)), + data = map(data, mutate, index_scale = index * 100 / length(cv)) + ) + +head(cv) +``` + +The nested data are tibbles with the columns `r colnames(cv$data[[1]])`: + +```{r, echo=FALSE} +head(cv$data[[1]]) +``` + +There is one tibble for each parameter (either mean or median). We also want to calculate the percentage of features represented by the CV threshold. + +```{r} +cv_thresh_percent <- cv %>% + filter(param == "median_cv") %>% + unnest(cols = data) %>% + mutate(diff_cv_thresh = abs(cv - 0.2)) %>% + slice_min(diff_cv_thresh, n = 1) %>% + pull(index_scale) + +cv_thresh_percent +``` + + +Then we can plot percentage of features by CV: + +```{r} +cv %>% + unnest(cols = data) %>% + mutate(param = factor(param, levels = c("mean_cv", "median_cv"), + labels = c("mean", "median"))) %>% + ggplot() + + aes(x = cv, y = index_scale, group = param, color = param) + + geom_line(linewidth = 2) + + geom_vline(xintercept = 0.2, + color = "darkgrey", + linetype = "dashed", + linewidth = 1) + + geom_hline(yintercept = cv_thresh_percent, + color = "darkgrey", + linewidth = 1) + + labs(x = "CV", + y = "Percentage of Features", + param = "Statistic") + + theme_bw() + + theme( + axis.title = element_text(size = 20), + axis.text = element_text(size = 15), + legend.position = "inside", + legend.position.inside = c(.90, .08), + legend.title = element_blank(), + legend.text = element_text(size = 15) + ) +``` + +Here we can see that roughly `r round(cv_thresh_percent, 0)`% of features were below the CV threshold meaning `r 100 - round(cv_thresh_percent, 0)`% were removed at a CV threshold of 0.2. + +```{r, include=FALSE, eval=FALSE} +# additional statistics calculated in the original mpact +# program that are not needed to plot cv. This will not run +# as written, but included as notes +# cv <- cv %>% +# mutate( +# data = map(data, mutate, dist = cv - lag(cv)), +# data = map(data, mutate, auc = dist * index_scale) +# ) + +# # Replicability statistics +# mean_n <- get_meta_data(data_rep) %>% +# summarise(count = n_distinct(Injection) / n_distinct(Sample_Code)) %>% +# pull() + +# stdev <- c(1, 0, 0 * (as.integer(mean_n) - 1)) +# mod_stdev <- sd(stdev) / mean(stdev) + +# mean_ord_med <- t$data[[2]] %>% +# left_join(select(t$data[[1]], Compound, cv), by = "Compound") %>% +# rename( +# cv_median = cv.x, +# cv = cv.y +# ) %>% +# arrange(cv_median) %>% +# mutate(param = "mead_ord_med") %>% +# select(param, cv, everything()) + +# cv2 <- t %>% +# unnest(cols = data) %>% +# bind_rows(mean_ord_med) %>% +# nest(.by = param) + +# index_cv_mean <- function(x, df, sd) { +# mean(df$index_scale[which(abs(df$cv - x - sd / 200) < sd / 200)]) +# } + +# s <- data.frame(val = 1:(mod_stdev * 100)) %>% +# mutate( +# pos = val / 100, +# meanavg = map_dbl(pos, index_cv_mean, df = cv2$data[[1]], sd = mod_stdev), +# meanmed = map_dbl(pos, index_cv_mean, df = cv2$data[[3]], sd = mod_stdev), +# skew = abs(meanmed - meanavg), +# skew_sd = skew * mod_stdev / 100 +# ) + +# sumskew <- s %>% +# summarise(sum = sum(skew_sd, na.rm = TRUE)) %>% +# pull(sum) + +# auc_groups <- t %>% +# unnest(cols = data) %>% +# summarise(auc = sum(auc, na.rm = TRUE), .by = param) + +# filter(auc_groups, param == "median_cv") %>% pull(auc) + +# sumskew_n <- sumskew / ((filter(auc_groups, param == "median_cv") %>% pull(auc) + filter(auc_groups, param == "mean_cv") %>% pull(auc)) / 2) +# rep <- ((filter(auc_groups, param == "median_cv") %>% pull(auc) + filter(auc_groups, param == "mean_cv") %>% pull(auc)) / 2) / (mod_stdev * 100) +# qualscore <- (1 - sumskew_n) * rep * 100 + +# paste0( +# "Reproducibility: ", round(100 * rep, 1), "% ", +# "Skewness: ", round(100 * sumskew_n, 1), "% ", +# "Overall: ", round(qualscore, 1), "%" +#) +``` + +#### Remove insource fragment ions + +Some mass species can be fragmented during ionization in tandem MS/MS, creating insource ions. This can result in ions from one compound being represented more than once in the feature table. If you would like to remove insource ions fragments, you can do so with mpactR's `filter_insource_ions()`. `filter_insource_ions()` conducts ion deconvolution via retention time correlation matrices within MS1 scans. Highly correlated ion groups are determined by the `cluster_threshold` parameter and filtered to remove the low mass features. The highest mass feature is identified as the likely precursor ion and retained in the feature table. + +```{r} +data_insource <- filter_insource_ions(data, + cluster_threshold = 0.95, + copy_object = TRUE +) +``` + +`r length(filter_summary(data_insource, "insource")$failed_ions)` ions were identified and removed during deconvolution of this dataset, leaving `r (length(filter_summary(data_insource, "insource")$failed_ions) + length(filter_summary(data_insource, "insource")$passed_ions)) - length(filter_summary(data_insource, "insource")$failed_ions)` ions in the feature table. + +#### Chaining filters together + +Filters can be chained in a customizable workflow, shown below. While filters can be chained in any order, we recommend filtering mispicked ions, then solvent blanks, prior to filtering non-repoducible or insource ions. This will allow for incorrectly picked peaks to be merged and any contamination/carryover removed prior to identifying non-reproducible and insource fragment ions. Here we also demonstrate the removal of media blank components with the `filter_group()` function after identification of high-quality ions. + +```{r} +data <- import_data(peak_table = "https://raw.githubusercontent.com/BalunasLab/mpact/main/rawdata/PTY087I2/200826_PTY087I2codingdataset.csv", + meta_data = meta_data, + format = "Progenesis" +) + +data_filtered <- filter_mispicked_ions(data, merge_method = "sum") |> + filter_group(group_to_remove = "Blanks") |> + filter_cv(cv_threshold = 0.2, cv_param = "median") |> + filter_group(group_to_remove = "Media") +``` + +## Summary + +mpactR offers mutliple ways to view a summary of data filtering. + +#### View passing and failed ions for a single filter + +If you are interested in viewing the passing and failing ions for a single filter, use the `filter_summary()` function. You must specify which filter you are intested in, either "mispicked", "group", "replicability", or "insource". + +```{r} +mispicked_summary <- filter_summary(data_filtered, filter = "mispicked") +``` + +Failed ions: +```{r} +head(mispicked_summary$failed_ions, 100) +``` + +Passing ions: +```{r} +head(mispicked_summary$passed_ions, 100) +``` + +If you set `filter` to a filter name that you did not apply to your data, an error message will be returned. + +```{r, error=TRUE} +filter_summary(data_filtered, filter = "insource") +``` + +If you want to retrieve the filter summary for the group filter, you must also supply the group name with the `group` argument: + +```{r, eval = FALSE} +filter_summary(data_filtered, filter = "group", group = "Blanks") +``` + +`group` must always be supplied for this filter, even if only one group filter has been applied. + +#### View passing and failed ions for all input ions + +You can view the filtering status of all input ions with the `qc_summary()` function. A data.table reporting the compound id (`compounds`) and if it failed or passed filtering. If the ion failed filtering, its status will report the name of the filter it failed. + +```{r} +head(qc_summary(data_filtered)[order(compounds), ]) +``` + +#### Visualize filtering QC with tree map plot + +You can visualize filtering results with a tree map using the filtering summary obtained from `qc_summary()` and the packages `ggplot2` and `treemapify`. + +First, we need to determine the number of ions for each ion status in the summary table. You can report the count; however, we need to calculate the percent of ions in each group. We have done this in the code chunk below, where we have used `data.table` syntax as the `qc_summary()` returns a `data.table` object. If you are not familiar with the package data.table, check out their resources on [gitlab](https://rdatatable.gitlab.io/data.table/). + +```{r} +library(ggplot2) +library(treemapify) + +ion_counts <- qc_summary(data_filtered)[, .(count = .N), by = status][ + , percent := (count / sum(count) * 100) +] +``` + +Finally, we plot the treemap: + +```{r} +tm <- ggplot(ion_counts) + + aes(area = percent, fill = status) + + geom_treemap() + + geom_treemap_text(aes( + label = paste(status, count, paste0("(", round(percent, 2), "%)"), + sep = "\n"), + fontface = c("bold") + )) + +tm +``` + +This plot can be customized with ggplot2, for example we only want to display the percentage: + +```{r} +tm <- ggplot(ion_counts) + + aes(area = percent, fill = status) + + geom_treemap() + + geom_treemap_text(aes( + label = paste(status, paste0("(", round(percent, 2), "%)"), sep = "\n"), + fontface = c("bold") + )) + +tm +``` + +Or you no longer need the legend and maybe we want custom colors: + +```{r} +tm + + scale_fill_brewer(palette = "Greens") + + theme(legend.position = "none") +``` + +If you want a fast visualization of the treemap, you can pass the mpactr object to the function `plot_qc_tree()`. + +```{r} +plot_qc_tree(data_filtered) +``` From 250deed4a049623c4a2c3f438ffa2e4b2b6a51ed Mon Sep 17 00:00:00 2001 From: YoungKrug <31873199+YoungKrug@users.noreply.github.com> Date: Thu, 8 Aug 2024 13:21:50 -0400 Subject: [PATCH 2/5] Fix for examples, and fixing lintr issues. --- R/filters.R | 7 +- man/filter_cv.Rd | 7 +- man/filter_mispicked_ions.Rd | 1 - tests/testthat/test-filter_pactr-class.R | 220 ++++++++++++++++------- 4 files changed, 164 insertions(+), 71 deletions(-) diff --git a/R/filters.R b/R/filters.R index 81dd63a..887c51b 100644 --- a/R/filters.R +++ b/R/filters.R @@ -227,12 +227,14 @@ filter_group <- function(mpactr_object, #' #' data_filter <- filter_cv(data, #' cv_threshold = 0.01, -#' cv_param = "mean" +#' cv_param = "mean", +#' copy_object = TRUE #' ) #' #' data_filter <- filter_cv(data, #' cv_threshold = 0.01, -#' cv_param = "median" +#' cv_param = "median", +#' copy_object = TRUE #' ) #' filter_cv <- function(mpactr_object, @@ -259,7 +261,6 @@ filter_cv <- function(mpactr_object, return(mpactr_object) } - ########################### ### Insource ions filter ## ########################### diff --git a/man/filter_cv.Rd b/man/filter_cv.Rd index 9bedf7c..d8ba0aa 100644 --- a/man/filter_cv.Rd +++ b/man/filter_cv.Rd @@ -5,7 +5,6 @@ \title{Filter Non-reproducible ions} \usage{ filter_cv(mpactr_object, cv_threshold = NULL, cv_param, copy_object = FALSE) -filter_cv(mpactr_object, cv_threshold = NULL, cv_param, copy_object = FALSE) } \arguments{ \item{mpactr_object}{An \code{mpactr_object}. See \code{\link[=import_data]{import_data()}}.} @@ -51,12 +50,14 @@ data <- import_data(example("coculture_peak_table.csv"), data_filter <- filter_cv(data, cv_threshold = 0.01, - cv_param = "mean" + cv_param = "mean", + copy_object = TRUE ) data_filter <- filter_cv(data, cv_threshold = 0.01, - cv_param = "median" + cv_param = "median", + copy_object = TRUE ) } diff --git a/man/filter_mispicked_ions.Rd b/man/filter_mispicked_ions.Rd index 4cb12a8..82f3678 100644 --- a/man/filter_mispicked_ions.Rd +++ b/man/filter_mispicked_ions.Rd @@ -12,7 +12,6 @@ filter_mispicked_ions( max_iso_shift = 3, merge_peaks = TRUE, merge_method = "sum", - merge_method = "sum", copy_object = FALSE ) } diff --git a/tests/testthat/test-filter_pactr-class.R b/tests/testthat/test-filter_pactr-class.R index 7ec0d0a..2cd7154 100644 --- a/tests/testthat/test-filter_pactr-class.R +++ b/tests/testthat/test-filter_pactr-class.R @@ -341,109 +341,201 @@ test_that("is_filter_run correctly assesses if a filter has been run", { expect_false(filter_class$is_filter_run(filter = "insource")) }) -test_that("get_log returns an error when an incorrect fitler argument is provided", { - mpactr_class <- mpactr$new(test_path("exttestdata","102623_peaktable_coculture_simple.csv"), - test_path("exttestdata", "102623_metadata_correct.csv")) - mpactr_class$setup() - filter_class <- filter_pactr$new(mpactr_class) - - expect_error(filter_class$get_log(filter = "cv"), "`filter` must be one of mpactR's") -}) +test_that("get_log returns an error when +an incorrect fitler argument is provided", { + directory <- "exttestdata" + peak_table_name <- "102623_peaktable_coculture_simple.csv" + meta_data_name <- "102623_metadata_correct.csv" + meta <- data.table(read_csv(test_path(directory, + meta_data_name), + show_col_types = FALSE)) + pt_list <- progenesis_formatter(test_path(directory, + peak_table_name)) + mpactr_class <- mpactr$new( + pt_list, + meta + ) + mpactr_class$setup() + filter_class <- filter_pactr$new(mpactr_class) + expect_error(filter_class$get_log(filter = "cv"), + "`filter` must be one of mpactR's") + }) -test_that("get_log returns an error when the fitler argument provided has not yet been run (e.g., not in the log)", { - mpactr_class <- mpactr$new(test_path("exttestdata","102623_peaktable_coculture_simple.csv"), - test_path("exttestdata", "102623_metadata_correct.csv")) - mpactr_class$setup() - filter_class <- filter_pactr$new(mpactr_class) - - expect_error(filter_class$get_log(filter = "mispicked"), "`filter` mispicked has not yet been applied to the data") -}) +test_that("get_log returns an error when the fitler + argument provided has not yet been run (e.g., not in the log)", { + directory <- "exttestdata" + peak_table_name <- "102623_peaktable_coculture_simple.csv" + meta_data_name <- "102623_metadata_correct.csv" + meta <- data.table(read_csv(test_path(directory, + meta_data_name), + show_col_types = FALSE)) + pt_list <- progenesis_formatter(test_path(directory, + peak_table_name)) + mpactr_class <- mpactr$new( + pt_list, + meta + ) + mpactr_class$setup() + filter_class <- filter_pactr$new(mpactr_class) + err_msg <- "`filter` mispicked has not yet been applied to the data" + expect_error(filter_class$get_log(filter = "mispicked"), err_msg) + }) test_that("get_log returns the correct fitler summary list", { - mpactr_class <- mpactr$new(test_path("exttestdata","102623_peaktable_coculture_simple.csv"), - test_path("exttestdata", "102623_metadata_correct.csv")) + directory <- "exttestdata" + peak_table_name <- "102623_peaktable_coculture_simple.csv" + meta_data_name <- "102623_metadata_correct.csv" + meta <- data.table(read_csv(test_path(directory, + meta_data_name), + show_col_types = FALSE)) + pt_list <- progenesis_formatter(test_path(directory, + peak_table_name)) + mpactr_class <- mpactr$new( + pt_list, + meta + ) mpactr_class$setup() filter_class <- filter_pactr$new(mpactr_class) - filter_class$check_mismatched_peaks(ringwin = 0.5, isowin = 0.01, trwin = 0.005, max_iso_shift = 3, merge_peaks = - TRUE, merge_method = "sum") - + filter_class$check_mismatched_peaks(ringwin = 0.5, + isowin = 0.01, + trwin = 0.005, + max_iso_shift = 3, + merge_peaks = TRUE, + merge_method = "sum") mispicked_summary <- filter_class$get_log(filter = "mispicked") - expect_type(mispicked_summary, "list") expect_equal(length(mispicked_summary), 2) expect_equal(length(mispicked_summary$passed_ions), 1233) }) -test_that("get_mispicked_ions returns error if check_mismatched_peaks has not been called", { - mpactr_class <- mpactr$new(test_path("exttestdata","102623_peaktable_coculture_simple.csv"), - test_path("exttestdata", "102623_metadata_correct.csv")) - mpactr_class$setup() - filter_class <- filter_pactr$new(mpactr_class) - - expect_error(filter_class$get_mispicked_ions(), "The mispicked filter has not yet been") -}) +test_that("get_mispicked_ions returns error if +check_mismatched_peaks has not been called", { + directory <- "exttestdata" + peak_table_name <- "102623_peaktable_coculture_simple.csv" + meta_data_name <- "102623_metadata_correct.csv" + meta <- data.table(read_csv(test_path(directory, + meta_data_name), + show_col_types = FALSE)) + pt_list <- progenesis_formatter(test_path(directory, + peak_table_name)) + mpactr_class <- mpactr$new( + pt_list, + meta + ) + mpactr_class$setup() + filter_class <- filter_pactr$new(mpactr_class) -test_that("get_mispicked_ions correctly returns the check_mismatched_peaks list", { - mpactr_class <- mpactr$new(test_path("exttestdata","102623_peaktable_coculture_simple.csv"), - test_path("exttestdata", "102623_metadata_correct.csv")) - mpactr_class$setup() - filter_class <- filter_pactr$new(mpactr_class) - filter_class$check_mismatched_peaks(ringwin = 0.5, isowin = 0.01, trwin = 0.005, max_iso_shift = 3, merge_peaks = - TRUE, merge_method = "sum") - - mispicked_groups <- filter_class$get_mispicked_ions() - - expect_equal(class(mispicked_groups), c("data.table", "data.frame")) - expect_equal(length(mispicked_groups), 2) - expect_equal(names(mispicked_groups), c("main_ion", "similar_ions")) -}) + expect_error(filter_class$get_mispicked_ions(), + "The mispicked filter has not yet been") + }) + +test_that("get_mispicked_ions correctly +returns the check_mismatched_peaks list", { + directory <- "exttestdata" + peak_table_name <- "102623_peaktable_coculture_simple.csv" + meta_data_name <- "102623_metadata_correct.csv" + meta <- data.table(read_csv(test_path(directory, + meta_data_name), + show_col_types = FALSE)) + pt_list <- progenesis_formatter(test_path(directory, + peak_table_name)) + mpactr_class <- mpactr$new( + pt_list, + meta + ) + mpactr_class$setup() + filter_class <- filter_pactr$new(mpactr_class) + filter_class$check_mismatched_peaks(ringwin = 0.5, + isowin = 0.01, + trwin = 0.005, + max_iso_shift = 3, + merge_peaks = TRUE, + merge_method = "sum") + + mispicked_groups <- filter_class$get_mispicked_ions() + + expect_equal(class(mispicked_groups), c("data.table", "data.frame")) + expect_equal(length(mispicked_groups), 2) + expect_equal(names(mispicked_groups), c("main_ion", "similar_ions")) + }) test_that("get_group_averages calculates a group table", { - mpactr_class <- mpactr$new(test_path("exttestdata","102623_peaktable_coculture_simple.csv"), - test_path("exttestdata", "102623_metadata_correct.csv")) + directory <- "exttestdata" + peak_table_name <- "102623_peaktable_coculture_simple.csv" + meta_data_name <- "102623_metadata_correct.csv" + meta <- data.table(read_csv(test_path(directory, + meta_data_name), + show_col_types = FALSE)) + pt_list <- progenesis_formatter(test_path(directory, + peak_table_name)) + mpactr_class <- mpactr$new( + pt_list, + meta + ) mpactr_class$setup() filter_class <- filter_pactr$new(mpactr_class) - - filter_class$check_mismatched_peaks(ringwin = 0.5, isowin = 0.01, trwin = 0.005, max_iso_shift = 3, merge_peaks = - TRUE, merge_method = "sum") - + filter_class$check_mismatched_peaks(ringwin = 0.5, + isowin = 0.01, + trwin = 0.005, + max_iso_shift = 3, + merge_peaks = TRUE, + merge_method = "sum") avgs <- filter_class$get_group_averages() expect_equal(class(avgs), c("data.table", "data.frame")) expect_equal(nrow(avgs), (1233 * 6)) - - mpactr_class <- mpactr$new(test_path("exttestdata","102623_peaktable_coculture_simple.csv"), - test_path("exttestdata", "102623_metadata_correct.csv")) + + mpactr_class <- mpactr$new( + pt_list, + meta + ) mpactr_class$setup() filter_class <- filter_pactr$new(mpactr_class) - - filter_class$check_mismatched_peaks(ringwin = 0.5, isowin = 0.01, trwin = 0.005, max_iso_shift = 3, merge_peaks = - TRUE, merge_method = "sum") + + filter_class$check_mismatched_peaks(ringwin = 0.5, + isowin = 0.01, + trwin = 0.005, + max_iso_shift = 3, + merge_peaks = TRUE, + merge_method = "sum") filter_class$filter_blank() filter_class$parse_ions_by_group(group_threshold = 0.01) filter_class$apply_group_filter("Blanks", remove_ions = TRUE) - + avgs <- filter_class$get_group_averages() expect_equal(class(avgs), c("data.table", "data.frame")) expect_equal(nrow(avgs), (484 * 6)) }) test_that("get_cv returns the cv filter has been applied", { - mpactr_class <- mpactr$new(test_path("exttestdata","102623_peaktable_coculture_simple.csv"), - test_path("exttestdata", "102623_metadata_correct.csv")) + directory <- "exttestdata" + peak_table_name <- "102623_peaktable_coculture_simple.csv" + meta_data_name <- "102623_metadata_correct.csv" + meta <- data.table(read_csv(test_path(directory, + meta_data_name), + show_col_types = FALSE)) + pt_list <- progenesis_formatter(test_path(directory, + peak_table_name)) + mpactr_class <- mpactr$new( + pt_list, + meta + ) mpactr_class$setup() filter_class <- filter_pactr$new(mpactr_class) - filter_class$check_mismatched_peaks(ringwin = 0.5, isowin = 0.01, trwin = 0.005, max_iso_shift = 3, merge_peaks = - TRUE, merge_method = "sum") + filter_class$check_mismatched_peaks(ringwin = 0.5, + isowin = 0.01, + trwin = 0.005, + max_iso_shift = 3, + merge_peaks = TRUE, + merge_method = "sum") filter_class$filter_blank() filter_class$parse_ions_by_group(group_threshold = 0.01) filter_class$apply_group_filter("Blanks", remove_ions = TRUE) - # filter_class$get_cv() expect_error(filter_class$get_cv(), "The cv filter has not yet") - + filter_class$cv_filter(cv_threshold = 0.2, cv_params = c("mean")) - + cv <- filter_class$get_cv() expect_equal(class(cv), c("data.table", "data.frame")) }) \ No newline at end of file From 5e62aa2c98e59d3b0667a8cb81ee99c7326a6feb Mon Sep 17 00:00:00 2001 From: amason30 Date: Thu, 8 Aug 2024 14:18:16 -0400 Subject: [PATCH 3/5] Fixed failed test issues. --- R/mpactr-methods.R | 1 + tests/testthat/test-filter_pactr-methods.R | 19 ++++++++++--------- tests/testthat/test-filters.R | 2 +- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/R/mpactr-methods.R b/R/mpactr-methods.R index 4245915..3cbb6ff 100644 --- a/R/mpactr-methods.R +++ b/R/mpactr-methods.R @@ -6,6 +6,7 @@ mpactr$set("private", "initialize_data", function() { private$peak_table[, .SD, .SDcols = private$meta_data$Injection] ) > 0), ] private$set_kmd() + private$peak_table$Compound <- as.character(private$peak_table$Compound) }) mpactr$set("private", "set_kmd", function() { private$peak_table[, kmd := mz - floor(mz)] diff --git a/tests/testthat/test-filter_pactr-methods.R b/tests/testthat/test-filter_pactr-methods.R index ee80dea..02148c2 100644 --- a/tests/testthat/test-filter_pactr-methods.R +++ b/tests/testthat/test-filter_pactr-methods.R @@ -29,7 +29,7 @@ properly with filter_pactr-class data", { "cut_ions.csv"), col_names = c("V1"), show_col_types = FALSE) - expected_cut_ions <- as.integer(expected_cut_ions$V1) + expected_cut_ions <- as.character(expected_cut_ions$V1) logger_index_name <- "check_mismatched_peaks" expect_equal(filter_class$logger[[logger_index_name]][["cut_ions"]], expected_cut_ions) @@ -105,10 +105,11 @@ test_that("blank filter works correctly", { grp_avg <- "102623_peaktable_coculture_simple_groupaverages.csv" test_path(directory, grp_avg) - error_prop <- read_csv(test_path(directory, grp_avg), + error_prop <- as.data.table(read_csv(test_path(directory, grp_avg), show_col_types = FALSE, skip = 1, col_names = c("Compound", "mz", "rt", "biologicalGroup", "average") - ) + ))[, Compound := as.character(Compound)] + setorder(error_prop, Compound) logger_index_name <- "group_filter-group_stats" expect_true(all(filter_class$logger[[logger_index_name]]$Biological_Group %in% error_prop$biologicalGroup)) @@ -164,17 +165,17 @@ test_that("parse_ions_by_group flags the correct ions", { expect_false(all(sapply(group_filter_list, is.null))) expect_true(all(group_filter_list$`ANG18 monoculture` - == as.character(ang_18$V1))) + %in% as.character(ang_18$V1))) expect_true(all(group_filter_list$`ANGDT monoculture` - == as.character(angdt$V1))) + %in% as.character(angdt$V1))) expect_true(all(group_filter_list$`Blanks` - == as.character(blanks$V1))) + %in% as.character(blanks$V1))) expect_true(all(group_filter_list$`Coculture` - == as.character(coculture$V1))) + %in% as.character(coculture$V1))) expect_true(all(group_filter_list$`JC28 monoculture` - == as.character(jc28$V1))) + %in% as.character(jc28$V1))) expect_true(all(group_filter_list$`JC28 monoculture` - == as.character(jc28$V1))) + %in% as.character(jc28$V1))) }) test_that("apply_group_filter removes the correct ions", { diff --git a/tests/testthat/test-filters.R b/tests/testthat/test-filters.R index f873d67..c3b2066 100644 --- a/tests/testthat/test-filters.R +++ b/tests/testthat/test-filters.R @@ -22,7 +22,7 @@ test_that("filter mismatch ions wrapper works "cut_ions.csv"), col_names = c("V1"), show_col_types = FALSE) - expected_cut_ions <- as.integer(expected_cut_ions$V1) + expected_cut_ions <- as.character(expected_cut_ions$V1) expect_equal(nrow(data_mpactr$mpactr_data$get_peak_table()), nrow((data_mpactr$mpactr_data$get_peak_table()))) From c24138103dd5c76044d348f51b656d0340fb1d20 Mon Sep 17 00:00:00 2001 From: amason30 Date: Thu, 8 Aug 2024 15:06:41 -0400 Subject: [PATCH 4/5] Fixes to cli issue and lintr problems --- .Rbuildignore | 1 + .lintr | 8 ++++ R/filter_pactr-methods.R | 45 ++++++++++++---------- R/peak_table_formatter.R | 1 - R/summary-class.R | 10 +++-- tests/testthat/test-filter_pactr-class.R | 2 +- vignettes/articles/downstream_analyses.Rmd | 1 - vignettes/articles/filter.Rmd | 1 + vignettes/articles/filter2.Rmd | 1 + vignettes/articles/reference_semantics.Rmd | 1 + 10 files changed, 43 insertions(+), 28 deletions(-) create mode 100644 .lintr diff --git a/.Rbuildignore b/.Rbuildignore index b0cf256..9ea81bf 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -13,3 +13,4 @@ ^docs$ ^pkgdown$ ^\.github$ +^\.lintr$ diff --git a/.lintr b/.lintr new file mode 100644 index 0000000..62a0b8f --- /dev/null +++ b/.lintr @@ -0,0 +1,8 @@ +linters: linters_with_defaults() # see vignette("lintr") +encoding: "UTF-8" +exclusions: list( + "vignettes/articles/filter.Rmd", + "vignettes/articles/filter2.Rmd", + "vignettes/articles/downstream_analyses.Rmd", + "vignettes/articles/reference_semantics.Rmd" + ) diff --git a/R/filter_pactr-methods.R b/R/filter_pactr-methods.R index 8801c14..3b47ce2 100644 --- a/R/filter_pactr-methods.R +++ b/R/filter_pactr-methods.R @@ -7,8 +7,8 @@ filter_pactr$set( max_iso_shift, merge_peaks, merge_method = NULL) { - cli::cli_alert_info("Checking {nrow(self$mpactr_data$get_peak_table())} - peaks for mispicked peaks.") + l <- nrow(self$mpactr_data$get_peak_table()) + cli::cli_alert_info("Checking {l} peaks for mispicked peaks.") ion_filter_list <- list() cut_ions <- c() # list @@ -49,13 +49,14 @@ filter_pactr$set( self$logger[["check_mismatched_peaks"]] <- ion_filter_list if (isTRUE(merge_peaks)) { - cli::cli_alert_info("Argument merge_peaks is: {merge_peaks}. - Merging mispicked peaks with method {merge_method}.") + cli::cli_alert_info(c("Argument merge_peaks is: {merge_peaks}. ", + "Merging mispicked peaks with method ", + "{merge_method}.")) private$merge_ions(ion_filter_list, merge_method) } else { - cli::cli_alert_warning("Argument merge_peaks is: {merge_peaks}. - Mispicked peaks will not be merged.") + cli::cli_alert_warning(c("Argument merge_peaks is: {merge_peaks}. ", + "Mispicked peaks will not be merged.")) } self$logger$list_of_summaries$mispicked <- summary$new( @@ -111,8 +112,8 @@ filter_pactr$set("private", "get_merged_ions", function(ringwin, filter_pactr$set("private", "merge_ions", function(ion_filter_list, method) { if (is.null(method)) { - cli::cli_abort("No method has been supplied for merging peaks. - method must be one of: sum") + cli::cli_abort(c("No method has been supplied for merging peaks. ", + "{.var method} must be one of: sum")) } if (method == "sum") { @@ -222,21 +223,22 @@ filter_pactr$set( function(group, remove_ions = TRUE) { groups <- unique(self$mpactr_data$get_meta_data()$Biological_Group) if (isFALSE(group %in% groups)) { - cli::cli_abort("{.var group} {group} is not in {.var Biological_Group}. - Options are: {groups}") + cli::cli_abort(c("{.var group} {group} is not in ", + "{.var Biological_Group}.", + "Options are: {groups}")) } - cli::cli_alert_info("Parsing {nrow(self$mpactr_data$get_peak_table())} - peaks based on the following sample group: {group}.") + l <- nrow(self$mpactr_data$get_peak_table()) + cli::cli_alert_info("Parsing {l} peaks based on the sample group: {group}.") if (isFALSE(remove_ions)) { - cli::cli_alert_warning("Argument remove_ions is {remove_ions}. - Peaks from {group} will not be removed.") + cli::cli_alert_warning(c("Argument remove_ions is {remove_ions}. ", + "Peaks from {group} will not be removed.")) return() } - cli::cli_alert_info("Argument remove_ions is: {remove_ions}. - Removing peaks from {group}.") + cli::cli_alert_info(c("Argument remove_ions is: {remove_ions}.", + "Removing peaks from {group}.")) ions <- self$logger[["group_filter-failing_list"]][[group]] self$mpactr_data$set_peak_table(self$mpactr_data$get_peak_table()[ @@ -269,14 +271,15 @@ filter_pactr$set( ## abort if there are no technical replicates. if (isFALSE(self$mpactr_data$isMultipleTechReps())) { - cli::cli_abort("There are no technical replicates in the dataset - provided. In order to run the replicability filter, - technical replicates are required.") + cli_abort(c("There are no technical replicates in the dataset provided. ", + "In order to run the replicability filter, technical ", + "replicates are required.")) } input_ions <- self$mpactr_data$get_peak_table()$Compound - cli::cli_alert_info("Parsing {length(input_ions)} peaks for - replicability across technical replicates.") + cli <- cli::cli_alert_info + n <- length(input_ions) + cli("Parsing {n} peaks for replicability across technical replicates.") cv <- data.table::melt(self$mpactr_data$get_peak_table(), id.vars = c("Compound", "mz", "rt", "kmd"), variable.name = diff --git a/R/peak_table_formatter.R b/R/peak_table_formatter.R index 3c61e0a..8b7fb9d 100644 --- a/R/peak_table_formatter.R +++ b/R/peak_table_formatter.R @@ -29,7 +29,6 @@ format_by_type <- function(peak_table_path, } # default condition = NULL } - progenesis_formatter <- function(peak_table) { peak_table <- data.table(readr::read_csv(peak_table, skip = 2, diff --git a/R/summary-class.R b/R/summary-class.R index b53a163..f0a668d 100644 --- a/R/summary-class.R +++ b/R/summary-class.R @@ -10,10 +10,12 @@ summary <- R6::R6Class("summary", private$passed_ions <- passed_ions }, summarize = function(x) { - cli::cli_alert_success("{length(private$failed_ions)} ions failed the - {private$filter} filter, - {length(private$passed_ions)} - ions remain.") + l <- length(private$failed_ions) + f <- private$filter + r <- length(private$passed_ions) + + cli::cli_alert_success(c("{l} ions failed the {f} filter, ", + "{r} ions remain.")) }, get_failed_ions = function() { return(private$failed_ions) diff --git a/tests/testthat/test-filter_pactr-class.R b/tests/testthat/test-filter_pactr-class.R index 2cd7154..8483b17 100644 --- a/tests/testthat/test-filter_pactr-class.R +++ b/tests/testthat/test-filter_pactr-class.R @@ -538,4 +538,4 @@ test_that("get_cv returns the cv filter has been applied", { cv <- filter_class$get_cv() expect_equal(class(cv), c("data.table", "data.frame")) -}) \ No newline at end of file +}) diff --git a/vignettes/articles/downstream_analyses.Rmd b/vignettes/articles/downstream_analyses.Rmd index 683af2b..4de1a67 100644 --- a/vignettes/articles/downstream_analyses.Rmd +++ b/vignettes/articles/downstream_analyses.Rmd @@ -867,4 +867,3 @@ fc2 %>% axis.text = element_text(size = 15) ) ``` - diff --git a/vignettes/articles/filter.Rmd b/vignettes/articles/filter.Rmd index 6520f5c..f07c883 100644 --- a/vignettes/articles/filter.Rmd +++ b/vignettes/articles/filter.Rmd @@ -1,3 +1,4 @@ + --- title: "Filter" --- diff --git a/vignettes/articles/filter2.Rmd b/vignettes/articles/filter2.Rmd index 5afcb5c..dabdf4e 100644 --- a/vignettes/articles/filter2.Rmd +++ b/vignettes/articles/filter2.Rmd @@ -1,3 +1,4 @@ + --- title: "Filter with mpact original data" --- diff --git a/vignettes/articles/reference_semantics.Rmd b/vignettes/articles/reference_semantics.Rmd index 931d269..33714cd 100644 --- a/vignettes/articles/reference_semantics.Rmd +++ b/vignettes/articles/reference_semantics.Rmd @@ -1,3 +1,4 @@ + --- title: "Reference Semantics" --- From 44b65f69244f8dcfc96a2258c180f2471bbcc476 Mon Sep 17 00:00:00 2001 From: amason30 Date: Thu, 8 Aug 2024 15:28:36 -0400 Subject: [PATCH 5/5] lintr and pkgdown errors fixed --- R/peak_table_formatter.R | 9 ++++----- vignettes/articles/downstream_analyses.Rmd | 3 +++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/R/peak_table_formatter.R b/R/peak_table_formatter.R index 8b7fb9d..d579138 100644 --- a/R/peak_table_formatter.R +++ b/R/peak_table_formatter.R @@ -63,10 +63,6 @@ mz_mine_formatter <- function(peak_table) { metaboscape_formatter <- function(peak_table, sample_names) { peak_table <- data.table(readr::read_csv(peak_table, show_col_types = FALSE)) peak_table_convert <- data.table::copy(peak_table) - adduct_data <- utils::read.csv(system.file("extdata/ion_masses", - "DefinedIons.csv", - package = "mpactR" - )) peak_table_convert <- with(peak_table_convert, peak_table_convert[ , ion := gsub( ".*\\[(.+)\\].*", "\\1", @@ -81,7 +77,10 @@ metaboscape_formatter <- function(peak_table, sample_names) { ][ charge_string == "3+", charge := 3 ][ - adduct_data, + utils::read.csv(system.file("extdata/ion_masses", + "DefinedIons.csv", + package = "mpactR" + )), on = .(ion = IONS) ][ , mz := (PEPMASS / charge) + MASS diff --git a/vignettes/articles/downstream_analyses.Rmd b/vignettes/articles/downstream_analyses.Rmd index 4de1a67..0d82f4f 100644 --- a/vignettes/articles/downstream_analyses.Rmd +++ b/vignettes/articles/downstream_analyses.Rmd @@ -87,6 +87,7 @@ We can join these two `data.table`s for plotting data set: ```{r} get_raw_data(data_filtered) %>% + mutate(Compound = as.character(Compound)) %>% select(Compound, mz, rt) %>% left_join(qc_summary(data_filtered), by = join_by("Compound" == "compounds") @@ -98,6 +99,7 @@ Now we can create a scatter plot to show the input features (m/z ~ retention tim ```{r} get_raw_data(data_filtered) %>% + mutate(Compound = as.character(Compound)) %>% select(Compound, mz, rt) %>% left_join(qc_summary(data_filtered), by = join_by("Compound" == "compounds") @@ -119,6 +121,7 @@ We can also make the plot interactive with the `plotly` package function `ggplot ```{r} feature_plot <- get_raw_data(data_filtered) %>% + mutate(Compound = as.character(Compound)) %>% select(Compound, mz, rt) %>% left_join(qc_summary(data_filtered), by = join_by("Compound" == "compounds")