From 35747e92db3277d245ee744503c2a6ecb81c5f9b Mon Sep 17 00:00:00 2001 From: devonjkohler Date: Fri, 4 Apr 2025 08:53:35 -0400 Subject: [PATCH 01/20] added weights to groupComparison --- R/groupComparison.R | 31 +++++++++++++----------- R/utils_groupcomparison.R | 51 ++++++++++++++++++++++----------------- 2 files changed, 46 insertions(+), 36 deletions(-) diff --git a/R/groupComparison.R b/R/groupComparison.R index 1d9c7d29..891d5f7e 100644 --- a/R/groupComparison.R +++ b/R/groupComparison.R @@ -78,7 +78,7 @@ #' # table for result #' testResultOneComparison$ComparisonResult #' -groupComparison = function(contrast.matrix, data, +groupComparison = function(contrast.matrix, data, weights=NULL, save_fitted_models = TRUE, log_base = 2, use_log_file = TRUE, append = FALSE, verbose = TRUE, log_file_path = NULL, @@ -94,12 +94,13 @@ groupComparison = function(contrast.matrix, data, samples_info = getSamplesInfo(data) groups = unique(data$ProteinLevelData$GROUP) contrast_matrix = MSstatsContrastMatrix(contrast.matrix, groups) - getOption("MSstatsLog")("INFO", - "== Start to test and get inference in whole plot") - getOption("MSstatsMsg")("INFO", - " == Start to test and get inference in whole plot ...") + getOption("MSstatsLog")( + "INFO", "== Start to test and get inference in whole plot") + getOption("MSstatsMsg")( + "INFO", " == Start to test and get inference in whole plot ...") testing_results = MSstatsGroupComparison(split_summarized, contrast_matrix, - save_fitted_models, repeated, samples_info, + weights, save_fitted_models, + repeated, samples_info, numberOfCores) getOption("MSstatsLog")("INFO", "== Comparisons for all proteins are done.") @@ -170,17 +171,18 @@ MSstatsPrepareForGroupComparison = function(summarization_output) { #' group_comparison[[1]][[2]] # comparison result #' group_comparison[[2]][[3]] # NULL, because we set save_fitted_models to FALSE #' -MSstatsGroupComparison = function(summarized_list, contrast_matrix, +MSstatsGroupComparison = function(summarized_list, contrast_matrix, weights, save_fitted_models, repeated, samples_info, numberOfCores = 1) { if (numberOfCores > 1) { - return(.groupComparisonWithMultipleCores(summarized_list, contrast_matrix, + return(.groupComparisonWithMultipleCores(summarized_list, + contrast_matrix, weights, save_fitted_models, repeated, samples_info, numberOfCores)) } else { return(.groupComparisonWithSingleCore(summarized_list, contrast_matrix, - save_fitted_models, repeated, - samples_info)) + weights, save_fitted_models, + repeated, samples_info)) } } @@ -271,16 +273,17 @@ MSstatsGroupComparisonOutput = function(input, summarization_output, log_base = #' single_output # same as a single element of MSstatsGroupComparison output #' MSstatsGroupComparisonSingleProtein = function(single_protein, contrast_matrix, - repeated, groups, samples_info, - save_fitted_models, + weights, repeated, groups, + samples_info, save_fitted_models, has_imputed) { single_protein = .prepareSingleProteinForGC(single_protein) is_single_subject = .checkSingleSubject(single_protein) has_tech_reps = .checkTechReplicate(single_protein) fitted_model = try(.fitModelSingleProtein(single_protein, contrast_matrix, - has_tech_reps, is_single_subject, - repeated, groups, samples_info, + weights, has_tech_reps, + is_single_subject, repeated, + groups, samples_info, save_fitted_models, has_imputed), silent = TRUE) if (inherits(fitted_model, "try-error")) { diff --git a/R/utils_groupcomparison.R b/R/utils_groupcomparison.R index cf008fcd..5276c74a 100644 --- a/R/utils_groupcomparison.R +++ b/R/utils_groupcomparison.R @@ -86,9 +86,9 @@ getSamplesInfo = function(summarization_output) { #' @param has_imputed if TRUE, missing values have been imputed by dataProcess #' @importFrom stats resid fitted #' @keywords internal -.fitModelSingleProtein = function(input, contrast_matrix, has_tech_replicates, - is_single_subject, repeated, groups, - samples_info, +.fitModelSingleProtein = function(input, contrast_matrix, weights, + has_tech_replicates, is_single_subject, + repeated, groups, samples_info, save_fitted_models, has_imputed) { GROUP = SUBJECT = NULL @@ -104,7 +104,7 @@ getSamplesInfo = function(summarization_output) { fitted_values = NA fit = NULL } else { - fitted_model = .fitModelForGroupComparison(input, repeated, + fitted_model = .fitModelForGroupComparison(input, weights, repeated, is_single_subject, has_tech_replicates) result = .getAllComparisons(input, fitted_model, contrast_matrix, @@ -136,19 +136,21 @@ getSamplesInfo = function(summarization_output) { #' Choose a model type (fixed/mixed effects) and fit it for a single protein #' @inheritParams .fitModelSingleProtein #' @keywords internal -.fitModelForGroupComparison = function(input, repeated, is_single_subject, - has_tech_replicates) { +.fitModelForGroupComparison = function(input, weights, repeated, + is_single_subject, has_tech_replicates) { if (!repeated) { if (!has_tech_replicates | is_single_subject) { - full_fit = lm(ABUNDANCE ~ GROUP, data = input) + full_fit = lm(ABUNDANCE ~ GROUP, data = input, weights=weights) df_full = full_fit[["df.residual"]] } else { full_fit = suppressMessages(try( - lme4::lmer(ABUNDANCE ~ GROUP + (1|SUBJECT), data = input), + lme4::lmer(ABUNDANCE ~ GROUP + (1|SUBJECT), + data = input, weights=weights), TRUE )) df_full = suppressMessages(try( - lm(ABUNDANCE ~ GROUP + SUBJECT, data = input)$df.residual, + lm(ABUNDANCE ~ GROUP + SUBJECT, + data = input, weights=weights)$df.residual, TRUE )) } @@ -156,27 +158,30 @@ getSamplesInfo = function(summarization_output) { ## time-course if (is_single_subject) { full_fit = lm(ABUNDANCE ~ GROUP, - data = input) + data = input, weights=weights) df_full = full_fit$df.residual } else { ## no single subject if (!has_tech_replicates) { full_fit = suppressMessages(try( - lme4::lmer(ABUNDANCE ~ GROUP + (1|SUBJECT), data = input), + lme4::lmer(ABUNDANCE ~ GROUP + (1|SUBJECT), + data = input, weights=weights), TRUE )) df_full = suppressMessages(try( - lm(ABUNDANCE ~ GROUP + SUBJECT, data = input)$df.residual, + lm(ABUNDANCE ~ GROUP + SUBJECT, + data = input, weights=weights)$df.residual, TRUE)) } else { full_fit = suppressMessages(try( - lme4::lmer(ABUNDANCE ~ GROUP + (1|SUBJECT) + (1|GROUP:SUBJECT), - data = input), + lme4::lmer( + ABUNDANCE ~ GROUP + (1|SUBJECT) + (1|GROUP:SUBJECT), + data = input, weights=weights), TRUE )) df_full = suppressMessages(try( lm(ABUNDANCE ~ GROUP + SUBJECT + GROUP:SUBJECT, - data = input)$df.residual, + data = input, weights=weights)$df.residual, TRUE )) } @@ -451,7 +456,8 @@ getSamplesInfo = function(summarization_output) { #' @importFrom parallel makeCluster clusterExport parLapply stopCluster #' @keywords internal .groupComparisonWithMultipleCores = function(summarized_list, contrast_matrix, - save_fitted_models, repeated, samples_info, + weights, save_fitted_models, + repeated, samples_info, numberOfCores) { groups = colnames(contrast_matrix) has_imputed = attr(summarized_list, "has_imputed") @@ -459,8 +465,9 @@ getSamplesInfo = function(summarization_output) { function_environment = environment() cl = parallel::makeCluster(numberOfCores) parallel::clusterExport(cl, c("MSstatsGroupComparisonSingleProtein", - "contrast_matrix", "repeated", "groups", - "samples_info", "save_fitted_models", "has_imputed"), + "contrast_matrix", "weights", "repeated", + "groups", "samples_info", + "save_fitted_models", "has_imputed"), envir = function_environment) cat(paste0("Number of proteins to process: ", length(all_proteins_id)), sep = "\n", file = "MSstats_groupComparison_log_progress.log") @@ -470,7 +477,7 @@ getSamplesInfo = function(summarization_output) { sep = "\n", file = "MSstats_groupComparison_log_progress.log", append = TRUE) } MSstatsGroupComparisonSingleProtein( - summarized_list[[i]], contrast_matrix, repeated, + summarized_list[[i]], contrast_matrix, weights, repeated, groups, samples_info, save_fitted_models, has_imputed ) }) @@ -487,8 +494,8 @@ getSamplesInfo = function(summarization_output) { #' @importFrom utils txtProgressBar setTxtProgressBar #' @keywords internal .groupComparisonWithSingleCore = function(summarized_list, contrast_matrix, - save_fitted_models, repeated, - samples_info) { + weights, save_fitted_models, + repeated, samples_info) { groups = colnames(contrast_matrix) has_imputed = attr(summarized_list, "has_imputed") all_proteins_id = seq_along(summarized_list) @@ -496,7 +503,7 @@ getSamplesInfo = function(summarization_output) { pb = txtProgressBar(max = length(all_proteins_id), style = 3) for (i in all_proteins_id) { comparison_outputs = MSstatsGroupComparisonSingleProtein( - summarized_list[[i]], contrast_matrix, repeated, + summarized_list[[i]], contrast_matrix, weights, repeated, groups, samples_info, save_fitted_models, has_imputed ) test_results[[i]] = comparison_outputs From c0a8ac64cb3f1c5584e75a1137c091a02dd163e5 Mon Sep 17 00:00:00 2001 From: devonjkohler Date: Sun, 13 Apr 2025 10:59:43 -0400 Subject: [PATCH 02/20] adding anomaly scores to dataProcess --- R/dataProcess.R | 2 +- R/utils_checks.R | 26 ++++++++++++++++++-------- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/R/dataProcess.R b/R/dataProcess.R index dab82401..68874ff3 100755 --- a/R/dataProcess.R +++ b/R/dataProcess.R @@ -412,7 +412,7 @@ MSstatsSummarizeSingleLinear = function(single_protein, equal_variances = TRUE) is_single_feature = .checkSingleFeature(single_protein) fit = try(.fitLinearModel(single_protein, is_single_feature, is_labeled = label, - equal_variances), silent = TRUE) + equal_variances, weights), silent = TRUE) if (inherits(fit, "try-error")) { msg = paste("*** error : can't fit the model for ", unique(single_protein$PROTEIN)) diff --git a/R/utils_checks.R b/R/utils_checks.R index adfe09bb..5cbe939f 100644 --- a/R/utils_checks.R +++ b/R/utils_checks.R @@ -140,13 +140,20 @@ MSstatsPrepareForDataProcess = function(input, log_base, fix_missing) { #' @importFrom data.table uniqueN as.data.table #' @keywords internal .checkUnProcessedDataValidity = function(input, fix_missing, fill_incomplete) { + input = data.table::as.data.table(unclass(input)) + + if (!"AnomalyScores" %in% colnames(input)){ + input$AnomalyScores = NA + } + cols = c("ProteinName", "PeptideSequence", "PeptideModifiedSequence", "PrecursorCharge", "FragmentIon", "ProductCharge", - "IsotopeLabelType", "Condition", "BioReplicate", "Run", "Intensity") + "IsotopeLabelType", "Condition", "BioReplicate", "Run", + "Intensity", "AnomalyScores") provided_cols = intersect(cols, colnames(input)) - if (length(provided_cols) < 10) { + if (length(provided_cols) < 11) { msg = paste("Missing columns in the input:", paste(setdiff(cols, colnames(input)), collapse = " ")) getOption("MSstatsLog")("ERROR", msg) @@ -178,13 +185,16 @@ MSstatsPrepareForDataProcess = function(input, log_base, fix_missing) { colnames(input)) input = input[, cols, with = FALSE] - input$PEPTIDE = paste(input$PEPTIDESEQUENCE, input$PRECURSORCHARGE, sep = "_") - input$TRANSITION = paste(input$FRAGMENTION, input$PRODUCTCHARGE, sep = "_") + input$PEPTIDE = paste(input$PEPTIDESEQUENCE, + input$PRECURSORCHARGE, sep = "_") + input$TRANSITION = paste(input$FRAGMENTION, + input$PRODUCTCHARGE, sep = "_") if (data.table::uniqueN(input$ISOTOPELABELTYPE) > 2) { - getOption("MSstatsLog")("ERROR", - paste("There are more than two levels of labeling.", - "So far, only label-free or reference-labeled experiment are supported. - stop")) + getOption("MSstatsLog")( + "ERROR", paste( + "There are more than two levels of labeling.", + "So far, only label-free or reference-labeled experiment are supported. - stop")) stop("Statistical tools in MSstats are only proper for label-free or with reference peptide experiments.") } @@ -272,7 +282,7 @@ setMethod(".checkDataValidity", "MSstatsValidated", .prepareForDataProcess) cols = c("PROTEIN", "PEPTIDE", "TRANSITION", "FEATURE", "LABEL", "GROUP_ORIGINAL", "SUBJECT_ORIGINAL", "RUN", "GROUP", - "SUBJECT", "FRACTION", "INTENSITY") + "SUBJECT", "FRACTION", "INTENSITY", "ANOMALYSCORES") if ("TECHREPLICATE" %in% colnames(input)) { cols = unique(c(cols, "TECHREPLICATE")) } From 89c031ed2dacde6ff07d334c7146e5a035397f78 Mon Sep 17 00:00:00 2001 From: devonjkohler Date: Sun, 13 Apr 2025 14:42:36 -0400 Subject: [PATCH 03/20] anomaly weighted summarization --- NAMESPACE | 1 + R/dataProcess.R | 49 +++++++++++++++++--- R/test_script.R | 15 +++++++ R/utils_anomaly_weights.R | 50 +++++++++++++++++++++ R/utils_censored.R | 2 +- R/utils_summarization.R | 11 ++++- R/utils_summarization_prepare.R | 10 ++--- man/MSstatsGroupComparison.Rd | 1 + man/MSstatsGroupComparisonSingleProtein.Rd | 1 + man/MSstatsSummarizeSingleLinear.Rd | 8 +++- man/dot-fitModelForGroupComparison.Rd | 1 + man/dot-fitModelSingleProtein.Rd | 1 + man/dot-groupComparisonWithMultipleCores.Rd | 1 + man/dot-groupComparisonWithSingleCore.Rd | 1 + man/groupComparison.Rd | 1 + 15 files changed, 139 insertions(+), 14 deletions(-) create mode 100644 R/test_script.R create mode 100644 R/utils_anomaly_weights.R diff --git a/NAMESPACE b/NAMESPACE index 4bc5cc3a..3fe088d2 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -100,6 +100,7 @@ importFrom(parallel,clusterExport) importFrom(parallel,makeCluster) importFrom(parallel,parLapply) importFrom(parallel,stopCluster) +importFrom(pcaMethods,pca) importFrom(plotly,add_trace) importFrom(plotly,ggplotly) importFrom(plotly,layout) diff --git a/R/dataProcess.R b/R/dataProcess.R index 68874ff3..86c49aac 100755 --- a/R/dataProcess.R +++ b/R/dataProcess.R @@ -350,8 +350,9 @@ MSstatsSummarize = function(proteins_list, method, impute, censored_symbol, pb = utils::txtProgressBar(min = 0, max = num_proteins, style = 3) for (protein_id in seq_len(num_proteins)) { single_protein = proteins_list[[protein_id]] - summarized_result = MSstatsSummarizeSingleLinear(single_protein, - equal_variance) + summarized_result = MSstatsSummarizeSingleLinear( + single_protein, impute, censored_symbol, remove50missing, + equal_variance) summarized_results[[protein_id]] = summarized_result setTxtProgressBar(pb, protein_id) } @@ -378,6 +379,7 @@ MSstatsSummarize = function(proteins_list, method, impute, censored_symbol, #' @return list with protein-level data #' #' @importFrom stats xtabs +#' @importFrom pcaMethods pca #' #' @export #' @@ -398,9 +400,45 @@ MSstatsSummarize = function(proteins_list, method, impute, censored_symbol, #' single_protein_summary = MSstatsSummarizeSingleLinear(input_split[[1]]) #' head(single_protein_summary[[1]]) #' -MSstatsSummarizeSingleLinear = function(single_protein, equal_variances = TRUE) { +MSstatsSummarizeSingleLinear = function(single_protein, + impute, + censored_symbol, + remove50missing, + equal_variances = TRUE) { ABUNDANCE = RUN = FEATURE = PROTEIN = LogIntensities = NULL + cols = intersect(colnames(single_protein), c("newABUNDANCE", "cen", "RUN", + "FEATURE", "ref")) + single_protein = single_protein[(n_obs > 1 & !is.na(n_obs)) & + (n_obs_run > 0 & !is.na(n_obs_run))] + if (nrow(single_protein) == 0) { + return(list(NULL, NULL)) + } + single_protein[, RUN := factor(RUN)] + single_protein[, FEATURE := factor(FEATURE)] + if (impute & any(single_protein[["censored"]])) { + survival_fit = .fitSurvival(single_protein[LABEL == "L", cols, + with = FALSE]) + sigma2 = survival_fit$scale^2 + single_protein[, c("predicted", "imputation_var") := { + pred = predict(survival_fit, newdata = .SD, se.fit = TRUE) + list(pred$fit, pred$se.fit^2 + sigma2) + }] + single_protein[, predicted := ifelse(censored & (LABEL == "L"), predicted, NA)] + single_protein[, newABUNDANCE := ifelse(censored & LABEL == "L", + predicted, newABUNDANCE)] + survival = single_protein[, c(cols, "predicted"), with = FALSE] + } else { + survival = single_protein[, cols, with = FALSE] + survival[, predicted := NA] + } + + if (all(!is.na(single_protein$ANOMALYSCORES))){ + single_protein = .calculate_weights(single_protein) + } else { + single_protein$weights = NA + } + label = data.table::uniqueN(single_protein$LABEL) > 1 single_protein = single_protein[!is.na(ABUNDANCE)] single_protein[, RUN := factor(RUN)] @@ -411,8 +449,9 @@ MSstatsSummarizeSingleLinear = function(single_protein, equal_variances = TRUE) counts = as.matrix(counts) is_single_feature = .checkSingleFeature(single_protein) - fit = try(.fitLinearModel(single_protein, is_single_feature, is_labeled = label, - equal_variances, weights), silent = TRUE) + # fit = try(, silent = TRUE) + fit = .fitLinearModel(single_protein, is_single_feature, is_labeled = label, + equal_variances) if (inherits(fit, "try-error")) { msg = paste("*** error : can't fit the model for ", unique(single_protein$PROTEIN)) diff --git a/R/test_script.R b/R/test_script.R new file mode 100644 index 00000000..201f912b --- /dev/null +++ b/R/test_script.R @@ -0,0 +1,15 @@ + +library(MSstats) +library(data.table) + +msstats_input = fread("C:\\Users\\Devon Kohler\\Downloads\\test_input.csv") + +msstats_input$Intensity = ifelse(msstats_input$Intensity < 2, NA, msstats_input$Intensity) + +summarized = dataProcess(msstats_input[msstats_input$ProteinName =="Q9BW30", ], + normalization=FALSE, + featureSubset = "topN", + n_top_feature = 100, + MBimpute=TRUE, + summaryMethod="linear", + numberOfCores = 1) \ No newline at end of file diff --git a/R/utils_anomaly_weights.R b/R/utils_anomaly_weights.R new file mode 100644 index 00000000..267059f8 --- /dev/null +++ b/R/utils_anomaly_weights.R @@ -0,0 +1,50 @@ + +.normalize_anomalies = function(data){ + # Step 1: Find the global minimum anomaly score + global_min = min(data$ANOMALYSCORES ) + + # Step 2: Compute the local minimum for each PSM and adjust scores + data[, local_min := min(ANOMALYSCORES ), by = PEPTIDE] + data[, normalized_score := ANOMALYSCORES - local_min + global_min] + + return(data) +} + +.calculate_weights = function(data){ + + data = .normalize_anomalies(data) + + w_stats = data[, .( + shapiro_W = tryCatch( + shapiro.test(unlist(normalized_score))$statistic, + error = function(e) NA_real_ + ) + ), by = PEPTIDE] + + # Join the W stats back into the original data.table + data = merge(data, w_stats, by = "PEPTIDE", all.x = TRUE) + + anomaly_scores = data$normalized_score + imputation_var = data$imputation_var + norm_score = data$shapiro_W + + if (sum(is.na(imputation_var)) == length(imputation_var)){ + weights = anomaly_scores + } else { + resPPCA = pca(matrix( + c(anomaly_scores, imputation_var, norm_score), ncol=3), + method="ppca", center=TRUE, nPcs=1, + seed=1, maxIterations = 10000) + weights = (abs(resPPCA@loadings[[1]]) * anomaly_scores + ) + + (abs(resPPCA@loadings[[3]]) * norm_score + ) + ifelse( + is.na(imputation_var), 0, + abs(resPPCA@loadings[[2]]) * imputation_var) + } + + data$raw_weights = weights + data$weights = 1 / weights + + return(data) +} \ No newline at end of file diff --git a/R/utils_censored.R b/R/utils_censored.R index bbe4c617..557479a3 100644 --- a/R/utils_censored.R +++ b/R/utils_censored.R @@ -29,7 +29,7 @@ MSstatsHandleMissing = function(input, summary_method, impute, missing_symbol, censored_cutoff) { INTENSITY = LABEL = ABUNDANCE = censored = NULL - if ((summary_method == "TMP" & impute) & !is.null(missing_symbol)) { + if (impute & !is.null(missing_symbol)) { input$censored = FALSE ## if intensity = 1, but abundance > cutoff after normalization, it also should be censored. if (!is.null(censored_cutoff)) { diff --git a/R/utils_summarization.R b/R/utils_summarization.R index ec7433d9..b8880b88 100644 --- a/R/utils_summarization.R +++ b/R/utils_summarization.R @@ -156,11 +156,18 @@ #' @keywords internal .fitLinearModel = function(input, is_single_feature, is_labeled, equal_variances) { + if (all(!is.na(input$weights))){ + weights = input$weights + } else { + weights = NULL + } + if (!is_labeled) { if (is_single_feature) { - linear_model = lm(ABUNDANCE ~ RUN , data = input) + linear_model = lm(ABUNDANCE ~ RUN , data = input, weights=weights) } else { - linear_model = lm(ABUNDANCE ~ FEATURE + RUN , data = input) + linear_model = lm(ABUNDANCE ~ FEATURE + RUN , + data = input, weights=weights) } } else { if (is_single_feature) { diff --git a/R/utils_summarization_prepare.R b/R/utils_summarization_prepare.R index 251055da..a6cfc5b3 100644 --- a/R/utils_summarization_prepare.R +++ b/R/utils_summarization_prepare.R @@ -103,11 +103,11 @@ getProcessed = function(input) { #' @return data.table #' @keywords internal .prepareSummary = function(input, method, impute, censored_symbol) { - if (method == "TMP") { - input = .prepareTMP(input, impute, censored_symbol) - } else { - input = .prepareLinear(input, FALSE, censored_symbol) - } + # if (method == "TMP") { + input = .prepareTMP(input, impute, censored_symbol) + # } else { + # input = .prepareLinear(input, FALSE, censored_symbol) + # } input } diff --git a/man/MSstatsGroupComparison.Rd b/man/MSstatsGroupComparison.Rd index 0b48d455..9e372472 100644 --- a/man/MSstatsGroupComparison.Rd +++ b/man/MSstatsGroupComparison.Rd @@ -7,6 +7,7 @@ MSstatsGroupComparison( summarized_list, contrast_matrix, + weights, save_fitted_models, repeated, samples_info, diff --git a/man/MSstatsGroupComparisonSingleProtein.Rd b/man/MSstatsGroupComparisonSingleProtein.Rd index fa279ba1..473a28a9 100644 --- a/man/MSstatsGroupComparisonSingleProtein.Rd +++ b/man/MSstatsGroupComparisonSingleProtein.Rd @@ -7,6 +7,7 @@ MSstatsGroupComparisonSingleProtein( single_protein, contrast_matrix, + weights, repeated, groups, samples_info, diff --git a/man/MSstatsSummarizeSingleLinear.Rd b/man/MSstatsSummarizeSingleLinear.Rd index 82dde780..cdf3cb69 100644 --- a/man/MSstatsSummarizeSingleLinear.Rd +++ b/man/MSstatsSummarizeSingleLinear.Rd @@ -4,7 +4,13 @@ \alias{MSstatsSummarizeSingleLinear} \title{Linear model-based summarization for a single protein} \usage{ -MSstatsSummarizeSingleLinear(single_protein, equal_variances = TRUE) +MSstatsSummarizeSingleLinear( + single_protein, + impute, + censored_symbol, + remove50missing, + equal_variances = TRUE +) } \arguments{ \item{single_protein}{feature-level data for a single protein} diff --git a/man/dot-fitModelForGroupComparison.Rd b/man/dot-fitModelForGroupComparison.Rd index e6cc7d9f..fc077940 100644 --- a/man/dot-fitModelForGroupComparison.Rd +++ b/man/dot-fitModelForGroupComparison.Rd @@ -6,6 +6,7 @@ \usage{ .fitModelForGroupComparison( input, + weights, repeated, is_single_subject, has_tech_replicates diff --git a/man/dot-fitModelSingleProtein.Rd b/man/dot-fitModelSingleProtein.Rd index e5e19945..0883dc3f 100644 --- a/man/dot-fitModelSingleProtein.Rd +++ b/man/dot-fitModelSingleProtein.Rd @@ -7,6 +7,7 @@ .fitModelSingleProtein( input, contrast_matrix, + weights, has_tech_replicates, is_single_subject, repeated, diff --git a/man/dot-groupComparisonWithMultipleCores.Rd b/man/dot-groupComparisonWithMultipleCores.Rd index f4ae9ea1..1753c074 100644 --- a/man/dot-groupComparisonWithMultipleCores.Rd +++ b/man/dot-groupComparisonWithMultipleCores.Rd @@ -7,6 +7,7 @@ .groupComparisonWithMultipleCores( summarized_list, contrast_matrix, + weights, save_fitted_models, repeated, samples_info, diff --git a/man/dot-groupComparisonWithSingleCore.Rd b/man/dot-groupComparisonWithSingleCore.Rd index f3d15e71..481722db 100644 --- a/man/dot-groupComparisonWithSingleCore.Rd +++ b/man/dot-groupComparisonWithSingleCore.Rd @@ -7,6 +7,7 @@ .groupComparisonWithSingleCore( summarized_list, contrast_matrix, + weights, save_fitted_models, repeated, samples_info diff --git a/man/groupComparison.Rd b/man/groupComparison.Rd index a0ffdfb1..d41e06e6 100644 --- a/man/groupComparison.Rd +++ b/man/groupComparison.Rd @@ -7,6 +7,7 @@ groupComparison( contrast.matrix, data, + weights = NULL, save_fitted_models = TRUE, log_base = 2, use_log_file = TRUE, From 1574fd93fbdb4b38e224b564e55fa1389d7da479 Mon Sep 17 00:00:00 2001 From: devonjkohler Date: Tue, 15 Apr 2025 12:23:30 -0400 Subject: [PATCH 04/20] anomaly scores into summaraization and model --- R/RcppExports.R | 4 ++-- R/dataProcess.R | 17 +++++++++++------ R/groupComparison.R | 14 +++++++------- R/test_script.R | 15 --------------- R/utils_anomaly_weights.R | 20 ++++++++++++-------- R/utils_groupcomparison.R | 37 ++++++++++++++++++++++--------------- R/utils_output.R | 10 +++++----- R/utils_summarization.R | 11 ++++++----- src/RcppExports.cpp | 9 +++++---- src/linear_summary.cpp | 21 ++++++++++++++++++--- 10 files changed, 88 insertions(+), 70 deletions(-) delete mode 100644 R/test_script.R diff --git a/R/RcppExports.R b/R/RcppExports.R index 507c5628..65f2c4fb 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -9,8 +9,8 @@ make_contrast_run_quant <- function(input, coefs, contrast_matrix, counts, is_la .Call(`_MSstats_make_contrast_run_quant`, input, coefs, contrast_matrix, counts, is_labeled, is_reference) } -get_linear_summary <- function(input, coefs, counts, is_labeled) { - .Call(`_MSstats_get_linear_summary`, input, coefs, counts, is_labeled) +get_linear_summary <- function(input, coefs, counts, is_labeled, cov_mat) { + .Call(`_MSstats_get_linear_summary`, input, coefs, counts, is_labeled, cov_mat) } median_polish_summary <- function(x, eps = 0.01, maxiter = 10L) { diff --git a/R/dataProcess.R b/R/dataProcess.R index 86c49aac..8cdc41f7 100755 --- a/R/dataProcess.R +++ b/R/dataProcess.R @@ -424,7 +424,8 @@ MSstatsSummarizeSingleLinear = function(single_protein, pred = predict(survival_fit, newdata = .SD, se.fit = TRUE) list(pred$fit, pred$se.fit^2 + sigma2) }] - single_protein[, predicted := ifelse(censored & (LABEL == "L"), predicted, NA)] + single_protein[, predicted := ifelse(censored & (LABEL == "L"), + predicted, NA)] single_protein[, newABUNDANCE := ifelse(censored & LABEL == "L", predicted, newABUNDANCE)] survival = single_protein[, c(cols, "predicted"), with = FALSE] @@ -440,7 +441,7 @@ MSstatsSummarizeSingleLinear = function(single_protein, } label = data.table::uniqueN(single_protein$LABEL) > 1 - single_protein = single_protein[!is.na(ABUNDANCE)] + single_protein = single_protein[!is.na(newABUNDANCE)] single_protein[, RUN := factor(RUN)] single_protein[, FEATURE := factor(FEATURE)] @@ -460,12 +461,16 @@ MSstatsSummarizeSingleLinear = function(single_protein, result = NULL } else { cf = summary(fit)$coefficients[, 1] + cov_mat = vcov(fit) + result = unique(single_protein[, list(Protein = PROTEIN, RUN = RUN)]) - log_intensities = get_linear_summary(single_protein, cf, - counts, label) - result[, LogIntensities := log_intensities] + extracted_values = get_linear_summary(single_protein, cf, + counts, label, cov_mat) + # extracted_values = get_run_estimates_simple(fit, single_protein, counts) + + result = cbind(result, extracted_values) } - list(result) + list(result, survival) } diff --git a/R/groupComparison.R b/R/groupComparison.R index 891d5f7e..39a80a26 100644 --- a/R/groupComparison.R +++ b/R/groupComparison.R @@ -78,7 +78,7 @@ #' # table for result #' testResultOneComparison$ComparisonResult #' -groupComparison = function(contrast.matrix, data, weights=NULL, +groupComparison = function(contrast.matrix, data, save_fitted_models = TRUE, log_base = 2, use_log_file = TRUE, append = FALSE, verbose = TRUE, log_file_path = NULL, @@ -99,7 +99,7 @@ groupComparison = function(contrast.matrix, data, weights=NULL, getOption("MSstatsMsg")( "INFO", " == Start to test and get inference in whole plot ...") testing_results = MSstatsGroupComparison(split_summarized, contrast_matrix, - weights, save_fitted_models, + save_fitted_models, repeated, samples_info, numberOfCores) getOption("MSstatsLog")("INFO", @@ -171,17 +171,17 @@ MSstatsPrepareForGroupComparison = function(summarization_output) { #' group_comparison[[1]][[2]] # comparison result #' group_comparison[[2]][[3]] # NULL, because we set save_fitted_models to FALSE #' -MSstatsGroupComparison = function(summarized_list, contrast_matrix, weights, +MSstatsGroupComparison = function(summarized_list, contrast_matrix, save_fitted_models, repeated, samples_info, numberOfCores = 1) { if (numberOfCores > 1) { return(.groupComparisonWithMultipleCores(summarized_list, - contrast_matrix, weights, + contrast_matrix, save_fitted_models, repeated, samples_info, numberOfCores)) } else { return(.groupComparisonWithSingleCore(summarized_list, contrast_matrix, - weights, save_fitted_models, + save_fitted_models, repeated, samples_info)) } } @@ -273,7 +273,7 @@ MSstatsGroupComparisonOutput = function(input, summarization_output, log_base = #' single_output # same as a single element of MSstatsGroupComparison output #' MSstatsGroupComparisonSingleProtein = function(single_protein, contrast_matrix, - weights, repeated, groups, + repeated, groups, samples_info, save_fitted_models, has_imputed) { single_protein = .prepareSingleProteinForGC(single_protein) @@ -281,7 +281,7 @@ MSstatsGroupComparisonSingleProtein = function(single_protein, contrast_matrix, has_tech_reps = .checkTechReplicate(single_protein) fitted_model = try(.fitModelSingleProtein(single_protein, contrast_matrix, - weights, has_tech_reps, + has_tech_reps, is_single_subject, repeated, groups, samples_info, save_fitted_models, has_imputed), diff --git a/R/test_script.R b/R/test_script.R deleted file mode 100644 index 201f912b..00000000 --- a/R/test_script.R +++ /dev/null @@ -1,15 +0,0 @@ - -library(MSstats) -library(data.table) - -msstats_input = fread("C:\\Users\\Devon Kohler\\Downloads\\test_input.csv") - -msstats_input$Intensity = ifelse(msstats_input$Intensity < 2, NA, msstats_input$Intensity) - -summarized = dataProcess(msstats_input[msstats_input$ProteinName =="Q9BW30", ], - normalization=FALSE, - featureSubset = "topN", - n_top_feature = 100, - MBimpute=TRUE, - summaryMethod="linear", - numberOfCores = 1) \ No newline at end of file diff --git a/R/utils_anomaly_weights.R b/R/utils_anomaly_weights.R index 267059f8..f02c55b7 100644 --- a/R/utils_anomaly_weights.R +++ b/R/utils_anomaly_weights.R @@ -1,10 +1,10 @@ .normalize_anomalies = function(data){ # Step 1: Find the global minimum anomaly score - global_min = min(data$ANOMALYSCORES ) + global_min = min(data$ANOMALYSCORES) # Step 2: Compute the local minimum for each PSM and adjust scores - data[, local_min := min(ANOMALYSCORES ), by = PEPTIDE] + data[, local_min := min(ANOMALYSCORES), by = PEPTIDE] data[, normalized_score := ANOMALYSCORES - local_min + global_min] return(data) @@ -25,22 +25,26 @@ data = merge(data, w_stats, by = "PEPTIDE", all.x = TRUE) anomaly_scores = data$normalized_score - imputation_var = data$imputation_var + imputation_var = ifelse(data$censored, data$imputation_var, NA) norm_score = data$shapiro_W if (sum(is.na(imputation_var)) == length(imputation_var)){ weights = anomaly_scores + # resPPCA = pca(matrix( + # c(anomaly_scores, norm_score), ncol=2), + # method="ppca", center=TRUE, nPcs=1, + # seed=1, maxIterations = 10000) + # weights = (abs(resPPCA@loadings[[1]]) * anomaly_scores + # ) + (abs(resPPCA@loadings[[2]]) * norm_score) } else { resPPCA = pca(matrix( - c(anomaly_scores, imputation_var, norm_score), ncol=3), + c(anomaly_scores, imputation_var), ncol=2), #, norm_score method="ppca", center=TRUE, nPcs=1, seed=1, maxIterations = 10000) weights = (abs(resPPCA@loadings[[1]]) * anomaly_scores - ) + - (abs(resPPCA@loadings[[3]]) * norm_score - ) + ifelse( + ) + ifelse( is.na(imputation_var), 0, - abs(resPPCA@loadings[[2]]) * imputation_var) + abs(resPPCA@loadings[[2]]) * imputation_var)# + (abs(resPPCA@loadings[[3]]) * norm_score } data$raw_weights = weights diff --git a/R/utils_groupcomparison.R b/R/utils_groupcomparison.R index 5276c74a..62a9dde5 100644 --- a/R/utils_groupcomparison.R +++ b/R/utils_groupcomparison.R @@ -86,7 +86,7 @@ getSamplesInfo = function(summarization_output) { #' @param has_imputed if TRUE, missing values have been imputed by dataProcess #' @importFrom stats resid fitted #' @keywords internal -.fitModelSingleProtein = function(input, contrast_matrix, weights, +.fitModelSingleProtein = function(input, contrast_matrix, has_tech_replicates, is_single_subject, repeated, groups, samples_info, save_fitted_models, has_imputed) { @@ -104,7 +104,7 @@ getSamplesInfo = function(summarization_output) { fitted_values = NA fit = NULL } else { - fitted_model = .fitModelForGroupComparison(input, weights, repeated, + fitted_model = .fitModelForGroupComparison(input, repeated, is_single_subject, has_tech_replicates) result = .getAllComparisons(input, fitted_model, contrast_matrix, @@ -136,21 +136,28 @@ getSamplesInfo = function(summarization_output) { #' Choose a model type (fixed/mixed effects) and fit it for a single protein #' @inheritParams .fitModelSingleProtein #' @keywords internal -.fitModelForGroupComparison = function(input, weights, repeated, +.fitModelForGroupComparison = function(input, repeated, is_single_subject, has_tech_replicates) { + + if ("Variance" %in% colnames(input) & !all(is.na(input$Variance))){ + weight_input = 1/input$Variance + } else { + weight_input = NULL + } + if (!repeated) { if (!has_tech_replicates | is_single_subject) { - full_fit = lm(ABUNDANCE ~ GROUP, data = input, weights=weights) + full_fit = lm(ABUNDANCE ~ GROUP, data = input, weights=weight_input) df_full = full_fit[["df.residual"]] } else { full_fit = suppressMessages(try( lme4::lmer(ABUNDANCE ~ GROUP + (1|SUBJECT), - data = input, weights=weights), + data = input, weights=weight_input), TRUE )) df_full = suppressMessages(try( lm(ABUNDANCE ~ GROUP + SUBJECT, - data = input, weights=weights)$df.residual, + data = input, weights=weight_input)$df.residual, TRUE )) } @@ -158,14 +165,14 @@ getSamplesInfo = function(summarization_output) { ## time-course if (is_single_subject) { full_fit = lm(ABUNDANCE ~ GROUP, - data = input, weights=weights) + data = input, weights=weight_input) df_full = full_fit$df.residual } else { ## no single subject if (!has_tech_replicates) { full_fit = suppressMessages(try( lme4::lmer(ABUNDANCE ~ GROUP + (1|SUBJECT), - data = input, weights=weights), + data = input, weights=weight_input), TRUE )) df_full = suppressMessages(try( @@ -176,12 +183,12 @@ getSamplesInfo = function(summarization_output) { full_fit = suppressMessages(try( lme4::lmer( ABUNDANCE ~ GROUP + (1|SUBJECT) + (1|GROUP:SUBJECT), - data = input, weights=weights), + data = input, weights=weight_input), TRUE )) df_full = suppressMessages(try( lm(ABUNDANCE ~ GROUP + SUBJECT + GROUP:SUBJECT, - data = input, weights=weights)$df.residual, + data = input, weights=weight_input)$df.residual, TRUE )) } @@ -456,7 +463,7 @@ getSamplesInfo = function(summarization_output) { #' @importFrom parallel makeCluster clusterExport parLapply stopCluster #' @keywords internal .groupComparisonWithMultipleCores = function(summarized_list, contrast_matrix, - weights, save_fitted_models, + save_fitted_models, repeated, samples_info, numberOfCores) { groups = colnames(contrast_matrix) @@ -465,7 +472,7 @@ getSamplesInfo = function(summarization_output) { function_environment = environment() cl = parallel::makeCluster(numberOfCores) parallel::clusterExport(cl, c("MSstatsGroupComparisonSingleProtein", - "contrast_matrix", "weights", "repeated", + "contrast_matrix", "repeated", "groups", "samples_info", "save_fitted_models", "has_imputed"), envir = function_environment) @@ -477,7 +484,7 @@ getSamplesInfo = function(summarization_output) { sep = "\n", file = "MSstats_groupComparison_log_progress.log", append = TRUE) } MSstatsGroupComparisonSingleProtein( - summarized_list[[i]], contrast_matrix, weights, repeated, + summarized_list[[i]], contrast_matrix, repeated, groups, samples_info, save_fitted_models, has_imputed ) }) @@ -494,7 +501,7 @@ getSamplesInfo = function(summarization_output) { #' @importFrom utils txtProgressBar setTxtProgressBar #' @keywords internal .groupComparisonWithSingleCore = function(summarized_list, contrast_matrix, - weights, save_fitted_models, + save_fitted_models, repeated, samples_info) { groups = colnames(contrast_matrix) has_imputed = attr(summarized_list, "has_imputed") @@ -503,7 +510,7 @@ getSamplesInfo = function(summarization_output) { pb = txtProgressBar(max = length(all_proteins_id), style = 3) for (i in all_proteins_id) { comparison_outputs = MSstatsGroupComparisonSingleProtein( - summarized_list[[i]], contrast_matrix, weights, repeated, + summarized_list[[i]], contrast_matrix, repeated, groups, samples_info, save_fitted_models, has_imputed ) test_results[[i]] = comparison_outputs diff --git a/R/utils_output.R b/R/utils_output.R index 521fdfb1..2a185155 100644 --- a/R/utils_output.R +++ b/R/utils_output.R @@ -110,11 +110,11 @@ MSstatsSummarizationOutput = function(input, summarized, processed, #' @param censored_symbol censored missing value indicator #' @keywords internal .finalizeInput = function(input, summarized, method, impute, censored_symbol) { - if (method == "TMP") { - input = .finalizeTMP(input, censored_symbol, impute, summarized) - } else { - input = .finalizeLinear(input, censored_symbol) - } + # if (method == "TMP") { + input = .finalizeTMP(input, censored_symbol, impute, summarized) + # } else { + # input = .finalizeLinear(input, censored_symbol) + # } input } diff --git a/R/utils_summarization.R b/R/utils_summarization.R index b8880b88..006bf470 100644 --- a/R/utils_summarization.R +++ b/R/utils_summarization.R @@ -157,17 +157,18 @@ .fitLinearModel = function(input, is_single_feature, is_labeled, equal_variances) { if (all(!is.na(input$weights))){ - weights = input$weights + weight_input = input$weights } else { - weights = NULL + weight_input = NULL } if (!is_labeled) { if (is_single_feature) { - linear_model = lm(ABUNDANCE ~ RUN , data = input, weights=weights) + linear_model = lm(newABUNDANCE ~ RUN , data = input, + weights=weight_input) } else { - linear_model = lm(ABUNDANCE ~ FEATURE + RUN , - data = input, weights=weights) + linear_model = lm(newABUNDANCE ~ FEATURE + RUN, + data = input, weights=weight_input) } } else { if (is_single_feature) { diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 0fd08826..5275017b 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -40,8 +40,8 @@ BEGIN_RCPP END_RCPP } // get_linear_summary -NumericVector get_linear_summary(const DataFrame& input, const NumericVector& coefs, const NumericMatrix& counts, const bool is_labeled); -RcppExport SEXP _MSstats_get_linear_summary(SEXP inputSEXP, SEXP coefsSEXP, SEXP countsSEXP, SEXP is_labeledSEXP) { +DataFrame get_linear_summary(const DataFrame& input, const NumericVector& coefs, const NumericMatrix& counts, const bool is_labeled, const NumericMatrix& cov_mat); +RcppExport SEXP _MSstats_get_linear_summary(SEXP inputSEXP, SEXP coefsSEXP, SEXP countsSEXP, SEXP is_labeledSEXP, SEXP cov_matSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -49,7 +49,8 @@ BEGIN_RCPP Rcpp::traits::input_parameter< const NumericVector& >::type coefs(coefsSEXP); Rcpp::traits::input_parameter< const NumericMatrix& >::type counts(countsSEXP); Rcpp::traits::input_parameter< const bool >::type is_labeled(is_labeledSEXP); - rcpp_result_gen = Rcpp::wrap(get_linear_summary(input, coefs, counts, is_labeled)); + Rcpp::traits::input_parameter< const NumericMatrix& >::type cov_mat(cov_matSEXP); + rcpp_result_gen = Rcpp::wrap(get_linear_summary(input, coefs, counts, is_labeled, cov_mat)); return rcpp_result_gen; END_RCPP } @@ -70,7 +71,7 @@ END_RCPP static const R_CallMethodDef CallEntries[] = { {"_MSstats_get_estimable_fixed_random", (DL_FUNC) &_MSstats_get_estimable_fixed_random, 2}, {"_MSstats_make_contrast_run_quant", (DL_FUNC) &_MSstats_make_contrast_run_quant, 6}, - {"_MSstats_get_linear_summary", (DL_FUNC) &_MSstats_get_linear_summary, 4}, + {"_MSstats_get_linear_summary", (DL_FUNC) &_MSstats_get_linear_summary, 5}, {"_MSstats_median_polish_summary", (DL_FUNC) &_MSstats_median_polish_summary, 3}, {NULL, NULL, 0} }; diff --git a/src/linear_summary.cpp b/src/linear_summary.cpp index 6b49ba23..4eab9aec 100644 --- a/src/linear_summary.cpp +++ b/src/linear_summary.cpp @@ -103,6 +103,13 @@ double get_quant(const NumericVector& coefs, const NumericVector& contrast) { return (result(0, 0)); } +double get_quant_var(const NumericVector& contrast, + const NumericMatrix& cov_mat) { + arma::colvec c = as(contrast); + arma::mat sigma = as(cov_mat); + double var = arma::as_scalar(c.t() * sigma * c); + return var; +} NumericVector combine_contrast(bool is_reference, NumericVector intercept, NumericVector features, NumericVector runs, @@ -159,16 +166,19 @@ NumericVector make_contrast_run_quant(DataFrame input, // [[Rcpp::export]] -NumericVector get_linear_summary(const DataFrame& input, +DataFrame get_linear_summary(const DataFrame& input, const NumericVector& coefs, const NumericMatrix& counts, - const bool is_labeled) { + const bool is_labeled, + const NumericMatrix& cov_mat) { CharacterVector runs = input["RUN"]; CharacterVector unique_runs = unique(runs); CharacterVector ref = {"ref"}; unique_runs = setdiff(unique_runs, ref); int num_runs = unique_runs.length(); + NumericVector log_intensities(num_runs); + NumericVector variances(num_runs); for (int run_id = 0; run_id < num_runs; ++run_id) { NumericVector contrast_matrix(num_runs); @@ -180,6 +190,7 @@ NumericVector get_linear_summary(const DataFrame& input, is_labeled); double quantified = get_quant(coefs, contrast); log_intensities(run_id) = quantified; + variances[run_id] = get_quant_var(contrast, cov_mat); } if (is_labeled) { @@ -189,7 +200,11 @@ NumericVector get_linear_summary(const DataFrame& input, contrast_matrix, counts, true, true); log_intensities.push_back(get_quant(coefs, contrast)); + variances.push_back(get_quant_var(contrast, cov_mat)); } - return(log_intensities); + return DataFrame::create( + Named("LogIntensities") = log_intensities, + Named("Variance") = variances + ); } From bb7ca0158826fc995fde59ce67e8decb2e52cdd5 Mon Sep 17 00:00:00 2001 From: devonjkohler Date: Tue, 29 Apr 2025 17:47:25 -0400 Subject: [PATCH 05/20] fixed anomaly bug --- R/utils_checks.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/utils_checks.R b/R/utils_checks.R index 5cbe939f..6cd55e08 100644 --- a/R/utils_checks.R +++ b/R/utils_checks.R @@ -165,7 +165,8 @@ MSstatsPrepareForDataProcess = function(input, log_base, fix_missing) { balanced_cols = c("PeptideSequence", "PrecursorCharge", "FragmentIon", "ProductCharge") input = MSstatsConvert::MSstatsBalancedDesign( - input, balanced_cols, TRUE, TRUE, fix_missing) + input, balanced_cols, TRUE, TRUE, fix_missing, + anomaly_metrics = c("ANOMALYSCORES")) input = data.table::as.data.table(unclass(input)) data.table::setnames(input, colnames(input), toupper(colnames(input))) From 56dd6208ff391d843133970fd5c40fee851a8b6f Mon Sep 17 00:00:00 2001 From: devonjkohler Date: Fri, 9 May 2025 10:23:39 -0400 Subject: [PATCH 06/20] imputation convergence bug --- R/dataProcess.R | 78 +++++++++++++++------ R/utils_imputation.R | 14 ++-- man/MSstatsGroupComparison.Rd | 1 - man/MSstatsGroupComparisonSingleProtein.Rd | 1 - man/MSstatsSummarize.Rd | 1 + man/MSstatsSummarizeSingleLinear.Rd | 1 + man/MSstatsSummarizeSingleTMP.Rd | 3 +- man/MSstatsSummarizeWithMultipleCores.Rd | 3 +- man/MSstatsSummarizeWithSingleCore.Rd | 3 +- man/dataProcess.Rd | 5 +- man/dot-fitModelForGroupComparison.Rd | 1 - man/dot-fitModelSingleProtein.Rd | 1 - man/dot-groupComparisonWithMultipleCores.Rd | 1 - man/dot-groupComparisonWithSingleCore.Rd | 1 - man/groupComparison.Rd | 1 - 15 files changed, 74 insertions(+), 41 deletions(-) diff --git a/R/dataProcess.R b/R/dataProcess.R index 8cdc41f7..b08e472c 100755 --- a/R/dataProcess.R +++ b/R/dataProcess.R @@ -55,6 +55,7 @@ #' @param numberOfCores Number of cores for parallel processing. When > 1, #' a logfile named `MSstats_dataProcess_log_progress.log` is created to #' track progress. Only works for Linux & Mac OS. Default is 1. +#' @param aft_iterations Number of iterations for AFT model fitting. Default is 90. #' @inheritParams .documentFunction #' #' @importFrom utils sessionInfo @@ -123,7 +124,7 @@ dataProcess = function( equalFeatureVar = TRUE, censoredInt = "NA", MBimpute = TRUE, remove50missing = FALSE, fix_missing = NULL, maxQuantileforCensored = 0.999, use_log_file = TRUE, append = FALSE, verbose = TRUE, log_file_path = NULL, - numberOfCores = 1 + numberOfCores = 1, aft_iterations=90 ) { MSstatsConvert::MSstatsLogsSettings(use_log_file, append, verbose, log_file_path, @@ -156,7 +157,7 @@ dataProcess = function( summarized = tryCatch(MSstatsSummarizeWithMultipleCores(input, summaryMethod, MBimpute, censoredInt, remove50missing, equalFeatureVar, - numberOfCores), + numberOfCores, aft_iterations), error = function(e) { print(e) NULL @@ -200,7 +201,8 @@ dataProcess = function( #' @return list of length one with run-level data. #' MSstatsSummarizeWithMultipleCores = function(input, method, impute, censored_symbol, - remove50missing, equal_variance, numberOfCores = 1) { + remove50missing, equal_variance, numberOfCores = 1, + aft_iterations) { if (numberOfCores > 1) { protein_indices = split(seq_len(nrow(input)), list(input$PROTEIN)) num_proteins = length(protein_indices) @@ -224,7 +226,8 @@ MSstatsSummarizeWithMultipleCores = function(input, method, impute, censored_sym } single_protein = input[protein_indices[[i]],] MSstatsSummarizeSingleTMP( - single_protein, impute, censored_symbol, remove50missing) + single_protein, impute, censored_symbol, remove50missing, + aft_iterations) }) } else { summarized_results = parallel::parLapply(cl, seq_len(num_proteins), function(i) { @@ -233,14 +236,18 @@ MSstatsSummarizeWithMultipleCores = function(input, method, impute, censored_sym sep = "\n", file = "MSstats_dataProcess_log_progress.log", append = TRUE) } single_protein = input[protein_indices[[i]],] - MSstatsSummarizeSingleLinear(single_protein, equal_variance) + MSstatsSummarizeSingleLinear( + single_protein, + aft_iterations=aft_iterations, + equal_variance=equal_variance) }) } parallel::stopCluster(cl) return(summarized_results) } else { - return(MSstatsSummarizeWithSingleCore(input, method, impute, censored_symbol, - remove50missing, equal_variance)) + return(MSstatsSummarizeWithSingleCore(input, method, impute, + censored_symbol, remove50missing, + equal_variance, aft_iterations)) } } @@ -273,7 +280,7 @@ MSstatsSummarizeWithMultipleCores = function(input, method, impute, censored_sym #' head(summarized[[1]][[1]]) # run-level summary #' MSstatsSummarizeWithSingleCore = function(input, method, impute, censored_symbol, - remove50missing, equal_variance) { + remove50missing, equal_variance, aft_iterations) { protein_indices = split(seq_len(nrow(input)), list(input$PROTEIN)) @@ -284,7 +291,8 @@ MSstatsSummarizeWithSingleCore = function(input, method, impute, censored_symbol for (protein_id in seq_len(num_proteins)) { single_protein = input[protein_indices[[protein_id]],] summarized_results[[protein_id]] = MSstatsSummarizeSingleTMP( - single_protein, impute, censored_symbol, remove50missing) + single_protein, impute, censored_symbol, remove50missing, + aft_iterations) setTxtProgressBar(pb, protein_id) } close(pb) @@ -292,8 +300,10 @@ MSstatsSummarizeWithSingleCore = function(input, method, impute, censored_symbol pb = utils::txtProgressBar(min = 0, max = num_proteins, style = 3) for (protein_id in seq_len(num_proteins)) { single_protein = input[protein_indices[[protein_id]],] - summarized_result = MSstatsSummarizeSingleLinear(single_protein, - equal_variance) + summarized_result = MSstatsSummarizeSingleLinear( + single_protein, + aft_iterations=aft_iterations, + equal_variance=equal_variance) summarized_results[[protein_id]] = summarized_result setTxtProgressBar(pb, protein_id) } @@ -334,7 +344,7 @@ MSstatsSummarizeWithSingleCore = function(input, method, impute, censored_symbol #' head(summarized[[1]][[1]]) # run-level summary #' MSstatsSummarize = function(proteins_list, method, impute, censored_symbol, - remove50missing, equal_variance) { + remove50missing, aft_iterations, equal_variance) { num_proteins = length(proteins_list) summarized_results = vector("list", num_proteins) if (method == "TMP") { @@ -342,7 +352,8 @@ MSstatsSummarize = function(proteins_list, method, impute, censored_symbol, for (protein_id in seq_len(num_proteins)) { single_protein = proteins_list[[protein_id]] summarized_results[[protein_id]] = MSstatsSummarizeSingleTMP( - single_protein, impute, censored_symbol, remove50missing) + single_protein, impute, censored_symbol, remove50missing, + aft_iterations) setTxtProgressBar(pb, protein_id) } close(pb) @@ -352,7 +363,7 @@ MSstatsSummarize = function(proteins_list, method, impute, censored_symbol, single_protein = proteins_list[[protein_id]] summarized_result = MSstatsSummarizeSingleLinear( single_protein, impute, censored_symbol, remove50missing, - equal_variance) + aft_iterations, equal_variance) summarized_results[[protein_id]] = summarized_result setTxtProgressBar(pb, protein_id) } @@ -404,6 +415,7 @@ MSstatsSummarizeSingleLinear = function(single_protein, impute, censored_symbol, remove50missing, + aft_iterations, equal_variances = TRUE) { ABUNDANCE = RUN = FEATURE = PROTEIN = LogIntensities = NULL @@ -418,7 +430,8 @@ MSstatsSummarizeSingleLinear = function(single_protein, single_protein[, FEATURE := factor(FEATURE)] if (impute & any(single_protein[["censored"]])) { survival_fit = .fitSurvival(single_protein[LABEL == "L", cols, - with = FALSE]) + with = FALSE], + aft_iterations) sigma2 = survival_fit$scale^2 single_protein[, c("predicted", "imputation_var") := { pred = predict(survival_fit, newdata = .SD, se.fit = TRUE) @@ -451,11 +464,12 @@ MSstatsSummarizeSingleLinear = function(single_protein, is_single_feature = .checkSingleFeature(single_protein) # fit = try(, silent = TRUE) - fit = .fitLinearModel(single_protein, is_single_feature, is_labeled = label, - equal_variances) + fit = .fitLinearModel(single_protein, is_single_feature, + is_labeled = label, equal_variances) if (inherits(fit, "try-error")) { - msg = paste("*** error : can't fit the model for ", unique(single_protein$PROTEIN)) + msg = paste("*** error : can't fit the model for ", + unique(single_protein$PROTEIN)) getOption("MSstatsLog")("WARN", msg) getOption("MSstatsMsg")("WARN", msg) result = NULL @@ -505,7 +519,7 @@ MSstatsSummarizeSingleLinear = function(single_protein, #' head(single_protein_summary[[1]]) #' MSstatsSummarizeSingleTMP = function(single_protein, impute, censored_symbol, - remove50missing) { + remove50missing, aft_iterations) { newABUNDANCE = n_obs = n_obs_run = RUN = FEATURE = LABEL = NULL predicted = censored = NULL cols = intersect(colnames(single_protein), c("newABUNDANCE", "cen", "RUN", @@ -518,10 +532,28 @@ MSstatsSummarizeSingleTMP = function(single_protein, impute, censored_symbol, single_protein[, RUN := factor(RUN)] single_protein[, FEATURE := factor(FEATURE)] if (impute & any(single_protein[["censored"]])) { - survival_fit = .fitSurvival(single_protein[LABEL == "L", cols, - with = FALSE]) - single_protein[, predicted := predict(survival_fit, - newdata = .SD)] + + # Flag to track convergence warning + converged = TRUE + + # Try to fit survival model and catch convergence warnings + survival_fit = withCallingHandlers({ + .fitSurvival(single_protein[LABEL == "L", cols, with = FALSE], + aft_iterations) + }, warning = function(w) { + if (grepl("converge", conditionMessage(w), ignore.case = TRUE)) { + message("Convergence warning caught: ", conditionMessage(w)) + converged <<- FALSE + } + }) + + print(converged) + if (converged) { + single_protein[, predicted := predict(survival_fit, newdata = .SD)] + } else { + single_protein[, predicted := NA_real_] + } + single_protein[, predicted := ifelse(censored & (LABEL == "L"), predicted, NA)] single_protein[, newABUNDANCE := ifelse(censored & LABEL == "L", predicted, newABUNDANCE)] diff --git a/R/utils_imputation.R b/R/utils_imputation.R index 16f9cd53..76af9c69 100644 --- a/R/utils_imputation.R +++ b/R/utils_imputation.R @@ -1,7 +1,7 @@ #' @importFrom data.table uniqueN #' @importFrom survival survreg Surv #' @keywords internal -.fitSurvival = function(input) { +.fitSurvival = function(input, aft_iterations) { FEATURE = RUN = NULL missingness_filter = is.finite(input$newABUNDANCE) @@ -19,32 +19,32 @@ # need to check fit = survreg(Surv(newABUNDANCE, cen, type='left') ~ RUN + ref, data = input, dist = "gaussian", - control = list(maxiter=90)) + control = list(maxiter=aft_iterations)) } else { if (countdf) { fit = survreg(Surv(newABUNDANCE, cen, type='left') ~ RUN + ref, data = input, dist = "gaussian", - control = list(maxiter=90)) + control = list(maxiter=aft_iterations)) } else { fit = survreg(Surv(newABUNDANCE, cen, type='left') ~ FEATURE + RUN + ref, data = input, dist = "gaussian", - control = list(maxiter=90)) + control = list(maxiter=aft_iterations)) } } } else { if (n_features == 1L) { fit = survreg(Surv(newABUNDANCE, cen, type = "left") ~ RUN, data = input, dist = "gaussian", - control = list(maxiter=90)) + control = list(maxiter=aft_iterations)) } else { if (countdf) { fit = survreg(Surv(newABUNDANCE, cen, type = "left") ~ RUN, data = input, dist = "gaussian", - control = list(maxiter=90)) + control = list(maxiter=aft_iterations)) } else { fit = survreg(Surv(newABUNDANCE, cen, type = "left") ~ FEATURE + RUN, data = input, dist = "gaussian", - control = list(maxiter=90)) + control = list(maxiter=aft_iterations)) } } } diff --git a/man/MSstatsGroupComparison.Rd b/man/MSstatsGroupComparison.Rd index 9e372472..0b48d455 100644 --- a/man/MSstatsGroupComparison.Rd +++ b/man/MSstatsGroupComparison.Rd @@ -7,7 +7,6 @@ MSstatsGroupComparison( summarized_list, contrast_matrix, - weights, save_fitted_models, repeated, samples_info, diff --git a/man/MSstatsGroupComparisonSingleProtein.Rd b/man/MSstatsGroupComparisonSingleProtein.Rd index 473a28a9..fa279ba1 100644 --- a/man/MSstatsGroupComparisonSingleProtein.Rd +++ b/man/MSstatsGroupComparisonSingleProtein.Rd @@ -7,7 +7,6 @@ MSstatsGroupComparisonSingleProtein( single_protein, contrast_matrix, - weights, repeated, groups, samples_info, diff --git a/man/MSstatsSummarize.Rd b/man/MSstatsSummarize.Rd index b60f5251..daed4762 100644 --- a/man/MSstatsSummarize.Rd +++ b/man/MSstatsSummarize.Rd @@ -10,6 +10,7 @@ MSstatsSummarize( impute, censored_symbol, remove50missing, + aft_iterations, equal_variance ) } diff --git a/man/MSstatsSummarizeSingleLinear.Rd b/man/MSstatsSummarizeSingleLinear.Rd index cdf3cb69..f30bca6b 100644 --- a/man/MSstatsSummarizeSingleLinear.Rd +++ b/man/MSstatsSummarizeSingleLinear.Rd @@ -9,6 +9,7 @@ MSstatsSummarizeSingleLinear( impute, censored_symbol, remove50missing, + aft_iterations, equal_variances = TRUE ) } diff --git a/man/MSstatsSummarizeSingleTMP.Rd b/man/MSstatsSummarizeSingleTMP.Rd index 9d656887..f4a20dcd 100644 --- a/man/MSstatsSummarizeSingleTMP.Rd +++ b/man/MSstatsSummarizeSingleTMP.Rd @@ -8,7 +8,8 @@ MSstatsSummarizeSingleTMP( single_protein, impute, censored_symbol, - remove50missing + remove50missing, + aft_iterations ) } \arguments{ diff --git a/man/MSstatsSummarizeWithMultipleCores.Rd b/man/MSstatsSummarizeWithMultipleCores.Rd index 7387110e..ef4b20c2 100644 --- a/man/MSstatsSummarizeWithMultipleCores.Rd +++ b/man/MSstatsSummarizeWithMultipleCores.Rd @@ -11,7 +11,8 @@ MSstatsSummarizeWithMultipleCores( censored_symbol, remove50missing, equal_variance, - numberOfCores = 1 + numberOfCores = 1, + aft_iterations ) } \arguments{ diff --git a/man/MSstatsSummarizeWithSingleCore.Rd b/man/MSstatsSummarizeWithSingleCore.Rd index 84b64a8c..de793817 100644 --- a/man/MSstatsSummarizeWithSingleCore.Rd +++ b/man/MSstatsSummarizeWithSingleCore.Rd @@ -10,7 +10,8 @@ MSstatsSummarizeWithSingleCore( impute, censored_symbol, remove50missing, - equal_variance + equal_variance, + aft_iterations ) } \arguments{ diff --git a/man/dataProcess.Rd b/man/dataProcess.Rd index 39c2fa49..69be85d3 100644 --- a/man/dataProcess.Rd +++ b/man/dataProcess.Rd @@ -24,7 +24,8 @@ dataProcess( append = FALSE, verbose = TRUE, log_file_path = NULL, - numberOfCores = 1 + numberOfCores = 1, + aft_iterations = 90 ) } \arguments{ @@ -112,6 +113,8 @@ If `append = TRUE`, has to be a valid path to a file.} \item{numberOfCores}{Number of cores for parallel processing. When > 1, a logfile named `MSstats_dataProcess_log_progress.log` is created to track progress. Only works for Linux & Mac OS. Default is 1.} + +\item{aft_iterations}{Number of iterations for AFT model fitting. Default is 90.} } \value{ A list containing: diff --git a/man/dot-fitModelForGroupComparison.Rd b/man/dot-fitModelForGroupComparison.Rd index fc077940..e6cc7d9f 100644 --- a/man/dot-fitModelForGroupComparison.Rd +++ b/man/dot-fitModelForGroupComparison.Rd @@ -6,7 +6,6 @@ \usage{ .fitModelForGroupComparison( input, - weights, repeated, is_single_subject, has_tech_replicates diff --git a/man/dot-fitModelSingleProtein.Rd b/man/dot-fitModelSingleProtein.Rd index 0883dc3f..e5e19945 100644 --- a/man/dot-fitModelSingleProtein.Rd +++ b/man/dot-fitModelSingleProtein.Rd @@ -7,7 +7,6 @@ .fitModelSingleProtein( input, contrast_matrix, - weights, has_tech_replicates, is_single_subject, repeated, diff --git a/man/dot-groupComparisonWithMultipleCores.Rd b/man/dot-groupComparisonWithMultipleCores.Rd index 1753c074..f4ae9ea1 100644 --- a/man/dot-groupComparisonWithMultipleCores.Rd +++ b/man/dot-groupComparisonWithMultipleCores.Rd @@ -7,7 +7,6 @@ .groupComparisonWithMultipleCores( summarized_list, contrast_matrix, - weights, save_fitted_models, repeated, samples_info, diff --git a/man/dot-groupComparisonWithSingleCore.Rd b/man/dot-groupComparisonWithSingleCore.Rd index 481722db..f3d15e71 100644 --- a/man/dot-groupComparisonWithSingleCore.Rd +++ b/man/dot-groupComparisonWithSingleCore.Rd @@ -7,7 +7,6 @@ .groupComparisonWithSingleCore( summarized_list, contrast_matrix, - weights, save_fitted_models, repeated, samples_info diff --git a/man/groupComparison.Rd b/man/groupComparison.Rd index d41e06e6..a0ffdfb1 100644 --- a/man/groupComparison.Rd +++ b/man/groupComparison.Rd @@ -7,7 +7,6 @@ groupComparison( contrast.matrix, data, - weights = NULL, save_fitted_models = TRUE, log_base = 2, use_log_file = TRUE, From 4655736c4fa982a22d70bb55fe8481566ab79911 Mon Sep 17 00:00:00 2001 From: devonjkohler Date: Fri, 9 May 2025 10:33:28 -0400 Subject: [PATCH 07/20] anomaly score to column validator --- R/utils_checks.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/utils_checks.R b/R/utils_checks.R index 6cd55e08..5ca368f5 100644 --- a/R/utils_checks.R +++ b/R/utils_checks.R @@ -166,7 +166,8 @@ MSstatsPrepareForDataProcess = function(input, log_base, fix_missing) { "FragmentIon", "ProductCharge") input = MSstatsConvert::MSstatsBalancedDesign( input, balanced_cols, TRUE, TRUE, fix_missing, - anomaly_metrics = c("ANOMALYSCORES")) + anomaly_metrics = c("AnomalyScores")) + input = data.table::as.data.table(unclass(input)) data.table::setnames(input, colnames(input), toupper(colnames(input))) From e14d87196ecff73e5450b9a93a748bd37857d4ad Mon Sep 17 00:00:00 2001 From: devonjkohler Date: Mon, 12 May 2025 22:21:39 -0400 Subject: [PATCH 08/20] bug fix --- R/dataProcess.R | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/R/dataProcess.R b/R/dataProcess.R index b08e472c..59d6896e 100755 --- a/R/dataProcess.R +++ b/R/dataProcess.R @@ -237,17 +237,20 @@ MSstatsSummarizeWithMultipleCores = function(input, method, impute, censored_sym } single_protein = input[protein_indices[[i]],] MSstatsSummarizeSingleLinear( - single_protein, - aft_iterations=aft_iterations, - equal_variance=equal_variance) + single_protein, + impute, + censored_symbol, + remove50missing, + aft_iterations) }) } parallel::stopCluster(cl) return(summarized_results) } else { return(MSstatsSummarizeWithSingleCore(input, method, impute, - censored_symbol, remove50missing, - equal_variance, aft_iterations)) + censored_symbol, + remove50missing, + aft_iterations)) } } @@ -301,9 +304,8 @@ MSstatsSummarizeWithSingleCore = function(input, method, impute, censored_symbol for (protein_id in seq_len(num_proteins)) { single_protein = input[protein_indices[[protein_id]],] summarized_result = MSstatsSummarizeSingleLinear( - single_protein, - aft_iterations=aft_iterations, - equal_variance=equal_variance) + input, method, impute, censored_symbol, + remove50missing, aft_iterations) summarized_results[[protein_id]] = summarized_result setTxtProgressBar(pb, protein_id) } @@ -363,7 +365,7 @@ MSstatsSummarize = function(proteins_list, method, impute, censored_symbol, single_protein = proteins_list[[protein_id]] summarized_result = MSstatsSummarizeSingleLinear( single_protein, impute, censored_symbol, remove50missing, - aft_iterations, equal_variance) + aft_iterations) summarized_results[[protein_id]] = summarized_result setTxtProgressBar(pb, protein_id) } From 8ee6fd58c671228b4395e0c7ad5bbddb71c689aa Mon Sep 17 00:00:00 2001 From: devonjkohler Date: Tue, 13 May 2025 08:42:19 -0400 Subject: [PATCH 09/20] testing remove pca --- R/dataProcess.R | 6 ++++-- R/utils_anomaly_weights.R | 27 ++++++++++----------------- 2 files changed, 14 insertions(+), 19 deletions(-) diff --git a/R/dataProcess.R b/R/dataProcess.R index 59d6896e..64c65fc9 100755 --- a/R/dataProcess.R +++ b/R/dataProcess.R @@ -250,6 +250,7 @@ MSstatsSummarizeWithMultipleCores = function(input, method, impute, censored_sym return(MSstatsSummarizeWithSingleCore(input, method, impute, censored_symbol, remove50missing, + equal_variance, aft_iterations)) } } @@ -304,7 +305,7 @@ MSstatsSummarizeWithSingleCore = function(input, method, impute, censored_symbol for (protein_id in seq_len(num_proteins)) { single_protein = input[protein_indices[[protein_id]],] summarized_result = MSstatsSummarizeSingleLinear( - input, method, impute, censored_symbol, + input, impute, censored_symbol, remove50missing, aft_iterations) summarized_results[[protein_id]] = summarized_result setTxtProgressBar(pb, protein_id) @@ -450,7 +451,8 @@ MSstatsSummarizeSingleLinear = function(single_protein, } if (all(!is.na(single_protein$ANOMALYSCORES))){ - single_protein = .calculate_weights(single_protein) + # single_protein = .calculate_weights(single_protein) + single_protein$weights = 1 / single_protein$ANOMALYSCORES } else { single_protein$weights = NA } diff --git a/R/utils_anomaly_weights.R b/R/utils_anomaly_weights.R index f02c55b7..7577361f 100644 --- a/R/utils_anomaly_weights.R +++ b/R/utils_anomaly_weights.R @@ -14,37 +14,30 @@ data = .normalize_anomalies(data) - w_stats = data[, .( - shapiro_W = tryCatch( - shapiro.test(unlist(normalized_score))$statistic, - error = function(e) NA_real_ - ) - ), by = PEPTIDE] + # w_stats = data[, .( + # shapiro_W = tryCatch( + # shapiro.test(unlist(normalized_score))$statistic, + # error = function(e) NA_real_ + # ) + # ), by = PEPTIDE] # Join the W stats back into the original data.table data = merge(data, w_stats, by = "PEPTIDE", all.x = TRUE) anomaly_scores = data$normalized_score imputation_var = ifelse(data$censored, data$imputation_var, NA) - norm_score = data$shapiro_W if (sum(is.na(imputation_var)) == length(imputation_var)){ weights = anomaly_scores - # resPPCA = pca(matrix( - # c(anomaly_scores, norm_score), ncol=2), - # method="ppca", center=TRUE, nPcs=1, - # seed=1, maxIterations = 10000) - # weights = (abs(resPPCA@loadings[[1]]) * anomaly_scores - # ) + (abs(resPPCA@loadings[[2]]) * norm_score) } else { resPPCA = pca(matrix( - c(anomaly_scores, imputation_var), ncol=2), #, norm_score + c(anomaly_scores, imputation_var), ncol=2), method="ppca", center=TRUE, nPcs=1, seed=1, maxIterations = 10000) weights = (abs(resPPCA@loadings[[1]]) * anomaly_scores - ) + ifelse( - is.na(imputation_var), 0, - abs(resPPCA@loadings[[2]]) * imputation_var)# + (abs(resPPCA@loadings[[3]]) * norm_score + ) + ifelse( + is.na(imputation_var), 0, + abs(resPPCA@loadings[[2]]) * imputation_var) } data$raw_weights = weights From 2978d1824eb86ebc96c03b235fe098d479d12ec6 Mon Sep 17 00:00:00 2001 From: devonjkohler Date: Wed, 9 Jul 2025 10:54:35 -0400 Subject: [PATCH 10/20] minor bug fixes --- R/dataProcess.R | 1 - R/utils_checks.R | 4 ++++ R/utils_summarization.R | 4 ++-- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/R/dataProcess.R b/R/dataProcess.R index 64c65fc9..98af35ac 100755 --- a/R/dataProcess.R +++ b/R/dataProcess.R @@ -551,7 +551,6 @@ MSstatsSummarizeSingleTMP = function(single_protein, impute, censored_symbol, } }) - print(converged) if (converged) { single_protein[, predicted := predict(survival_fit, newdata = .SD)] } else { diff --git a/R/utils_checks.R b/R/utils_checks.R index 5ca368f5..2af35863 100644 --- a/R/utils_checks.R +++ b/R/utils_checks.R @@ -288,6 +288,10 @@ setMethod(".checkDataValidity", "MSstatsValidated", .prepareForDataProcess) if ("TECHREPLICATE" %in% colnames(input)) { cols = unique(c(cols, "TECHREPLICATE")) } + if (!"ANOMALYSCORES" %in% colnames(input)) { + input[, ANOMALYSCORES := NA_real_] + } + input[!is.na(PROTEIN) & PROTEIN != "", cols, with = FALSE] } diff --git a/R/utils_summarization.R b/R/utils_summarization.R index 006bf470..82ec1173 100644 --- a/R/utils_summarization.R +++ b/R/utils_summarization.R @@ -164,10 +164,10 @@ if (!is_labeled) { if (is_single_feature) { - linear_model = lm(newABUNDANCE ~ RUN , data = input, + linear_model = rlm(newABUNDANCE ~ RUN , data = input, weights=weight_input) } else { - linear_model = lm(newABUNDANCE ~ FEATURE + RUN, + linear_model = rlm(newABUNDANCE ~ FEATURE + RUN, data = input, weights=weight_input) } } else { From 323a9bc6e00bd8a726115eb1d9dc3cf7ca0509f6 Mon Sep 17 00:00:00 2001 From: devonjkohler Date: Wed, 9 Jul 2025 10:55:27 -0400 Subject: [PATCH 11/20] lm model --- R/utils_summarization.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/utils_summarization.R b/R/utils_summarization.R index 82ec1173..006bf470 100644 --- a/R/utils_summarization.R +++ b/R/utils_summarization.R @@ -164,10 +164,10 @@ if (!is_labeled) { if (is_single_feature) { - linear_model = rlm(newABUNDANCE ~ RUN , data = input, + linear_model = lm(newABUNDANCE ~ RUN , data = input, weights=weight_input) } else { - linear_model = rlm(newABUNDANCE ~ FEATURE + RUN, + linear_model = lm(newABUNDANCE ~ FEATURE + RUN, data = input, weights=weight_input) } } else { From 5aff1097827989a376a2825bdda2e8f48e98714e Mon Sep 17 00:00:00 2001 From: devonjkohler Date: Tue, 15 Jul 2025 09:12:35 -0400 Subject: [PATCH 12/20] bug fix --- R/dataProcess.R | 7 ++++--- R/utils_summarization_prepare.R | 4 ++++ man/dataProcess.Rd | 3 ++- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/R/dataProcess.R b/R/dataProcess.R index 98af35ac..27e3e14c 100755 --- a/R/dataProcess.R +++ b/R/dataProcess.R @@ -31,7 +31,8 @@ #' It that case, it specifies number of top features that will be used. #' Default is 3, which means to use top 3 features. #' @param summaryMethod "TMP" (default) means Tukey's median polish, -#' which is robust estimation method. "linear" uses linear mixed model. +#' which is robust estimation method. "linear" uses linear mixed model. If +#' anomaly detection algorithm is performed, "linear" must be used. #' @param equalFeatureVar only for summaryMethod = "linear". default is TRUE. #' Logical variable for whether the model should account for heterogeneous variation #' among intensities from different features. Default is TRUE, which assume equal @@ -305,8 +306,9 @@ MSstatsSummarizeWithSingleCore = function(input, method, impute, censored_symbol for (protein_id in seq_len(num_proteins)) { single_protein = input[protein_indices[[protein_id]],] summarized_result = MSstatsSummarizeSingleLinear( - input, impute, censored_symbol, + single_protein, impute, censored_symbol, remove50missing, aft_iterations) + summarized_results[[protein_id]] = summarized_result setTxtProgressBar(pb, protein_id) } @@ -451,7 +453,6 @@ MSstatsSummarizeSingleLinear = function(single_protein, } if (all(!is.na(single_protein$ANOMALYSCORES))){ - # single_protein = .calculate_weights(single_protein) single_protein$weights = 1 / single_protein$ANOMALYSCORES } else { single_protein$weights = NA diff --git a/R/utils_summarization_prepare.R b/R/utils_summarization_prepare.R index a6cfc5b3..6325581b 100644 --- a/R/utils_summarization_prepare.R +++ b/R/utils_summarization_prepare.R @@ -27,6 +27,10 @@ MSstatsPrepareForSummarization = function(input, method, impute, censored_symbol remove_uninformative_feature_outlier) { ABUNDANCE = feature_quality = is_outlier = PROTEIN = NULL + if (!"ANOMALYSCORES" %in% colnames(input)) { + input[, ANOMALYSCORES := NA] + } + label = data.table::uniqueN(input$LABEL) == 2 if (label) { input[, ref := factor(ifelse(LABEL == "L", RUN, 0))] diff --git a/man/dataProcess.Rd b/man/dataProcess.Rd index 69be85d3..8bace239 100644 --- a/man/dataProcess.Rd +++ b/man/dataProcess.Rd @@ -68,7 +68,8 @@ It that case, it specifies number of top features that will be used. Default is 3, which means to use top 3 features.} \item{summaryMethod}{"TMP" (default) means Tukey's median polish, -which is robust estimation method. "linear" uses linear mixed model.} +which is robust estimation method. "linear" uses linear mixed model. If +anomaly detection algorithm is performed, "linear" must be used.} \item{equalFeatureVar}{only for summaryMethod = "linear". default is TRUE. Logical variable for whether the model should account for heterogeneous variation From aa3f5b0170d69bacf9e1fe76ea525f2fbc77704a Mon Sep 17 00:00:00 2001 From: devonjkohler Date: Wed, 16 Jul 2025 12:23:53 -0400 Subject: [PATCH 13/20] MSstats+ vignette --- vignettes/MSstats+.Rmd | 323 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 323 insertions(+) create mode 100644 vignettes/MSstats+.Rmd diff --git a/vignettes/MSstats+.Rmd b/vignettes/MSstats+.Rmd new file mode 100644 index 00000000..6720cbf9 --- /dev/null +++ b/vignettes/MSstats+.Rmd @@ -0,0 +1,323 @@ +--- +title: "MSstats+" +output: html_document +--- + +```{r style, echo = FALSE, results = 'asis'} +BiocStyle::markdown() +``` + +```{r global_options, include=FALSE} +knitr::opts_chunk$set(fig.width=10, fig.height=7, warning=FALSE, message=FALSE) +options(width=110) +``` + +```{=html} + +``` + +# __MSstats+: Peak quality-weighted differential analysis__ + +Author: Devon Kohler + +`MSstats+` is an extension to the `MSstats` R package in Bioconductor. It is +designed to incorporate the underlying spectral peak quality into differential +analysis, going beyond peak intensity alone. It integrates the underlying +spectral peaks using an isolation forest (iForest) for anomaly detection, which +integrates all the peak quality metrics into a single score. This score is then +used as weights in protein-level summarization weighted linear regression. After +protein-level summarization, the estimates of underlying protein abundance and +the estimates associated variance are used as weights in a second weighted +linear model which is used to perform differential analysis. It can be used +for experiments with simple group comparison designs, or more complex designs +(e.g. comparing more than two experimental conditions, or a repeated measure +design, such as a time course). + +The algorithm is designed to be used with data acquired using data independent +acquisition (DIA). Other acquisitions are possible but have yet to be tested. +For guidance on how to use `MSstats+` with other acquisitions, please reach out +on the `MSstats` Google Group: \url{https://groups.google.com/g/msstats}. + +Below we outline the workflow for using `MSstats+`, highlighting key options +and decisions. + +## Load packages +```{r} +library(MSstats) +library(MSstatsConvert) +library(data.table) +library(ggplot2) +``` + +## Converter + +The main iForest algorithm for `MSstats+` is implemented in the `MSstatsConvert` +package (a dependency of `MSstats`). The `MSstatsConvert` package contains +individual converters for each spectral processing tool. Currently, `MSstats+` +only has a dedicated converter for `Spectronaut`, however more converters are +in development. For integration with other tools, please reach out on the +`MSstats` Google Group: \url{https://groups.google.com/g/msstats}. + +### Load data + +First we will load the Spectronaut report of identified and quantified peaks. It +is necessary that users output all the spectral peak quality data along with +these peaks. These spectral peak quality metrics must be at least at the +precursor-level (i.e., the column headers must not start with `F.`). There are +multiple options for columns that can be extracted, some +examples include: `FG.ShapeQualityScore (MS1)`, `FG.ShapeQualityScore (MS2)`, +and `EG.DeltaRT`. Users can include as many quality metrics as they want, +however we recommend using only quality metrics that are key indicators of the +quality of a spectral peak. Including too many quality metrics may lead to +overfitting of the iForest model. We include functionality to assess the +performance of the model (discussed below). + +An example output from Spectronaut is included in the +`MSstatsConvert` package to provide an idea of what the input should look like. + +In addition to the Spectronaut output, `MSstats+` also requires an annotation +file which maps the MS run names to the corresponding experimental design. For +more details on creating an annotation file, please see the documentation in +standard `MSstats` and the associated publications. + +Finally, `MSstats+` also requires the temporal order of MS run acquisition to +identify time-based drops in quality. This can be extracted from the Spectronaut +output as shown below. + +```{r} + +# Spectronaut quant file +spectronaut_raw = system.file( + "tinytest/raw_data/Spectronaut/spectronaut_quality_input.csv", + package = "MSstatsConvert") +spectronaut_raw = data.table::fread(spectronaut_raw) + +# Annotation file +annotation = system.file( + "tinytest/raw_data/Spectronaut/annotation.csv", + package = "MSstatsConvert") +annotation = data.table::fread(annotation) + +# Create temporal run ordering data.table +spectronaut_raw[, `R.Run Date (Formatted)` := as.POSIXct( + `R.Run Date (Formatted)`, format = "%m/%d/%Y %I:%M:%S %p")] +run_order = unique(spectronaut_raw[, c("R.Run Date (Formatted)", + "R.FileName")]) +run_order = run_order[order(`R.Run Date (Formatted)`)] +run_order$Order = 1:nrow(run_order) +setnames(run_order, c("R.FileName"), c("Run")) +``` + +### Run converter + +`MSstats+` uses the same converter as base `MSstats`. New parameters have been +added to indicate if a user wants to leverage the functionalities in `MSstats+`. +Specifically, to use `MSstats+`, users must indicate that the anomaly detection +algorithm should used by setting `calculateAnomalyScores=TRUE`. + +An example specification of the converter can be seen below, with key parameter +options highlighted below the code. + +```{r} +msstats_input = MSstatsConvert::SpectronauttoMSstatsFormat( + spectronaut_raw, annotation, intensity = 'PeakArea', + excludedFromQuantificationFilter = TRUE, # Pre-filtering + calculateAnomalyScores=TRUE, # Key parameter to use MSstats+ + anomalyModelFeatures=c("FGShapeQualityScore(MS2)", + "FGShapeQualityScore(MS1)", + "EGDeltaRT"), # Quality metrics + anomalyModelFeatureTemporal=c( + "mean_decrease", "mean_decrease", + "dispersion_increase"), # Temporal direction of quality metrics + runOrder=run_order, # Temporal order of MS runs + n_trees=100, + max_depth="auto", # iForest parameter (depth of trees) + numberOfCores=1) # Cores to use for parallel processing (Mac or Linux only) + +head(msstats_input) +``` + +Key parameters for `MSstats+`: +- `calculateAnomalyScores`: Set to `TRUE` to use the iForest algorithm to + calculate anomaly scores for each spectral peak. +- `anomalyModelFeatures`: A character vector of the column names of the quality metrics to be used in the iForest model. These must be at the precursor-level + or higher (i.e., the column headers must not start with `F.`). +- `anomalyModelFeatureTemporal`: A character vector of the same length as + `anomalyModelFeatures` indicating the temporal direction of each quality + metric.9 Possible values are `mean_decrease`, `mean_increase`, and + `dispersion_increase`. These indicate the direction a metric is changes which + indicates a drop in quality (i.e., decrease, increase, or can go in either + direction). For example, `EG.DeltaRT` uses `dispersion_increase` because a + deviation in expected retention can be either higher or lower. +- `runOrder`: A data.table with two columns: `R.FileName` and `Order`. The + `R.FileName` column should contain the MS run names as they appear in the + annotation file, and the `Order` column should contain the temporal order of + acquisition (1 being the first run acquired, 2 being the second, etc.). This + is used to identify time-based drops in quality. +- `n_trees`: The number of trees to use in the iForest. Default is 100. Too few + trees will make the algorithm unstable, while too many trees will increase + computation time without performance improvement. +- `max_depth`: The maximum depth of each tree in the iForest. Default is + `"auto"`, which sets the depth to `ceiling(log2(number of samples))`. +- `numberOfCores`: The number of cores to use for parallel processing. This is + recommended since the algorithm is fit on the precursor-level and can take + significant time to process if using a single core. Only available on Mac and + Linux. + +### Check model fit + +After running the converter, it is important to check the fit of the iForest. +The iForest is an unsupervised algorithm, so there is no ground truth to compare +to. However, we can check the distribution of the anomaly scores to ensure that +the model is performing as expected. The anomaly scores should be right-skewed, +with most of the scores being low (indicating normal peaks) and a small number +of scores being high (indicating anomalous peaks). If the distribution is not +right-skewed, it may indicate that the quality metrics used are not appropriate, +or that there are not many poor quality peaks (generally found in smaller +studies using less complex biological material). In these cases, it may make +sense to use base `MSstats` instead of `MSstats+`. + +The fit of the model can be evaluated using the `CheckDataHealth` function in `MSstatsConvert`. This function will calculate Pearson's moment coefficient of +skewness for each iForest model fit (each precursor). We recommend plotting +these coefficients using a histogram and confirming the distribution is positive +and shifted away from zero (with a median around 1). + +An example of how to use the function is shown below. + +```{r} +health_info = CheckDataHealth(msstats_input) +skew_score = health_info[[2]] + +ggplot(skew_score, aes(x = skew)) + + geom_histogram(fill = "#E69F00", color = "black", bins=12) + + geom_vline(xintercept = 0, linetype = "dashed", + color = "black", linewidth = 1.5) + + theme_minimal(base_size = 16) + + labs( + title="Distribution of skewness of anomaly scores", + x = "Pearson's moment coefficient of skewness", + y = "Count" + ) + + theme( + axis.text.x = element_text(size = 16), + axis.text.y = element_text(size = 16), + axis.title.x = element_text(size = 18), + axis.title.y = element_text(size = 18) + ) + +``` + +We can see in the example data that the distribution is shifted right away from +zero. This indicates that the anomaly model was able to detect poor quality +measurements and the user can proceed with the next step. + +## Data processing and summarization + +After running the converter with `MSstats+` options enabled, the next step is to +run the standard `MSstats` `dataProcess` function. This function will +automatically leverage the information from the `MSstats+` option, as long as +the linear summarization option is turned on. The code to do this is highlighted +below. + +```{r} +summarized = dataProcess(msstats_input, + normalization=FALSE, # no normalization + MBimpute=TRUE, # Use imputation + summaryMethod="linear" # Key MSstats+ parameter + ) + +head(summarized$ProteinLevelData) +``` + +Key parameters for `MSstats+`: +- `summaryMethod`: Set to `linear` to use the peak quality-weighted linear + regression summarization. This is a key parameter for `MSstats+`. Other + summarization methods (e.g., Tukey's median polish) will not leverage the + peak quality information. + +When the `dataProcess` function is run with `summaryMethod=linear` a new column +will be included called `Variance` indicating the variance of the underlying +protein abundance estimate. This variance is used in the next step of the +workflow. + +## Differential analysis + +The final step of the `MSstats+` workflow is to perform differential analysis. +This is done using the standard `groupComparison` function in `MSstats`. Similar +to the `dataProcess` function, the `groupComparison` function will automatically +leverage the information from the `MSstats+` option. The code to do this is highlighted below. + +```{r} +comparison = matrix(c(-1,1),nrow=1) +row.names(comparison) = "Condition2-Condition1" +colnames(comparison) = c("Condition1", "Condition2") + +comparison_result = groupComparison( + contrast.matrix = comparison, + data=summarized) + +head(comparison_result$ComparisonResult) +``` + +The `groupComparison` function is run the exact same in `MSstats+` as `MSstats`, +but under the hood the variation from the previous step is used in a second +weighted linear regression (in comparison to the unweighted version in +`MSstats`). + +## Compare results to base `MSstats` + +```{r} +base_msstats_input = MSstatsConvert::SpectronauttoMSstatsFormat( + spectronaut_raw, annotation, intensity = 'PeakArea', + excludedFromQuantificationFilter = TRUE, # Pre-filtering + calculateAnomalyScores=FALSE) # No anomaly model + +base_summarized = dataProcess(base_msstats_input, + normalization=FALSE, # no normalization + MBimpute=TRUE, # Use imputation + summaryMethod="TMP" # Tukey median polish summarization + ) + +base_comparison_result = groupComparison( + contrast.matrix = comparison, + data=base_summarized) + +head(base_comparison_result$ComparisonResult) +``` + +We can see that `MSstats` does not identify all the proteins as differentially abundant, even though there was a true log$_2$ fold change of -1. We can compare the fold change and standard error estimates between `MSstats+` and base `MSstats` to see why the p-values are different. + +```{r} +comparison_result$ComparisonResult$Model = "MSstats+" +base_comparison_result$ComparisonResult$Model = "MSstats" + +merged_results = rbindlist(list( + comparison_result$ComparisonResult, + base_comparison_result$ComparisonResult +)) + +ggplot(data=merged_results) + + geom_col(aes(x=Protein, y=abs(log2FC), fill=Model), position="dodge") + + geom_hline(yintercept=1, linetype="dashed", color="black", linewidth=1) + + theme_bw(base_size =16) + + labs(title="Log fold change comparison", + x="MSstats+", + y="Base MSstats") +``` + +The fold changes are both near the ground truth and `MSstats+` does not clearly outperform base `MSstats`. + +```{r} +ggplot(data=merged_results) + + geom_col(aes(x=Protein, y=SE, fill=Model), position="dodge") + + theme_bw(base_size =16) + + labs(title="Standard error comparison", + x="MSstats+", + y="Base MSstats") +``` + +We can see the advantage of `MSstats+` in the standard error estimates. `MSstats+` downweights poorly quantified measurements and runs, which can lead to much better estimates of the true standard error. In this example `MSstats+` downweighted the runs which added a large amount of uncertainty, leading to it identifying all proteins as differentially abundant. + From be2a8d6c2b821e0d3fc2ce0b88f0f82e828edc32 Mon Sep 17 00:00:00 2001 From: devonjkohler Date: Fri, 25 Jul 2025 11:40:16 -0400 Subject: [PATCH 14/20] added manual converter example --- vignettes/MSstatsPlus.Rmd | 377 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 377 insertions(+) create mode 100644 vignettes/MSstatsPlus.Rmd diff --git a/vignettes/MSstatsPlus.Rmd b/vignettes/MSstatsPlus.Rmd new file mode 100644 index 00000000..3f8f0230 --- /dev/null +++ b/vignettes/MSstatsPlus.Rmd @@ -0,0 +1,377 @@ +```{r style, echo = FALSE, results = 'asis'} +BiocStyle::markdown() +``` + +```{r global_options, include=FALSE} +knitr::opts_chunk$set(fig.width=10, fig.height=7, warning=FALSE, message=FALSE) +options(width=110) +``` + + +# __MSstats+: Peak quality-weighted differential analysis__ + +Author: Devon Kohler + +`MSstats+` is an extension to the `MSstats` R package in Bioconductor. It is +designed to incorporate the underlying spectral peak quality into differential +analysis, going beyond peak intensity alone. It integrates the underlying +spectral peaks using an isolation forest (iForest) for anomaly detection, which +integrates all the peak quality metrics into a single score. This score is then +used as weights in protein-level summarization weighted linear regression. After +protein-level summarization, the estimates of underlying protein abundance and +the estimates associated variance are used as weights in a second weighted +linear model which is used to perform differential analysis. It can be used +for experiments with simple group comparison designs, or more complex designs +(e.g. comparing more than two experimental conditions, or a repeated measure +design, such as a time course). + +The algorithm is designed to be used with data acquired using data independent +acquisition (DIA). Other acquisitions are possible but have yet to be tested. +For guidance on how to use `MSstats+` with other acquisitions, please reach out +on the `MSstats` Google Group: \url{https://groups.google.com/g/msstats}. + +Below we outline the workflow for using `MSstats+`, highlighting key options +and decisions. + +## Load packages +```{r} +library(MSstats) +library(MSstatsConvert) +library(data.table) +library(ggplot2) +``` + +## Converter + +The main iForest algorithm for `MSstats+` is implemented in the `MSstatsConvert` +package (a dependency of `MSstats`). The `MSstatsConvert` package contains +individual converters for each spectral processing tool. Currently, `MSstats+` +only has a dedicated converter for `Spectronaut`, however more converters are +in development. For integration with other tools, please reach out on the +`MSstats` Google Group: \url{https://groups.google.com/g/msstats}. + +### Load data + +First we will load the Spectronaut report of identified and quantified peaks. It +is necessary that users output all the spectral peak quality data along with +these peaks. These spectral peak quality metrics must be at least at the +precursor-level (i.e., the column headers must not start with `F.`). There are +multiple options for columns that can be extracted, some +examples include: `FG.ShapeQualityScore (MS1)`, `FG.ShapeQualityScore (MS2)`, +and `EG.DeltaRT`. Users can include as many quality metrics as they want, +however we recommend using only quality metrics that are key indicators of the +quality of a spectral peak. Including too many quality metrics may lead to +overfitting of the iForest model. We include functionality to assess the +performance of the model (discussed below). + +An example output from Spectronaut is included in the +`MSstatsConvert` package to provide an idea of what the input should look like. + +In addition to the Spectronaut output, `MSstats+` also requires an annotation +file which maps the MS run names to the corresponding experimental design. For +more details on creating an annotation file, please see the documentation in +standard `MSstats` and the associated publications. + +Finally, `MSstats+` also requires the temporal order of MS run acquisition to +identify time-based drops in quality. This can be extracted from the Spectronaut +output as shown below. + +```{r} +# Spectronaut quant file +spectronaut_raw = system.file( + "tinytest/raw_data/Spectronaut/spectronaut_quality_input.csv", + package = "MSstatsConvert") +spectronaut_raw = data.table::fread(spectronaut_raw) + +# Annotation file +annotation = system.file( + "tinytest/raw_data/Spectronaut/annotation.csv", + package = "MSstatsConvert") +annotation = data.table::fread(annotation) + +# Create temporal run ordering data.table +spectronaut_raw[, `R.Run Date (Formatted)` := as.POSIXct( + `R.Run Date (Formatted)`, format = "%m/%d/%Y %I:%M:%S %p")] +run_order = unique(spectronaut_raw[, c("R.Run Date (Formatted)", + "R.FileName")]) +run_order = run_order[order(`R.Run Date (Formatted)`)] +run_order$Order = 1:nrow(run_order) +setnames(run_order, c("R.FileName"), c("Run")) +``` + +### Run converter + +`MSstats+` uses the same converter as base `MSstats`. New parameters have been +added to indicate if a user wants to leverage the functionalities in `MSstats+`. +Specifically, to use `MSstats+`, users must indicate that the anomaly detection +algorithm should used by setting `calculateAnomalyScores=TRUE`. + +An example specification of the converter can be seen below, with key parameter +options highlighted below the code. + +```{r} +msstats_input = MSstatsConvert::SpectronauttoMSstatsFormat( + spectronaut_raw, annotation, intensity = 'PeakArea', + excludedFromQuantificationFilter = TRUE, # Pre-filtering + calculateAnomalyScores=TRUE, # Key parameter to use MSstats+ + anomalyModelFeatures=c("FGShapeQualityScore(MS2)", + "FGShapeQualityScore(MS1)", + "EGDeltaRT"), # Quality metrics + anomalyModelFeatureTemporal=c( + "mean_decrease", "mean_decrease", + "dispersion_increase"), # Temporal direction of quality metrics + runOrder=run_order, # Temporal order of MS runs + n_trees=100, + max_depth="auto", # iForest parameter (depth of trees) + numberOfCores=1) # Cores to use for parallel processing (Mac or Linux only) + +head(msstats_input) +``` + +Key parameters for `MSstats+`: +- `calculateAnomalyScores`: Set to `TRUE` to use the iForest algorithm to + calculate anomaly scores for each spectral peak. +- `anomalyModelFeatures`: A character vector of the column names of the quality metrics to be used in the iForest model. These must be at the precursor-level + or higher (i.e., the column headers must not start with `F.`). +- `anomalyModelFeatureTemporal`: A character vector of the same length as + `anomalyModelFeatures` indicating the temporal direction of each quality + metric.9 Possible values are `mean_decrease`, `mean_increase`, and + `dispersion_increase`. These indicate the direction a metric is changes which + indicates a drop in quality (i.e., decrease, increase, or can go in either + direction). For example, `EG.DeltaRT` uses `dispersion_increase` because a + deviation in expected retention can be either higher or lower. +- `runOrder`: A data.table with two columns: `R.FileName` and `Order`. The + `R.FileName` column should contain the MS run names as they appear in the + annotation file, and the `Order` column should contain the temporal order of + acquisition (1 being the first run acquired, 2 being the second, etc.). This + is used to identify time-based drops in quality. +- `n_trees`: The number of trees to use in the iForest. Default is 100. Too few + trees will make the algorithm unstable, while too many trees will increase + computation time without performance improvement. +- `max_depth`: The maximum depth of each tree in the iForest. Default is + `"auto"`, which sets the depth to `ceiling(log2(number of samples))`. +- `numberOfCores`: The number of cores to use for parallel processing. This is + recommended since the algorithm is fit on the precursor-level and can take + significant time to process if using a single core. Only available on Mac and + Linux. + +### Check model fit + +After running the converter, it is important to check the fit of the iForest. +The iForest is an unsupervised algorithm, so there is no ground truth to compare +to. However, we can check the distribution of the anomaly scores to ensure that +the model is performing as expected. The anomaly scores should be right-skewed, +with most of the scores being low (indicating normal peaks) and a small number +of scores being high (indicating anomalous peaks). If the distribution is not +right-skewed, it may indicate that the quality metrics used are not appropriate, +or that there are not many poor quality peaks (generally found in smaller +studies using less complex biological material). In these cases, it may make +sense to use base `MSstats` instead of `MSstats+`. + +The fit of the model can be evaluated using the `CheckDataHealth` function in `MSstatsConvert`. This function will calculate Pearson's moment coefficient of +skewness for each iForest model fit (each precursor). We recommend plotting +these coefficients using a histogram and confirming the distribution is positive +and shifted away from zero (with a median around 1). + +An example of how to use the function is shown below. + +```{r} +health_info = CheckDataHealth(msstats_input) +skew_score = health_info[[2]] + +ggplot(skew_score, aes(x = skew)) + + geom_histogram(fill = "#E69F00", color = "black", bins=12) + + geom_vline(xintercept = 0, linetype = "dashed", + color = "black", linewidth = 1.5) + + theme_minimal(base_size = 16) + + labs( + title="Distribution of skewness of anomaly scores", + x = "Pearson's moment coefficient of skewness", + y = "Count" + ) + + theme( + axis.text.x = element_text(size = 16), + axis.text.y = element_text(size = 16), + axis.title.x = element_text(size = 18), + axis.title.y = element_text(size = 18) + ) + +``` + +We can see in the example data that the distribution is shifted right away from +zero. This indicates that the anomaly model was able to detect poor quality +measurements and the user can proceed with the next step. + +## Manual formatting and data conversion + +The `MSstatsConvert` package includes an automatic `MSstats+` converter for +Spectronaut, however other tools require manual formatting. This can be done by +leveraging the existing `MSstats` converters, joining the quality metrics, and +running the anomaly model separately. An example workflow is included below. + +```{r} + +# Extract quality metrics from PSM output +quality_join_cols = c("R.FileName", "PG.ProteinGroups", "EG.ModifiedSequence", + "FG.Charge", "FG.ShapeQualityScore (MS1)", + "FG.ShapeQualityScore (MS2)", "EG.DeltaRT") +quality_join_df = spectronaut_raw[, ..quality_join_cols] +quality_join_df$EG.ModifiedSequence = stringr::str_replace_all( + quality_join_df$EG.ModifiedSequence, "\\_", "") +quality_join_df = unique(quality_join_df) + +# In case there are duplicates (not common) +quality_join_df = quality_join_df[ + , lapply(.SD, mean, na.rm = TRUE), + by = .(R.FileName, PG.ProteinGroups, EG.ModifiedSequence, FG.Charge), + .SDcols = c("FG.ShapeQualityScore (MS1)", + "FG.ShapeQualityScore (MS2)", + "EG.DeltaRT") +] + +# Extract run order +spectronaut_raw[, `R.Run Date (Formatted)` := as.POSIXct( + `R.Run Date (Formatted)`, format = "%m/%d/%Y %I:%M:%S %p")] +run_order = unique(spectronaut_raw[, c("R.Run Date (Formatted)", + "R.FileName")]) +run_order = run_order[order(`R.Run Date (Formatted)`)] +run_order$Order = 1:nrow(run_order) +setnames(run_order, c("R.FileName"), c("Run")) + +# Run standard MSstats converter without anomaly detection +msstats_input = MSstatsConvert::SpectronauttoMSstatsFormat( + spectronaut_raw, annotation, intensity = 'PeakArea', + excludedFromQuantificationFilter = TRUE, # Pre-filtering + calculateAnomalyScores=FALSE) # Turn anomaly detection off + +# Join in quality metrics +msstats_input = merge(as.data.table(msstats_input), + as.data.table(quality_join_df), + all.x=TRUE, all.y=FALSE, + by.x=c("Run", "ProteinName", "PeptideSequence", + "PrecursorCharge"), + by.y=c("R.FileName", "PG.ProteinGroups", + "EG.ModifiedSequence", "FG.Charge")) + +# Run Anomaly Model separately +input = MSstatsConvert::MSstatsAnomalyScores( + msstats_input, + c("FG.ShapeQualityScore (MS1)", "FG.ShapeQualityScore (MS2)", "EG.DeltaRT"), + c("mean_decrease", "mean_decrease", "dispersion_increase"), + .5, 100, run_order, 100, "auto", 1) +``` + +The resulting data can now be passed into the remaining `MSstats+` workflow as +listed below. + +## Data processing and summarization + +After running the converter with `MSstats+` options enabled, the next step is to +run the standard `MSstats` `dataProcess` function. This function will +automatically leverage the information from the `MSstats+` option, as long as +the linear summarization option is turned on. The code to do this is highlighted +below. + +```{r} +summarized = dataProcess(msstats_input, + normalization=FALSE, # no normalization + MBimpute=TRUE, # Use imputation + summaryMethod="linear" # Key MSstats+ parameter + ) + +head(summarized$ProteinLevelData) +``` + +Key parameters for `MSstats+`: +- `summaryMethod`: Set to `linear` to use the peak quality-weighted linear + regression summarization. This is a key parameter for `MSstats+`. Other + summarization methods (e.g., Tukey's median polish) will not leverage the + peak quality information. + +When the `dataProcess` function is run with `summaryMethod=linear` a new column +will be included called `Variance` indicating the variance of the underlying +protein abundance estimate. This variance is used in the next step of the +workflow. + +## Differential analysis + +The final step of the `MSstats+` workflow is to perform differential analysis. +This is done using the standard `groupComparison` function in `MSstats`. Similar +to the `dataProcess` function, the `groupComparison` function will automatically +leverage the information from the `MSstats+` option. The code to do this is highlighted below. + +```{r} +comparison = matrix(c(-1,1),nrow=1) +row.names(comparison) = "Condition2-Condition1" +colnames(comparison) = c("Condition1", "Condition2") + +comparison_result = groupComparison( + contrast.matrix = comparison, + data=summarized) + +head(comparison_result$ComparisonResult) +``` + +The `groupComparison` function is run the exact same in `MSstats+` as `MSstats`, +but under the hood the variation from the previous step is used in a second +weighted linear regression (in comparison to the unweighted version in +`MSstats`). + +## Compare results to base `MSstats` + +```{r} +base_msstats_input = MSstatsConvert::SpectronauttoMSstatsFormat( + spectronaut_raw, annotation, intensity = 'PeakArea', + excludedFromQuantificationFilter = TRUE, # Pre-filtering + calculateAnomalyScores=FALSE) # No anomaly model + +base_summarized = dataProcess(base_msstats_input, + normalization=FALSE, # no normalization + MBimpute=TRUE, # Use imputation + summaryMethod="TMP" # Tukey median polish summarization + ) + +base_comparison_result = groupComparison( + contrast.matrix = comparison, + data=base_summarized) + +head(base_comparison_result$ComparisonResult) +``` + +We can see that `MSstats` does not identify all the proteins as differentially abundant, even though there was a true log$_2$ fold change of -1. We can compare the fold change and standard error estimates between `MSstats+` and base `MSstats` to see why the p-values are different. + +```{r} +comparison_result$ComparisonResult$Model = "MSstats+" +base_comparison_result$ComparisonResult$Model = "MSstats" + +merged_results = rbindlist(list( + comparison_result$ComparisonResult, + base_comparison_result$ComparisonResult +)) + +ggplot(data=merged_results) + + geom_col(aes(x=Protein, y=abs(log2FC), fill=Model), position="dodge") + + geom_hline(yintercept=1, linetype="dashed", color="black", linewidth=1) + + theme_bw(base_size =16) + + labs(title="Log fold change comparison", + x="MSstats+", + y="Base MSstats") +``` + +The fold changes are both near the ground truth and `MSstats+` does not clearly outperform base `MSstats`. + +```{r} +ggplot(data=merged_results) + + geom_col(aes(x=Protein, y=SE, fill=Model), position="dodge") + + theme_bw(base_size =16) + + labs(title="Standard error comparison", + x="MSstats+", + y="Base MSstats") +``` + +We can see the advantage of `MSstats+` in the standard error estimates. `MSstats+` downweights poorly quantified measurements and runs, which can lead to much better estimates of the true standard error. In this example `MSstats+` downweighted the runs which added a large amount of uncertainty, leading to it identifying all proteins as differentially abundant. + + From f7a957b66c45d021e7946ae61385382edcfb2cb7 Mon Sep 17 00:00:00 2001 From: devonjkohler Date: Fri, 25 Jul 2025 11:40:53 -0400 Subject: [PATCH 15/20] documentation bugs --- NAMESPACE | 1 - R/dataProcess.R | 9 +- R/utils_groupcomparison.R | 2 +- R/utils_output.R | 2 +- man/MSstatsSummarizationOutput.Rd | 2 +- man/MSstatsSummarizeSingleLinear.Rd | 2 +- man/MSstatsSummarizeSingleTMP.Rd | 2 +- man/MSstatsSummarizeWithSingleCore.Rd | 2 +- vignettes/MSstats+.Rmd | 323 -------------------------- 9 files changed, 10 insertions(+), 335 deletions(-) delete mode 100644 vignettes/MSstats+.Rmd diff --git a/NAMESPACE b/NAMESPACE index 3fe088d2..4bc5cc3a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -100,7 +100,6 @@ importFrom(parallel,clusterExport) importFrom(parallel,makeCluster) importFrom(parallel,parLapply) importFrom(parallel,stopCluster) -importFrom(pcaMethods,pca) importFrom(plotly,add_trace) importFrom(plotly,ggplotly) importFrom(plotly,layout) diff --git a/R/dataProcess.R b/R/dataProcess.R index 27e3e14c..262e9991 100755 --- a/R/dataProcess.R +++ b/R/dataProcess.R @@ -197,7 +197,7 @@ dataProcess = function( #' a logfile named `MSstats_dataProcess_log_progress.log` is created to #' track progress. Only works for Linux & Mac OS. Default is 1. #' -#' @importFrom parallel makeCluster parLapply stopCluster clusterExport +#' @importFrom parallel makeCluster parLapply stopCluster clusterExport #' #' @return list of length one with run-level data. #' @@ -280,7 +280,7 @@ MSstatsSummarizeWithMultipleCores = function(input, method, impute, censored_sym #' input = MSstatsSelectFeatures(input, "all") #' processed = getProcessed(input) #' input = MSstatsPrepareForSummarization(input, method, impute, cens, FALSE) -#' summarized = MSstatsSummarizeWithSingleCore(input, method, impute, cens, FALSE, TRUE) +#' summarized = MSstatsSummarizeWithSingleCore(input, method, impute, cens, FALSE, TRUE, 100) #' length(summarized) # list of summarization outputs for each protein #' head(summarized[[1]][[1]]) # run-level summary #' @@ -395,7 +395,6 @@ MSstatsSummarize = function(proteins_list, method, impute, censored_symbol, #' @return list with protein-level data #' #' @importFrom stats xtabs -#' @importFrom pcaMethods pca #' #' @export #' @@ -413,7 +412,7 @@ MSstatsSummarize = function(proteins_list, method, impute, censored_symbol, #' input = MSstatsSelectFeatures(input, "all") #' input = MSstatsPrepareForSummarization(input, method, impute, cens, FALSE) #' input_split = split(input, input$PROTEIN) -#' single_protein_summary = MSstatsSummarizeSingleLinear(input_split[[1]]) +#' single_protein_summary = MSstatsSummarizeSingleLinear(input_split[[1]], impute, cens, TRUE, 100) #' head(single_protein_summary[[1]]) #' MSstatsSummarizeSingleLinear = function(single_protein, @@ -520,7 +519,7 @@ MSstatsSummarizeSingleLinear = function(single_protein, #' input = MSstatsPrepareForSummarization(input, method, impute, cens, FALSE) #' input_split = split(input, input$PROTEIN) #' single_protein_summary = MSstatsSummarizeSingleTMP(input_split[[1]], -#' impute, cens, FALSE) +#' impute, cens, FALSE, 100) #' head(single_protein_summary[[1]]) #' MSstatsSummarizeSingleTMP = function(single_protein, impute, censored_symbol, diff --git a/R/utils_groupcomparison.R b/R/utils_groupcomparison.R index 62a9dde5..d9a00d01 100644 --- a/R/utils_groupcomparison.R +++ b/R/utils_groupcomparison.R @@ -177,7 +177,7 @@ getSamplesInfo = function(summarization_output) { )) df_full = suppressMessages(try( lm(ABUNDANCE ~ GROUP + SUBJECT, - data = input, weights=weights)$df.residual, + data = input, weights=weight_input)$df.residual, TRUE)) } else { full_fit = suppressMessages(try( diff --git a/R/utils_output.R b/R/utils_output.R index 2a185155..56fc067b 100644 --- a/R/utils_output.R +++ b/R/utils_output.R @@ -32,7 +32,7 @@ #' input = MSstatsSelectFeatures(input, "all") #' processed = getProcessed(input) #' input = MSstatsPrepareForSummarization(input, method, impute, cens, FALSE) -#' summarized = MSstatsSummarizeWithSingleCore(input, method, impute, cens, FALSE, TRUE) +#' summarized = MSstatsSummarizeWithSingleCore(input, method, impute, cens, FALSE, TRUE, 100) #' output = output = MSstatsSummarizationOutput(input, summarized, processed, #' method, impute, cens) #' diff --git a/man/MSstatsSummarizationOutput.Rd b/man/MSstatsSummarizationOutput.Rd index 280087c0..ff06aa9c 100644 --- a/man/MSstatsSummarizationOutput.Rd +++ b/man/MSstatsSummarizationOutput.Rd @@ -53,7 +53,7 @@ input = MSstatsHandleMissing(input, "TMP", TRUE, "NA", 0.999) input = MSstatsSelectFeatures(input, "all") processed = getProcessed(input) input = MSstatsPrepareForSummarization(input, method, impute, cens, FALSE) -summarized = MSstatsSummarizeWithSingleCore(input, method, impute, cens, FALSE, TRUE) +summarized = MSstatsSummarizeWithSingleCore(input, method, impute, cens, FALSE, TRUE, 100) output = output = MSstatsSummarizationOutput(input, summarized, processed, method, impute, cens) diff --git a/man/MSstatsSummarizeSingleLinear.Rd b/man/MSstatsSummarizeSingleLinear.Rd index f30bca6b..86e19542 100644 --- a/man/MSstatsSummarizeSingleLinear.Rd +++ b/man/MSstatsSummarizeSingleLinear.Rd @@ -38,7 +38,7 @@ input = MSstatsHandleMissing(input, "TMP", TRUE, "NA", 0.999) input = MSstatsSelectFeatures(input, "all") input = MSstatsPrepareForSummarization(input, method, impute, cens, FALSE) input_split = split(input, input$PROTEIN) -single_protein_summary = MSstatsSummarizeSingleLinear(input_split[[1]]) +single_protein_summary = MSstatsSummarizeSingleLinear(input_split[[1]], impute, cens, TRUE, 100) head(single_protein_summary[[1]]) } diff --git a/man/MSstatsSummarizeSingleTMP.Rd b/man/MSstatsSummarizeSingleTMP.Rd index f4a20dcd..7ec2c7c9 100644 --- a/man/MSstatsSummarizeSingleTMP.Rd +++ b/man/MSstatsSummarizeSingleTMP.Rd @@ -51,7 +51,7 @@ input = MSstatsSelectFeatures(input, "all") input = MSstatsPrepareForSummarization(input, method, impute, cens, FALSE) input_split = split(input, input$PROTEIN) single_protein_summary = MSstatsSummarizeSingleTMP(input_split[[1]], - impute, cens, FALSE) + impute, cens, FALSE, 100) head(single_protein_summary[[1]]) } diff --git a/man/MSstatsSummarizeWithSingleCore.Rd b/man/MSstatsSummarizeWithSingleCore.Rd index de793817..6998458b 100644 --- a/man/MSstatsSummarizeWithSingleCore.Rd +++ b/man/MSstatsSummarizeWithSingleCore.Rd @@ -59,7 +59,7 @@ input = MSstatsHandleMissing(input, "TMP", TRUE, "NA", 0.999) input = MSstatsSelectFeatures(input, "all") processed = getProcessed(input) input = MSstatsPrepareForSummarization(input, method, impute, cens, FALSE) -summarized = MSstatsSummarizeWithSingleCore(input, method, impute, cens, FALSE, TRUE) +summarized = MSstatsSummarizeWithSingleCore(input, method, impute, cens, FALSE, TRUE, 100) length(summarized) # list of summarization outputs for each protein head(summarized[[1]][[1]]) # run-level summary diff --git a/vignettes/MSstats+.Rmd b/vignettes/MSstats+.Rmd deleted file mode 100644 index 6720cbf9..00000000 --- a/vignettes/MSstats+.Rmd +++ /dev/null @@ -1,323 +0,0 @@ ---- -title: "MSstats+" -output: html_document ---- - -```{r style, echo = FALSE, results = 'asis'} -BiocStyle::markdown() -``` - -```{r global_options, include=FALSE} -knitr::opts_chunk$set(fig.width=10, fig.height=7, warning=FALSE, message=FALSE) -options(width=110) -``` - -```{=html} - -``` - -# __MSstats+: Peak quality-weighted differential analysis__ - -Author: Devon Kohler - -`MSstats+` is an extension to the `MSstats` R package in Bioconductor. It is -designed to incorporate the underlying spectral peak quality into differential -analysis, going beyond peak intensity alone. It integrates the underlying -spectral peaks using an isolation forest (iForest) for anomaly detection, which -integrates all the peak quality metrics into a single score. This score is then -used as weights in protein-level summarization weighted linear regression. After -protein-level summarization, the estimates of underlying protein abundance and -the estimates associated variance are used as weights in a second weighted -linear model which is used to perform differential analysis. It can be used -for experiments with simple group comparison designs, or more complex designs -(e.g. comparing more than two experimental conditions, or a repeated measure -design, such as a time course). - -The algorithm is designed to be used with data acquired using data independent -acquisition (DIA). Other acquisitions are possible but have yet to be tested. -For guidance on how to use `MSstats+` with other acquisitions, please reach out -on the `MSstats` Google Group: \url{https://groups.google.com/g/msstats}. - -Below we outline the workflow for using `MSstats+`, highlighting key options -and decisions. - -## Load packages -```{r} -library(MSstats) -library(MSstatsConvert) -library(data.table) -library(ggplot2) -``` - -## Converter - -The main iForest algorithm for `MSstats+` is implemented in the `MSstatsConvert` -package (a dependency of `MSstats`). The `MSstatsConvert` package contains -individual converters for each spectral processing tool. Currently, `MSstats+` -only has a dedicated converter for `Spectronaut`, however more converters are -in development. For integration with other tools, please reach out on the -`MSstats` Google Group: \url{https://groups.google.com/g/msstats}. - -### Load data - -First we will load the Spectronaut report of identified and quantified peaks. It -is necessary that users output all the spectral peak quality data along with -these peaks. These spectral peak quality metrics must be at least at the -precursor-level (i.e., the column headers must not start with `F.`). There are -multiple options for columns that can be extracted, some -examples include: `FG.ShapeQualityScore (MS1)`, `FG.ShapeQualityScore (MS2)`, -and `EG.DeltaRT`. Users can include as many quality metrics as they want, -however we recommend using only quality metrics that are key indicators of the -quality of a spectral peak. Including too many quality metrics may lead to -overfitting of the iForest model. We include functionality to assess the -performance of the model (discussed below). - -An example output from Spectronaut is included in the -`MSstatsConvert` package to provide an idea of what the input should look like. - -In addition to the Spectronaut output, `MSstats+` also requires an annotation -file which maps the MS run names to the corresponding experimental design. For -more details on creating an annotation file, please see the documentation in -standard `MSstats` and the associated publications. - -Finally, `MSstats+` also requires the temporal order of MS run acquisition to -identify time-based drops in quality. This can be extracted from the Spectronaut -output as shown below. - -```{r} - -# Spectronaut quant file -spectronaut_raw = system.file( - "tinytest/raw_data/Spectronaut/spectronaut_quality_input.csv", - package = "MSstatsConvert") -spectronaut_raw = data.table::fread(spectronaut_raw) - -# Annotation file -annotation = system.file( - "tinytest/raw_data/Spectronaut/annotation.csv", - package = "MSstatsConvert") -annotation = data.table::fread(annotation) - -# Create temporal run ordering data.table -spectronaut_raw[, `R.Run Date (Formatted)` := as.POSIXct( - `R.Run Date (Formatted)`, format = "%m/%d/%Y %I:%M:%S %p")] -run_order = unique(spectronaut_raw[, c("R.Run Date (Formatted)", - "R.FileName")]) -run_order = run_order[order(`R.Run Date (Formatted)`)] -run_order$Order = 1:nrow(run_order) -setnames(run_order, c("R.FileName"), c("Run")) -``` - -### Run converter - -`MSstats+` uses the same converter as base `MSstats`. New parameters have been -added to indicate if a user wants to leverage the functionalities in `MSstats+`. -Specifically, to use `MSstats+`, users must indicate that the anomaly detection -algorithm should used by setting `calculateAnomalyScores=TRUE`. - -An example specification of the converter can be seen below, with key parameter -options highlighted below the code. - -```{r} -msstats_input = MSstatsConvert::SpectronauttoMSstatsFormat( - spectronaut_raw, annotation, intensity = 'PeakArea', - excludedFromQuantificationFilter = TRUE, # Pre-filtering - calculateAnomalyScores=TRUE, # Key parameter to use MSstats+ - anomalyModelFeatures=c("FGShapeQualityScore(MS2)", - "FGShapeQualityScore(MS1)", - "EGDeltaRT"), # Quality metrics - anomalyModelFeatureTemporal=c( - "mean_decrease", "mean_decrease", - "dispersion_increase"), # Temporal direction of quality metrics - runOrder=run_order, # Temporal order of MS runs - n_trees=100, - max_depth="auto", # iForest parameter (depth of trees) - numberOfCores=1) # Cores to use for parallel processing (Mac or Linux only) - -head(msstats_input) -``` - -Key parameters for `MSstats+`: -- `calculateAnomalyScores`: Set to `TRUE` to use the iForest algorithm to - calculate anomaly scores for each spectral peak. -- `anomalyModelFeatures`: A character vector of the column names of the quality metrics to be used in the iForest model. These must be at the precursor-level - or higher (i.e., the column headers must not start with `F.`). -- `anomalyModelFeatureTemporal`: A character vector of the same length as - `anomalyModelFeatures` indicating the temporal direction of each quality - metric.9 Possible values are `mean_decrease`, `mean_increase`, and - `dispersion_increase`. These indicate the direction a metric is changes which - indicates a drop in quality (i.e., decrease, increase, or can go in either - direction). For example, `EG.DeltaRT` uses `dispersion_increase` because a - deviation in expected retention can be either higher or lower. -- `runOrder`: A data.table with two columns: `R.FileName` and `Order`. The - `R.FileName` column should contain the MS run names as they appear in the - annotation file, and the `Order` column should contain the temporal order of - acquisition (1 being the first run acquired, 2 being the second, etc.). This - is used to identify time-based drops in quality. -- `n_trees`: The number of trees to use in the iForest. Default is 100. Too few - trees will make the algorithm unstable, while too many trees will increase - computation time without performance improvement. -- `max_depth`: The maximum depth of each tree in the iForest. Default is - `"auto"`, which sets the depth to `ceiling(log2(number of samples))`. -- `numberOfCores`: The number of cores to use for parallel processing. This is - recommended since the algorithm is fit on the precursor-level and can take - significant time to process if using a single core. Only available on Mac and - Linux. - -### Check model fit - -After running the converter, it is important to check the fit of the iForest. -The iForest is an unsupervised algorithm, so there is no ground truth to compare -to. However, we can check the distribution of the anomaly scores to ensure that -the model is performing as expected. The anomaly scores should be right-skewed, -with most of the scores being low (indicating normal peaks) and a small number -of scores being high (indicating anomalous peaks). If the distribution is not -right-skewed, it may indicate that the quality metrics used are not appropriate, -or that there are not many poor quality peaks (generally found in smaller -studies using less complex biological material). In these cases, it may make -sense to use base `MSstats` instead of `MSstats+`. - -The fit of the model can be evaluated using the `CheckDataHealth` function in `MSstatsConvert`. This function will calculate Pearson's moment coefficient of -skewness for each iForest model fit (each precursor). We recommend plotting -these coefficients using a histogram and confirming the distribution is positive -and shifted away from zero (with a median around 1). - -An example of how to use the function is shown below. - -```{r} -health_info = CheckDataHealth(msstats_input) -skew_score = health_info[[2]] - -ggplot(skew_score, aes(x = skew)) + - geom_histogram(fill = "#E69F00", color = "black", bins=12) + - geom_vline(xintercept = 0, linetype = "dashed", - color = "black", linewidth = 1.5) + - theme_minimal(base_size = 16) + - labs( - title="Distribution of skewness of anomaly scores", - x = "Pearson's moment coefficient of skewness", - y = "Count" - ) + - theme( - axis.text.x = element_text(size = 16), - axis.text.y = element_text(size = 16), - axis.title.x = element_text(size = 18), - axis.title.y = element_text(size = 18) - ) - -``` - -We can see in the example data that the distribution is shifted right away from -zero. This indicates that the anomaly model was able to detect poor quality -measurements and the user can proceed with the next step. - -## Data processing and summarization - -After running the converter with `MSstats+` options enabled, the next step is to -run the standard `MSstats` `dataProcess` function. This function will -automatically leverage the information from the `MSstats+` option, as long as -the linear summarization option is turned on. The code to do this is highlighted -below. - -```{r} -summarized = dataProcess(msstats_input, - normalization=FALSE, # no normalization - MBimpute=TRUE, # Use imputation - summaryMethod="linear" # Key MSstats+ parameter - ) - -head(summarized$ProteinLevelData) -``` - -Key parameters for `MSstats+`: -- `summaryMethod`: Set to `linear` to use the peak quality-weighted linear - regression summarization. This is a key parameter for `MSstats+`. Other - summarization methods (e.g., Tukey's median polish) will not leverage the - peak quality information. - -When the `dataProcess` function is run with `summaryMethod=linear` a new column -will be included called `Variance` indicating the variance of the underlying -protein abundance estimate. This variance is used in the next step of the -workflow. - -## Differential analysis - -The final step of the `MSstats+` workflow is to perform differential analysis. -This is done using the standard `groupComparison` function in `MSstats`. Similar -to the `dataProcess` function, the `groupComparison` function will automatically -leverage the information from the `MSstats+` option. The code to do this is highlighted below. - -```{r} -comparison = matrix(c(-1,1),nrow=1) -row.names(comparison) = "Condition2-Condition1" -colnames(comparison) = c("Condition1", "Condition2") - -comparison_result = groupComparison( - contrast.matrix = comparison, - data=summarized) - -head(comparison_result$ComparisonResult) -``` - -The `groupComparison` function is run the exact same in `MSstats+` as `MSstats`, -but under the hood the variation from the previous step is used in a second -weighted linear regression (in comparison to the unweighted version in -`MSstats`). - -## Compare results to base `MSstats` - -```{r} -base_msstats_input = MSstatsConvert::SpectronauttoMSstatsFormat( - spectronaut_raw, annotation, intensity = 'PeakArea', - excludedFromQuantificationFilter = TRUE, # Pre-filtering - calculateAnomalyScores=FALSE) # No anomaly model - -base_summarized = dataProcess(base_msstats_input, - normalization=FALSE, # no normalization - MBimpute=TRUE, # Use imputation - summaryMethod="TMP" # Tukey median polish summarization - ) - -base_comparison_result = groupComparison( - contrast.matrix = comparison, - data=base_summarized) - -head(base_comparison_result$ComparisonResult) -``` - -We can see that `MSstats` does not identify all the proteins as differentially abundant, even though there was a true log$_2$ fold change of -1. We can compare the fold change and standard error estimates between `MSstats+` and base `MSstats` to see why the p-values are different. - -```{r} -comparison_result$ComparisonResult$Model = "MSstats+" -base_comparison_result$ComparisonResult$Model = "MSstats" - -merged_results = rbindlist(list( - comparison_result$ComparisonResult, - base_comparison_result$ComparisonResult -)) - -ggplot(data=merged_results) + - geom_col(aes(x=Protein, y=abs(log2FC), fill=Model), position="dodge") + - geom_hline(yintercept=1, linetype="dashed", color="black", linewidth=1) + - theme_bw(base_size =16) + - labs(title="Log fold change comparison", - x="MSstats+", - y="Base MSstats") -``` - -The fold changes are both near the ground truth and `MSstats+` does not clearly outperform base `MSstats`. - -```{r} -ggplot(data=merged_results) + - geom_col(aes(x=Protein, y=SE, fill=Model), position="dodge") + - theme_bw(base_size =16) + - labs(title="Standard error comparison", - x="MSstats+", - y="Base MSstats") -``` - -We can see the advantage of `MSstats+` in the standard error estimates. `MSstats+` downweights poorly quantified measurements and runs, which can lead to much better estimates of the true standard error. In this example `MSstats+` downweighted the runs which added a large amount of uncertainty, leading to it identifying all proteins as differentially abundant. - From e019a9a3c5cff10935ca546f649773fd69190628 Mon Sep 17 00:00:00 2001 From: devonjkohler Date: Tue, 5 Aug 2025 08:59:53 -0400 Subject: [PATCH 16/20] MSstatsBig example --- vignettes/MSstatsPlus.Rmd | 88 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/vignettes/MSstatsPlus.Rmd b/vignettes/MSstatsPlus.Rmd index 3f8f0230..bdb7ccd1 100644 --- a/vignettes/MSstatsPlus.Rmd +++ b/vignettes/MSstatsPlus.Rmd @@ -375,3 +375,91 @@ ggplot(data=merged_results) + We can see the advantage of `MSstats+` in the standard error estimates. `MSstats+` downweights poorly quantified measurements and runs, which can lead to much better estimates of the true standard error. In this example `MSstats+` downweighted the runs which added a large amount of uncertainty, leading to it identifying all proteins as differentially abundant. +## MSstatsBig + +MSstats+ can also be used in conjunction with MSstatsBig to analyze large DIA +datasets. The workflow leverages the MSstatsBig converter and an external +execution of the MSstats+ isolation forest. An example of how to do this is +shown below. + +This section of the vignette does not run automatically and must be executed +manually. + +```{r eval=FALSE, include=TRUE} +library(MSstatsBig) + +# Define file path to data +spectronaut_raw = system.file( + "tinytest/raw_data/Spectronaut/spectronaut_quality_input.csv", + package = "MSstatsConvert") + +# Run MSstatsBig converter +converted_data = bigSpectronauttoMSstatsFormat( + input_file=spectronaut_raw, + output_file_name="output_file.csv", + intensity = "F.PeakHeight", + filter_by_excluded = TRUE, + filter_by_identified = FALSE, + filter_by_qvalue = TRUE, + qvalue_cutoff = 0.01, + filter_unique_peptides = TRUE, + aggregate_psms = TRUE, + filter_few_obs = FALSE, + remove_annotation = FALSE, + calculateAnomalyScores=TRUE, + anomalyModelFeatures=c("FG.ShapeQualityScore (MS1)", + "FG.ShapeQualityScore (MS2)", + "EG.ApexRT"), + backend="arrow") +converted_data = dplyr::collect(converted_data) + +# Prepare info for anomaly model +# Get temporal ordering from input file (can also be done externally) +temporal = fread(spectronaut_raw)[ + , .SD[1], by = .(`R.Run Date (Formatted)`, R.FileName)] + +temporal = temporal[, Run.TimeParsed := as.POSIXct( + `R.Run Date (Formatted)`, format = "%m/%d/%Y %I:%M:%S %p")][ + order(Run.TimeParsed)][ + , !"Run.TimeParsed"] + +temporal$Order = 1:nrow(temporal) +temporal$Run = temporal$R.FileName +temporal = temporal[, c("Run", "Order")] + +# Specify anomaly model parameters +anomalyModelFeatures=c("FG.ShapeQualityScore (MS2)", + "FG.ShapeQualityScore (MS1)", + "EG.ApexRT") +anomalyModelFeatures = MSstatsConvert:::.standardizeColnames(anomalyModelFeatures) + +anomalyModelFeatureTemporal=c("mean_decrease", + "mean_decrease", + "dispersion_increase") + +n_trees=100 +max_depth="auto" +numberOfCores=1 + +# Run anomaly detection model +msstats_input = MSstatsConvert::MSstatsAnomalyScores( + as.data.table(converted_data), anomalyModelFeatures, + anomalyModelFeatureTemporal, .5, 100, temporal, n_trees, + max_depth, numberOfCores) +head(msstats_input) + +``` + +This code will run the MSstatsBig converter and then run the anomaly model. This +workflow leads to a much faster processing speed compared to using the standard +converter, and allows users to work with Spectronaut files which may be too +large to fit into memory. After running the anomaly model, the resulting data +can be input into the MSstats+ workflow reviewed previously. + +One note here is the temporal ordering. In this example, we extracted the +temporal run order directly from the Spectronaut file. However, in practice this +may not be appropriate for big data files. In cases where the data is large, +this ordering should be extracted externally and provided by the user, or the +user should sample the Spectronaut file rather than load it all into memory. + + From 3a184ca590eed5bbd4fb27dd4391c200227697f3 Mon Sep 17 00:00:00 2001 From: Tony Wu Date: Tue, 26 Aug 2025 16:58:44 -0400 Subject: [PATCH 17/20] test(anomaly_scores): Add unit tests for dataProcess and groupComparison for anomaly score feature --- R/dataProcess.R | 3 +- R/utils_checks.R | 12 ++- inst/tinytest/test_dataProcess.R | 89 ++++++++++++++++++- inst/tinytest/test_groupComparison.R | 45 +++++++++- .../test_utils_groupComparison_model.R | 18 ++++ 5 files changed, 162 insertions(+), 5 deletions(-) create mode 100644 inst/tinytest/test_utils_groupComparison_model.R diff --git a/R/dataProcess.R b/R/dataProcess.R index 262e9991..5fab5b9c 100755 --- a/R/dataProcess.R +++ b/R/dataProcess.R @@ -136,7 +136,8 @@ dataProcess = function( list(method = featureSubset, n_top = n_top_feature, remove_uninformative = remove_uninformative_feature_outlier), list(method = summaryMethod, equal_var = equalFeatureVar), - list(symbol = censoredInt, MB = MBimpute)) + list(symbol = censoredInt, MB = MBimpute), + colnames(raw)) peptides_dict = makePeptidesDictionary(as.data.table(unclass(raw)), normalization) input = MSstatsPrepareForDataProcess(raw, logTrans, fix_missing) diff --git a/R/utils_checks.R b/R/utils_checks.R index 2af35863..1d5537ac 100644 --- a/R/utils_checks.R +++ b/R/utils_checks.R @@ -86,15 +86,23 @@ MSstatsPrepareForDataProcess = function(input, log_base, fix_missing) { #' @param feature_selection list with elements: remove_uninformative #' @param summarization list with elements: method. #' @param imputation list with elements: cutoff, symbol. +#' @param input_columns character vector of input columns #' @keywords internal .checkDataProcessParams = function(log_base, normalization_method, standards_names, feature_selection, - summarization, imputation) { + summarization, imputation, input_columns) { checkmate::assertChoice(log_base, c(2, 10), .var.name = "logTrans") checkmate::assertChoice(summarization$method, c("linear", "TMP"), - .var.name = "summaryMethod") + .var.name = "summaryMethod") getOption("MSstatsLog")("INFO", paste("Summary method:", summarization$method)) + if ("AnomalyScores" %in% input_columns) { + if (summarization$method != "linear") { + stop("AnomalyScores column detected in your input columns. ", + "Please set summaryMethod to 'linear' to use anomaly scores for protein summarization, ", + "or remove the AnomalyScores column from your input data.") + } + } checkmate::assertChoice(imputation$symbol, c("0", "NA"), null.ok = TRUE, .var.name = "censoredInt") getOption("MSstatsLog")("INFO", paste("censoredInt:", imputation$symbol)) diff --git a/inst/tinytest/test_dataProcess.R b/inst/tinytest/test_dataProcess.R index 3b2a8cf0..51b21ad5 100644 --- a/inst/tinytest/test_dataProcess.R +++ b/inst/tinytest/test_dataProcess.R @@ -24,4 +24,91 @@ msstats_input_fractions_techreps = data.table::fread( QuantDataTechRepsFractions = dataProcess(msstats_input_fractions_techreps, use_log_file = FALSE) expect_true(!is.null(QuantDataTechRepsFractions)) -expect_true(nrow(QuantDataTechRepsFractions$FeatureLevelData) > 0) \ No newline at end of file +expect_true(nrow(QuantDataTechRepsFractions$FeatureLevelData) > 0) + +# Test dataProcess with linear summary method and anomaly scores -------------- +msstats_input = data.table::data.table( + ProteinName = c(rep("Q9UFW8", 10), rep("Q96S19", 15)), + PeptideSequence = c(rep("AEFEEQNVR", 5), rep("TALYVTPLDR", 5), + rep("AFPLAEWQPSDVDQR", 5), rep("ASGLLLER", 5), + rep("LowAbundancePeptide", 5)), + PrecursorCharge = rep(2, 25), + FragmentIon = rep("y3", 25), + ProductCharge = rep(1, 25), + IsotopeLabelType = rep("L", 25), + Condition = rep(c("A", "A", "A", "B", "B"), 5), + BioReplicate = rep(seq(1:5), 5), + Run = rep(paste0("Run", seq(1:5)), 5), + Intensity = c(1000, 1500, 2000, 2500, 3000, + 1100, 1600, 2100, 2600, 3100, + 1200, 1700, 2200, 2000, 1700, + 1300, 1800, 1200, 2800, 1800, + 100, 200, 300, 400, 500), + AnomalyScores = c(rep(0.03, 10), c(0.01,0.01,0.01,0.9,0.8), c(0.01,0.01,0.01,0.7,0.5), rep(0.10, 5)) +) + +# Must run with linear summarization +expect_error( + dataProcess(msstats_input, summaryMethod="TMP"), + pattern = "AnomalyScores column detected in your input columns.*set summaryMethod to 'linear'", + info = "Should error when AnomalyScores is present but summaryMethod is not 'linear'" +) + +# Process data with linear summarization +QuantDataLinearAnomaly = dataProcess(msstats_input, + normalization=FALSE, # no normalization + MBimpute=TRUE, # Use imputation + summaryMethod="linear" # Key MSstats+ parameter +) + +expect_true("Variance" %in% colnames(QuantDataLinearAnomaly$ProteinLevelData), + info = "Variance column should be present in ProteinLevelData") +protein_data = QuantDataLinearAnomaly$ProteinLevelData +expect_true(all(protein_data$Variance > 0, na.rm = TRUE), + info = "All variance values should be positive") +expect_true(all(is.finite(protein_data$Variance), na.rm = TRUE), + info = "All variance values should be finite") +variance_values = protein_data[protein_data$Protein == "Q9UFW8"]$Variance +expect_true(all(variance_values == variance_values[1]), + info = paste("Not all variance values are equal for Q9UFW8")) + +# Create comparison dataset with low anomaly scores for contrast +msstats_input_low_anomaly = data.table::data.table( + ProteinName = c(rep("Q9UFW8", 10), rep("Q96S19", 15)), + PeptideSequence = c(rep("AEFEEQNVR", 5), rep("TALYVTPLDR", 5), + rep("AFPLAEWQPSDVDQR", 5), rep("ASGLLLER", 5), + rep("LowAbundancePeptide", 5)), + PrecursorCharge = rep(2, 25), + FragmentIon = rep("y3", 25), + ProductCharge = rep(1, 25), + IsotopeLabelType = rep("L", 25), + Condition = rep(c("A", "A", "A", "B", "B"), 5), + BioReplicate = rep(seq(1:5), 5), + Run = rep(paste0("Run", seq(1:5)), 5), + Intensity = c(1000, 1500, 2000, 2500, 3000, + 1100, 1600, 2100, 2600, 3100, + 1200, 1700, 2200, 2000, 1700, + 1300, 1800, 1200, 2800, 1800, + 100, 200, 300, 400, 500), + AnomalyScores = rep(0.01, 25) # All low anomaly scores +) + +QuantDataLowAnomaly = dataProcess(msstats_input_low_anomaly, + normalization=FALSE, + MBimpute=TRUE, + summaryMethod="linear") +high_anomaly_variance = protein_data$Variance +low_anomaly_variance = QuantDataLowAnomaly$ProteinLevelData$Variance +expect_true(mean(high_anomaly_variance, na.rm = TRUE) > mean(low_anomaly_variance, na.rm = TRUE), + info = "Mean variance should be higher for dataset with high anomaly scores") + + +# Verify that without AnomalyScores, different behavior occurs +msstats_input_no_anomaly = msstats_input[, !c("AnomalyScores")] +QuantDataNoAnomaly = dataProcess(msstats_input_no_anomaly, + normalization=FALSE, + MBimpute=TRUE, + summaryMethod="linear") +no_anomaly_variance = QuantDataNoAnomaly$ProteinLevelData$Variance +expect_false(identical(high_anomaly_variance, no_anomaly_variance), + info = "Variance should be different when AnomalyScores are present vs absent") \ No newline at end of file diff --git a/inst/tinytest/test_groupComparison.R b/inst/tinytest/test_groupComparison.R index fc3997a3..9437568c 100644 --- a/inst/tinytest/test_groupComparison.R +++ b/inst/tinytest/test_groupComparison.R @@ -17,4 +17,47 @@ testResultParallelComparison = groupComparison(contrast.matrix=comparison, numberOfCores = 2) expect_equal(nrow(testResultDefaultComparison$ComparisonResult), - nrow(testResultParallelComparison$ComparisonResult)) \ No newline at end of file + nrow(testResultParallelComparison$ComparisonResult)) + +# Test groupComparison with weights +protein_level_data_with_anomalies = data.table::data.table( + RUN = c( "1", "2", "3", "4", "5", "1", "2", "3", "4", "5" ), + Protein = c( "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q9UFW8", "Q9UFW8", "Q9UFW8", "Q9UFW8", "Q9UFW8" ), + LogIntensities = c(10.28, 10.39, 10.63, 11.26, 11.25, 10.03, 10.59, 11.00, 11.31, 11.57), + Variance = c( 0.0020, 0.0020, 0.15, 0.13, 0.0020, rep(0.00032, 5)), + originalRUN = c( "Run1", "Run2", "Run3", "Run4", "Run5", "Run1", "Run2", "Run3", "Run4", "Run5" ), + GROUP = c( "A", "A", "A", "B", "B", "A", "A", "A", "B", "B" ), + SUBJECT = c( 1, 2, 3, 4, 5, 1, 2, 3, 4, 5 ), + TotalGroupMeasurements = c( 9, 9, 9, 6, 6, 6, 6, 6, 4, 4 ), + NumMeasuredFeature = c( 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 ), + MissingPercentage = c( 0.333333333333333, 0.333333333333333, 0.333333333333333, 0.333333333333333, 0.333333333333333, 0, 0, 0, 0, 0 ), + more50missing = c( FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE ), + NumImputedFeature = c( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ) +) +protein_level_data_no_anomalies = protein_level_data_with_anomalies +protein_level_data_no_anomalies$Variance = NULL +QuantDataNoAnomaly = list(ProteinLevelData = protein_level_data_no_anomalies) +QuantDataWithAnomaly = list(ProteinLevelData = protein_level_data_with_anomalies) +model_no_anomaly = groupComparison("pairwise", QuantDataNoAnomaly, + use_log_file = FALSE) +model_with_anomaly = groupComparison("pairwise", QuantDataWithAnomaly, + use_log_file = FALSE) + +no_anomaly_Q9UFW8 = model_no_anomaly$ComparisonResult[model_no_anomaly$ComparisonResult$Protein == "Q9UFW8", ] +with_anomaly_Q9UFW8 = model_with_anomaly$ComparisonResult[model_with_anomaly$ComparisonResult$Protein == "Q9UFW8", ] + +expect_true(nrow(no_anomaly_Q9UFW8) > 0, info = "Q9UFW8 not found in model_no_anomaly") +expect_true(nrow(with_anomaly_Q9UFW8) > 0, info = "Q9UFW8 not found in model_with_anomaly") +expect_equal(no_anomaly_Q9UFW8$SE, with_anomaly_Q9UFW8$SE, + info = "SE values for Q9UFW8 should be the same") +expect_equal(no_anomaly_Q9UFW8$pvalue, with_anomaly_Q9UFW8$pvalue, + info = "pvalue values for Q9UFW8 should be the same") +no_anomaly_Q96S19 = model_no_anomaly$ComparisonResult[model_no_anomaly$ComparisonResult$Protein == "Q96S19", ] +with_anomaly_Q96S19 = model_with_anomaly$ComparisonResult[model_with_anomaly$ComparisonResult$Protein == "Q96S19", ] +expect_true(nrow(no_anomaly_Q96S19) > 0, info = "Q96S19 not found in model_no_anomaly") +expect_true(nrow(with_anomaly_Q96S19) > 0, info = "Q96S19 not found in model_with_anomaly") +expect_true(with_anomaly_Q96S19$SE < no_anomaly_Q96S19$SE, + info = "SE for Q96S19 should be lower in model_with_anomaly") +expect_true(with_anomaly_Q96S19$pvalue < no_anomaly_Q96S19$pvalue, + info = "pvalue for Q96S19 should be lower in model_with_anomaly") + diff --git a/inst/tinytest/test_utils_groupComparison_model.R b/inst/tinytest/test_utils_groupComparison_model.R new file mode 100644 index 00000000..4802f8ea --- /dev/null +++ b/inst/tinytest/test_utils_groupComparison_model.R @@ -0,0 +1,18 @@ +# Test for .fitModelForGroupComparison function +test_input_with_variance = data.frame( + ABUNDANCE = c(1.2, 1.5, 1.8, 2.1, 1.9, 2.3), + GROUP = factor(c("A", "A", "A", "B", "B", "B")), + SUBJECT = factor(c("S1", "S2", "S3", "S1", "S2", "S3")), + Variance = c(0.1, 0.15, 0.08, 0.12, 0.09, 0.11) +) +result = MSstats:::.fitModelForGroupComparison( + input = test_input_with_variance, + repeated = FALSE, + is_single_subject = FALSE, + has_tech_replicates = FALSE +) +expect_false(is.null(result$full_fit$weights), + "Fitted model should contain weights when Variance is provided") +expected_weights = 1/test_input_with_variance$Variance +expect_equal(result$full_fit$weights, expected_weights, + info = "Model weights should match 1/Variance") From c71c16c4aafedb8fef3f2b8fd57b3567cb53cc54 Mon Sep 17 00:00:00 2001 From: Tony Wu Date: Fri, 5 Sep 2025 15:57:32 -0400 Subject: [PATCH 18/20] add defaults for aft_iterations parameter to prevent Shiny from breaking --- R/dataProcess.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/dataProcess.R b/R/dataProcess.R index 5fab5b9c..71b86e79 100755 --- a/R/dataProcess.R +++ b/R/dataProcess.R @@ -204,7 +204,7 @@ dataProcess = function( #' MSstatsSummarizeWithMultipleCores = function(input, method, impute, censored_symbol, remove50missing, equal_variance, numberOfCores = 1, - aft_iterations) { + aft_iterations = 90) { if (numberOfCores > 1) { protein_indices = split(seq_len(nrow(input)), list(input$PROTEIN)) num_proteins = length(protein_indices) @@ -286,7 +286,7 @@ MSstatsSummarizeWithMultipleCores = function(input, method, impute, censored_sym #' head(summarized[[1]][[1]]) # run-level summary #' MSstatsSummarizeWithSingleCore = function(input, method, impute, censored_symbol, - remove50missing, equal_variance, aft_iterations) { + remove50missing, equal_variance, aft_iterations = 90) { protein_indices = split(seq_len(nrow(input)), list(input$PROTEIN)) @@ -420,7 +420,7 @@ MSstatsSummarizeSingleLinear = function(single_protein, impute, censored_symbol, remove50missing, - aft_iterations, + aft_iterations = 90, equal_variances = TRUE) { ABUNDANCE = RUN = FEATURE = PROTEIN = LogIntensities = NULL @@ -524,7 +524,7 @@ MSstatsSummarizeSingleLinear = function(single_protein, #' head(single_protein_summary[[1]]) #' MSstatsSummarizeSingleTMP = function(single_protein, impute, censored_symbol, - remove50missing, aft_iterations) { + remove50missing, aft_iterations = 90) { newABUNDANCE = n_obs = n_obs_run = RUN = FEATURE = LABEL = NULL predicted = censored = NULL cols = intersect(colnames(single_protein), c("newABUNDANCE", "cen", "RUN", From c1403628766ed6787880e3baaa6a94ee8f1fc540 Mon Sep 17 00:00:00 2001 From: Tony Wu Date: Mon, 8 Sep 2025 11:58:30 -0400 Subject: [PATCH 19/20] Add more granular unit tests for data summarization subfunctions --- inst/tinytest/test_dataProcess.R | 98 +++++++++++++++++++++++- inst/tinytest/test_utils_summarization.R | 46 +++++++++++ man/MSstatsSummarizeSingleLinear.Rd | 2 +- man/MSstatsSummarizeSingleTMP.Rd | 2 +- man/MSstatsSummarizeWithMultipleCores.Rd | 2 +- man/MSstatsSummarizeWithSingleCore.Rd | 2 +- man/dot-checkDataProcessParams.Rd | 5 +- 7 files changed, 151 insertions(+), 6 deletions(-) create mode 100644 inst/tinytest/test_utils_summarization.R diff --git a/inst/tinytest/test_dataProcess.R b/inst/tinytest/test_dataProcess.R index 51b21ad5..96b35c44 100644 --- a/inst/tinytest/test_dataProcess.R +++ b/inst/tinytest/test_dataProcess.R @@ -111,4 +111,100 @@ QuantDataNoAnomaly = dataProcess(msstats_input_no_anomaly, summaryMethod="linear") no_anomaly_variance = QuantDataNoAnomaly$ProteinLevelData$Variance expect_false(identical(high_anomaly_variance, no_anomaly_variance), - info = "Variance should be different when AnomalyScores are present vs absent") \ No newline at end of file + info = "Variance should be different when AnomalyScores are present vs absent") + + + +# Test MSstatsSummarizeSingleLinear function ------------------------------ + +single_protein = data.table::data.table( + PROTEIN = c( "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19" ), + PEPTIDE = c( "AFPLAEWQPSDVDQR_2", "ASGLLLER_2", "LowAbundancePeptide_2", "AFPLAEWQPSDVDQR_2", "ASGLLLER_2", "LowAbundancePeptide_2", "AFPLAEWQPSDVDQR_2", "ASGLLLER_2", "LowAbundancePeptide_2", "AFPLAEWQPSDVDQR_2", "ASGLLLER_2", "LowAbundancePeptide_2", "AFPLAEWQPSDVDQR_2", "ASGLLLER_2", "LowAbundancePeptide_2" ), + TRANSITION = c( "y3_1", "y3_1", "y3_1", "y3_1", "y3_1", "y3_1", "y3_1", "y3_1", "y3_1", "y3_1", "y3_1", "y3_1", "y3_1", "y3_1", "y3_1" ), + FEATURE = c( "AFPLAEWQPSDVDQR_2_y3_1", "ASGLLLER_2_y3_1", "LowAbundancePeptide_2_y3_1", "AFPLAEWQPSDVDQR_2_y3_1", "ASGLLLER_2_y3_1", "LowAbundancePeptide_2_y3_1", "AFPLAEWQPSDVDQR_2_y3_1", "ASGLLLER_2_y3_1", "LowAbundancePeptide_2_y3_1", "AFPLAEWQPSDVDQR_2_y3_1", "ASGLLLER_2_y3_1", "LowAbundancePeptide_2_y3_1", "AFPLAEWQPSDVDQR_2_y3_1", "ASGLLLER_2_y3_1", "LowAbundancePeptide_2_y3_1" ), + LABEL = c( "L", "L", "L", "L", "L", "L", "L", "L", "L", "L", "L", "L", "L", "L", "L" ), + GROUP_ORIGINAL = c( "A", "A", "A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "B", "B" ), + SUBJECT_ORIGINAL = c( 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5 ), + RUN = c( "1", "1", "1", "2", "2", "2", "3", "3", "3", "4", "4", "4", "5", "5", "5" ), + GROUP = c( "A", "A", "A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "B", "B" ), + SUBJECT = c( "1", "1", "1", "2", "2", "2", "3", "3", "3", "4", "4", "4", "5", "5", "5" ), + FRACTION = c( 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ), + INTENSITY = c( 1200, 1300, 100, 1700, 1800, 200, 2200, 1200, 300, 2000, 2800, 400, 1700, 1800, 500 ), + ANOMALYSCORES = c( 0.01, 0.01, 0.1, 0.01, 0.01, 0.1, 0.01, 0.01, 0.1, 0.9, 0.7, 0.1, 0.8, 0.5, 0.1 ), + ABUNDANCE = c( 10.229, 10.344, 6.644, 10.731, 10.814, 7.644, 11.103, 10.229, 8.229, 10.966, 11.451, 8.644, 10.731, 10.814, 8.966 ), + originalRUN = c( "Run1", "Run1", "Run1", "Run2", "Run2", "Run2", "Run3", "Run3", "Run3", "Run4", "Run4", "Run4", "Run5", "Run5", "Run5" ), + censored = c( FALSE, FALSE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, TRUE ), + newABUNDANCE = c( 10.229, 10.344, NA, 10.731, 10.814, NA, 11.103, 10.229, NA, 10.966, 11.451, NA, 10.731, 10.814, NA ), + cen = c( 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0 ), + nonmissing = c( TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, FALSE ), + n_obs = c( 5, 5, 0, 5, 5, 0, 5, 5, 0, 5, 5, 0, 5, 5, 0 ), + n_obs_run = c( 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 ), + total_features = c( 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 ), + prop_features = c( 0.667, 0.667, 0.667, 0.667, 0.667, 0.667, 0.667, 0.667, 0.667, 0.667, 0.667, 0.667, 0.667, 0.667, 0.667 ), + nonmissing_all = c( TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, FALSE ), + ABUNDANCE_cut = c( 10.127, 10.127, NA, 10.127, 10.127, NA, 10.127, 10.127, NA, 10.127, 10.127, NA, 10.127, 10.127, NA ), + any_censored = c( FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE ) +) + +summarized = MSstats::MSstatsSummarizeSingleLinear( + single_protein, + impute = FALSE, + censored_symbol = "NA", + remove50missing = FALSE, + aft_iterations = 90, + equal_variances = TRUE +) + +protein_level_summary = summarized[[1]] +feature_level_summary = summarized[[2]] + +## Basic tests +expect_equal(nrow(protein_level_summary), 5) +expect_equal(ncol(protein_level_summary), 4) +expect_true(all(protein_level_summary$Protein == "Q96S19")) +expect_equal(as.numeric(protein_level_summary$RUN), 1:5) +expect_equal(nrow(feature_level_summary), 10) +expect_equal(ncol(feature_level_summary), 5) +expect_equal(length(unique(feature_level_summary$FEATURE)), 2) +expect_equal(length(unique(feature_level_summary$RUN)), 5) + +## Verify that variance is higher in runs 4-5 with high anomaly scores +variance_runs_1_3 = protein_level_summary[RUN %in% 1:3, Variance] +variance_runs_4_5 = protein_level_summary[RUN %in% 4:5, Variance] + +for (var_45 in variance_runs_4_5) { + for (var_13 in variance_runs_1_3) { + expect_true(var_45 > var_13, + info = paste("Variance in runs 4-5 should be > variance in runs 1-3:", + var_45, ">", var_13)) + } +} + +mean_variance_1_3 = mean(variance_runs_1_3) +mean_variance_4_5 = mean(variance_runs_4_5) + +expect_true(mean_variance_4_5 > 10 * mean_variance_1_3, + info = paste("Mean variance of runs 4-5 should be at least 10x higher:", + mean_variance_4_5, "vs", 10 * mean_variance_1_3)) + +## Verify that protein levels in runs 4-5 are closer to ASGLLLER than AFPLAEWQPSDVDQR +protein_runs_4_5 = protein_level_summary[RUN %in% 4:5] +afpla_runs_4_5 = feature_level_summary[FEATURE == "AFPLAEWQPSDVDQR_2_y3_1" & RUN %in% 4:5] +asgll_runs_4_5 = feature_level_summary[FEATURE == "ASGLLLER_2_y3_1" & RUN %in% 4:5] +for (i in 1:nrow(protein_runs_4_5)) { + run_num = protein_runs_4_5$RUN[i] + protein_logint = protein_runs_4_5$LogIntensities[i] + + afpla_abundance = afpla_runs_4_5[RUN == run_num, newABUNDANCE] + asgll_abundance = asgll_runs_4_5[RUN == run_num, newABUNDANCE] + + dist_to_afpla = abs(protein_logint - afpla_abundance) + dist_to_asgll = abs(protein_logint - asgll_abundance) + + expect_true(dist_to_asgll < dist_to_afpla, + info = paste("Run", run_num, ": Protein LogIntensity", protein_logint, + "should be closer to ASGLLLER", asgll_abundance, + "(dist =", round(dist_to_asgll, 4), ")", + "than to AFPLAEWQPSDVDQR", afpla_abundance, + "(dist =", round(dist_to_afpla, 4), ")")) +} diff --git a/inst/tinytest/test_utils_summarization.R b/inst/tinytest/test_utils_summarization.R new file mode 100644 index 00000000..05e254ea --- /dev/null +++ b/inst/tinytest/test_utils_summarization.R @@ -0,0 +1,46 @@ +single_protein <- data.table::data.table( + PROTEIN = c( "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19" ), + PEPTIDE = c( "AFPLAEWQPSDVDQR_2", "ASGLLLER_2", "AFPLAEWQPSDVDQR_2", "ASGLLLER_2", "AFPLAEWQPSDVDQR_2", "ASGLLLER_2", "AFPLAEWQPSDVDQR_2", "ASGLLLER_2", "AFPLAEWQPSDVDQR_2", "ASGLLLER_2" ), + TRANSITION = c( "y3_1", "y3_1", "y3_1", "y3_1", "y3_1", "y3_1", "y3_1", "y3_1", "y3_1", "y3_1" ), + FEATURE = c( "AFPLAEWQPSDVDQR_2_y3_1", "ASGLLLER_2_y3_1", "AFPLAEWQPSDVDQR_2_y3_1", "ASGLLLER_2_y3_1", "AFPLAEWQPSDVDQR_2_y3_1", "ASGLLLER_2_y3_1", "AFPLAEWQPSDVDQR_2_y3_1", "ASGLLLER_2_y3_1", "AFPLAEWQPSDVDQR_2_y3_1", "ASGLLLER_2_y3_1" ), + LABEL = c( "L", "L", "L", "L", "L", "L", "L", "L", "L", "L" ), + GROUP_ORIGINAL = c( "A", "A", "A", "A", "A", "A", "B", "B", "B", "B" ), + SUBJECT_ORIGINAL = c( 1, 1, 2, 2, 3, 3, 4, 4, 5, 5 ), + RUN = c( 1, 1, 2, 2, 3, 3, 4, 4, 5, 5 ), + GROUP = c( "A", "A", "A", "A", "A", "A", "B", "B", "B", "B" ), + SUBJECT = c( 1, 1, 2, 2, 3, 3, 4, 4, 5, 5 ), + FRACTION = c( 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ), + INTENSITY = c( 1200, 1300, 1700, 1800, 2200, 1200, 2000, 2800, 1700, 1800 ), + ANOMALYSCORES = c( 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.9, 0.7, 0.8, 0.5 ), + ABUNDANCE = c( 10.229, 10.344, 10.731, 10.814, 11.103, 10.229, 10.966, 11.451, 10.731, 10.814 ), + originalRUN = c( "Run1", "Run1", "Run2", "Run2", "Run3", "Run3", "Run4", "Run4", "Run5", "Run5" ), + censored = c( FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE ), + newABUNDANCE = c( 10.229, 10.344, 10.731, 10.814, 11.103, 10.229, 10.966, 11.451, 10.731, 10.814 ), + cen = c( 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ), + nonmissing = c( TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE ), + n_obs = c( 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 ), + n_obs_run = c( 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 ), + total_features = c( 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 ), + prop_features = c( 0.667, 0.667, 0.667, 0.667, 0.667, 0.667, 0.667, 0.667, 0.667, 0.667 ), + nonmissing_all = c( TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE ), + ABUNDANCE_cut = c( 10.127, 10.127, 10.127, 10.127, 10.127, 10.127, 10.127, 10.127, 10.127, 10.127 ), + any_censored = c( FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE ), + weights = c( 100, 100, 100, 100, 100, 100, 1.111, 1.429, 1.25, 2 ) +) + +summarized = MSstats:::.fitLinearModel(single_protein, FALSE, FALSE, TRUE) + +expect_equal(summarized$weights, single_protein$weights, + info = "Weights from model should match input weights") + +single_protein$weights = NULL +summarized_no_weights = MSstats:::.fitLinearModel(single_protein, FALSE, FALSE, TRUE) +expect_true(is.null(summarized_no_weights$weights), + info = "Weights should be null when no weights are provided") + +feature_coefficient_with_weights = + summarized$coefficients[['FEATUREASGLLLER_2_y3_1']] +feature_coefficient_no_weights = + summarized_no_weights$coefficients[['FEATUREASGLLLER_2_y3_1']] +expect_false(feature_coefficient_with_weights == feature_coefficient_no_weights, + info = "Feature coefficients should differ when weights are used") diff --git a/man/MSstatsSummarizeSingleLinear.Rd b/man/MSstatsSummarizeSingleLinear.Rd index 86e19542..a1522d19 100644 --- a/man/MSstatsSummarizeSingleLinear.Rd +++ b/man/MSstatsSummarizeSingleLinear.Rd @@ -9,7 +9,7 @@ MSstatsSummarizeSingleLinear( impute, censored_symbol, remove50missing, - aft_iterations, + aft_iterations = 90, equal_variances = TRUE ) } diff --git a/man/MSstatsSummarizeSingleTMP.Rd b/man/MSstatsSummarizeSingleTMP.Rd index 7ec2c7c9..7b7ca525 100644 --- a/man/MSstatsSummarizeSingleTMP.Rd +++ b/man/MSstatsSummarizeSingleTMP.Rd @@ -9,7 +9,7 @@ MSstatsSummarizeSingleTMP( impute, censored_symbol, remove50missing, - aft_iterations + aft_iterations = 90 ) } \arguments{ diff --git a/man/MSstatsSummarizeWithMultipleCores.Rd b/man/MSstatsSummarizeWithMultipleCores.Rd index ef4b20c2..d1335ec6 100644 --- a/man/MSstatsSummarizeWithMultipleCores.Rd +++ b/man/MSstatsSummarizeWithMultipleCores.Rd @@ -12,7 +12,7 @@ MSstatsSummarizeWithMultipleCores( remove50missing, equal_variance, numberOfCores = 1, - aft_iterations + aft_iterations = 90 ) } \arguments{ diff --git a/man/MSstatsSummarizeWithSingleCore.Rd b/man/MSstatsSummarizeWithSingleCore.Rd index 6998458b..93d159c3 100644 --- a/man/MSstatsSummarizeWithSingleCore.Rd +++ b/man/MSstatsSummarizeWithSingleCore.Rd @@ -11,7 +11,7 @@ MSstatsSummarizeWithSingleCore( censored_symbol, remove50missing, equal_variance, - aft_iterations + aft_iterations = 90 ) } \arguments{ diff --git a/man/dot-checkDataProcessParams.Rd b/man/dot-checkDataProcessParams.Rd index e65dc6de..9b4b0ed7 100644 --- a/man/dot-checkDataProcessParams.Rd +++ b/man/dot-checkDataProcessParams.Rd @@ -10,7 +10,8 @@ standards_names, feature_selection, summarization, - imputation + imputation, + input_columns ) } \arguments{ @@ -24,6 +25,8 @@ \item{summarization}{list with elements: method.} \item{imputation}{list with elements: cutoff, symbol.} + +\item{input_columns}{character vector of input columns} } \description{ Check validity of parameters to dataProcess function From 5b972e7529330cbd0394168c2b22a8a5c2d55426 Mon Sep 17 00:00:00 2001 From: devonjkohler Date: Wed, 10 Sep 2025 15:26:26 -0400 Subject: [PATCH 20/20] minor text edits to msstatsplus vignette --- vignettes/MSstatsPlus.Rmd | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/vignettes/MSstatsPlus.Rmd b/vignettes/MSstatsPlus.Rmd index bdb7ccd1..198c79bf 100644 --- a/vignettes/MSstatsPlus.Rmd +++ b/vignettes/MSstatsPlus.Rmd @@ -20,11 +20,11 @@ designed to incorporate the underlying spectral peak quality into differential analysis, going beyond peak intensity alone. It integrates the underlying spectral peaks using an isolation forest (iForest) for anomaly detection, which integrates all the peak quality metrics into a single score. This score is then -used as weights in protein-level summarization weighted linear regression. After +used a weight in protein-level summarization. After protein-level summarization, the estimates of underlying protein abundance and -the estimates associated variance are used as weights in a second weighted -linear model which is used to perform differential analysis. It can be used -for experiments with simple group comparison designs, or more complex designs +their associated variance are used as weights in a second weighted +linear model which is used to perform differential analysis. The models can be +applied to experiments with simple group comparison designs, or more complex designs (e.g. comparing more than two experimental conditions, or a repeated measure design, such as a time course). @@ -50,13 +50,13 @@ The main iForest algorithm for `MSstats+` is implemented in the `MSstatsConvert` package (a dependency of `MSstats`). The `MSstatsConvert` package contains individual converters for each spectral processing tool. Currently, `MSstats+` only has a dedicated converter for `Spectronaut`, however more converters are -in development. For integration with other tools, please reach out on the -`MSstats` Google Group: \url{https://groups.google.com/g/msstats}. +in development and can be implemented manually (see below), For integration with +other tools, please reach out on the `MSstats` Google Group: \url{https://groups.google.com/g/msstats}. ### Load data First we will load the Spectronaut report of identified and quantified peaks. It -is necessary that users output all the spectral peak quality data along with +is necessary that users output the spectral peak quality data along with these peaks. These spectral peak quality metrics must be at least at the precursor-level (i.e., the column headers must not start with `F.`). There are multiple options for columns that can be extracted, some @@ -64,7 +64,7 @@ examples include: `FG.ShapeQualityScore (MS1)`, `FG.ShapeQualityScore (MS2)`, and `EG.DeltaRT`. Users can include as many quality metrics as they want, however we recommend using only quality metrics that are key indicators of the quality of a spectral peak. Including too many quality metrics may lead to -overfitting of the iForest model. We include functionality to assess the +the iForest model fitting noise. We include additional functionality to assess the performance of the model (discussed below). An example output from Spectronaut is included in the @@ -134,7 +134,8 @@ head(msstats_input) Key parameters for `MSstats+`: - `calculateAnomalyScores`: Set to `TRUE` to use the iForest algorithm to calculate anomaly scores for each spectral peak. -- `anomalyModelFeatures`: A character vector of the column names of the quality metrics to be used in the iForest model. These must be at the precursor-level +- `anomalyModelFeatures`: A character vector of the column names of the quality + metrics to be used in the iForest model. These must be at the precursor-level or higher (i.e., the column headers must not start with `F.`). - `anomalyModelFeatureTemporal`: A character vector of the same length as `anomalyModelFeatures` indicating the temporal direction of each quality @@ -462,4 +463,3 @@ may not be appropriate for big data files. In cases where the data is large, this ordering should be extracted externally and provided by the user, or the user should sample the Spectronaut file rather than load it all into memory. -