diff --git a/R/dataProcess.R b/R/dataProcess.R index 74e2c808..a922fb5d 100755 --- a/R/dataProcess.R +++ b/R/dataProcess.R @@ -210,7 +210,12 @@ MSstatsSummarizeWithMultipleCores = function(input, method, impute, censored_sym remove50missing, equal_variance, numberOfCores = 1, aft_iterations = 90) { if (numberOfCores > 1) { - protein_indices = split(seq_len(nrow(input)), list(input$PROTEIN)) + is_labeled_reference = "is_labeled_ref" %in% colnames(input) && any(input$is_labeled_ref, na.rm = TRUE) + if (is_labeled_reference) { + protein_indices = split(seq_len(nrow(input)), list(input$PROTEIN)) + } else { + protein_indices = split(seq_len(nrow(input)), list(input$PROTEIN, input$LABEL)) + } num_proteins = length(protein_indices) function_environment = environment() cl = parallel::makeCluster(numberOfCores) @@ -292,9 +297,14 @@ MSstatsSummarizeWithMultipleCores = function(input, method, impute, censored_sym #' MSstatsSummarizeWithSingleCore = function(input, method, impute, censored_symbol, remove50missing, equal_variance, aft_iterations = 90) { - - - protein_indices = split(seq_len(nrow(input)), list(input$PROTEIN)) + + + is_labeled_reference = "is_labeled_ref" %in% colnames(input) && any(input$is_labeled_ref, na.rm = TRUE) + if (is_labeled_reference) { + protein_indices = split(seq_len(nrow(input)), list(input$PROTEIN)) + } else { + protein_indices = split(seq_len(nrow(input)), list(input$PROTEIN, input$LABEL)) + } num_proteins = length(protein_indices) summarized_results = vector("list", num_proteins) if (method == "TMP") { @@ -356,57 +366,59 @@ MSstatsSummarizeWithSingleCore = function(input, method, impute, censored_symbol #' head(single_protein_summary[[1]]) #' #' -MSstatsSummarizeSingleLinear = function(single_protein, +MSstatsSummarizeSingleLinear = function(single_protein, impute, - censored_symbol, - remove50missing, + censored_symbol, + remove50missing, aft_iterations = 90, equal_variances = TRUE) { ABUNDANCE = RUN = FEATURE = PROTEIN = LogIntensities = NULL - + cols = intersect( colnames(single_protein), c("newABUNDANCE", "cen", "RUN", "FEATURE", "ref_covariate") ) - + + is_labeled_reference = "is_labeled_ref" %in% colnames(single_protein) && + any(single_protein$is_labeled_ref, na.rm = TRUE) + single_protein = single_protein[ (n_obs > 1 & !is.na(n_obs)) & (n_obs_run > 0 & !is.na(n_obs_run)) ] - + if (nrow(single_protein) == 0) { return(list(NULL, NULL)) } - + single_protein[, RUN := factor(RUN)] single_protein[, FEATURE := factor(FEATURE)] - + if (impute & any(single_protein[["censored"]])) { - survival_fit = .fitSurvival( - single_protein[LABEL == "L", cols, with = FALSE], - aft_iterations - ) + fit_data = if (is_labeled_reference) { + single_protein[is_labeled_ref == FALSE, cols, with = FALSE] + } else { + single_protein[, cols, with = FALSE] + } + survival_fit = .fitSurvival(fit_data, aft_iterations) sigma2 = survival_fit$scale^2 - + single_protein[, c("predicted", "imputation_var") := { pred = predict(survival_fit, newdata = .SD, se.fit = TRUE) list(pred$fit, pred$se.fit^2 + sigma2) }] - - single_protein[, predicted := ifelse( - censored & (LABEL == "L"), - predicted, - NA - )] - single_protein[, newABUNDANCE := ifelse( - censored & LABEL == "L", - predicted, - newABUNDANCE - )] - - survival = single_protein[, c(cols, "predicted"), with = FALSE] + + if (is_labeled_reference) { + single_protein[, predicted := ifelse(censored & is_labeled_ref == FALSE, predicted, NA)] + single_protein[, newABUNDANCE := ifelse(censored & is_labeled_ref == FALSE, predicted, newABUNDANCE)] + } else { + single_protein[, predicted := ifelse(censored, predicted, NA)] + single_protein[, newABUNDANCE := ifelse(censored, predicted, newABUNDANCE)] + } + + survival = single_protein[, intersect(c(cols, "LABEL", "predicted"), colnames(single_protein)), with = FALSE] } else { - survival = single_protein[, cols, with = FALSE] + survival = single_protein[, intersect(c(cols, "LABEL"), colnames(single_protein)), with = FALSE] survival[, predicted := NA] } @@ -432,14 +444,14 @@ MSstatsSummarizeSingleLinear = function(single_protein, is_single_feature = .checkSingleFeature(single_protein) if (is_single_feature) { - result = single_protein[LABEL == "L", - .(LogIntensities = mean(newABUNDANCE)), - by = RUN] + # Todo: enable SRM linear summarization + if (is_labeled_reference) single_protein = single_protein[LABEL == "L"] + result = single_protein[, .(LogIntensities = mean(newABUNDANCE)), by = RUN] result[, Protein := unique(single_protein$PROTEIN)] + result[, LABEL := if (is_labeled_reference) "L" else unique(single_protein$LABEL)] result[, Variance := NA_real_] - setcolorder(result, c("Protein", "RUN", "LogIntensities", - "Variance")) - + setcolorder(result, c("Protein", "RUN", "LogIntensities", "Variance")) + return(list(result, survival)) } else { counts = xtabs( @@ -447,14 +459,14 @@ MSstatsSummarizeSingleLinear = function(single_protein, data = unique(single_protein[, .(FEATURE, RUN)]) ) counts = as.matrix(counts) - + fit = try( .fitLinearModel(single_protein, is_single_feature, is_labeled = label, equal_variances), silent = TRUE ) - + if (inherits(fit, "try-error")) { msg = paste("*** error : can't fit the model for", unique(single_protein$PROTEIN)) @@ -464,12 +476,13 @@ MSstatsSummarizeSingleLinear = function(single_protein, } else { cf = summary(fit)$coefficients[, 1] cov_mat = vcov(fit) - - result = unique(single_protein[, .(Protein = PROTEIN, - RUN = RUN)]) + + result = unique(single_protein[, .(Protein = PROTEIN, RUN = RUN)]) extracted_values = get_linear_summary(single_protein, cf, counts, label, cov_mat) result = cbind(result, extracted_values) + # Todo: enable SRM linear summarization + result[, LABEL := if (is_labeled_reference) "L" else unique(single_protein$LABEL)] } return(list(result, survival)) @@ -508,12 +521,14 @@ MSstatsSummarizeSingleLinear = function(single_protein, #' impute, cens, FALSE, 100) #' head(single_protein_summary[[1]]) #' -MSstatsSummarizeSingleTMP = function(single_protein, impute, censored_symbol, +MSstatsSummarizeSingleTMP = function(single_protein, impute, censored_symbol, remove50missing, aft_iterations = 90) { newABUNDANCE = n_obs = n_obs_run = RUN = FEATURE = LABEL = NULL predicted = censored = NULL cols = intersect(colnames(single_protein), c("newABUNDANCE", "cen", "RUN", "FEATURE", "ref_covariate")) + is_labeled_reference = "is_labeled_ref" %in% colnames(single_protein) && + any(single_protein$is_labeled_ref, na.rm = TRUE) single_protein = single_protein[(n_obs > 1 & !is.na(n_obs)) & (n_obs_run > 0 & !is.na(n_obs_run))] if (nrow(single_protein) == 0) { @@ -522,43 +537,50 @@ MSstatsSummarizeSingleTMP = function(single_protein, impute, censored_symbol, single_protein[, RUN := factor(RUN)] single_protein[, FEATURE := factor(FEATURE)] if (impute & any(single_protein[["censored"]])) { - + # Flag to track convergence warning converged = TRUE - + + fit_data = if (is_labeled_reference) { + single_protein[is_labeled_ref == FALSE, cols, with = FALSE] + } else { + single_protein[, cols, with = FALSE] + } + # Try to fit survival model and catch convergence warnings survival_fit = withCallingHandlers({ - .fitSurvival(single_protein[LABEL == "L", cols, with = FALSE], - aft_iterations) + .fitSurvival(fit_data, aft_iterations) }, warning = function(w) { if (grepl("converge", conditionMessage(w), ignore.case = TRUE)) { message("Convergence warning caught: ", conditionMessage(w)) converged <<- FALSE } }) - + if (converged) { single_protein[, predicted := predict(survival_fit, newdata = .SD)] } else { single_protein[, predicted := NA_real_] } - - single_protein[, predicted := ifelse(censored & (LABEL == "L"), predicted, NA)] - single_protein[, newABUNDANCE := ifelse(censored & LABEL == "L", - predicted, newABUNDANCE)] - survival = single_protein[, c(cols, "predicted"), with = FALSE] + + if (is_labeled_reference) { + single_protein[, predicted := ifelse(censored & is_labeled_ref == FALSE, predicted, NA)] + single_protein[, newABUNDANCE := ifelse(censored & is_labeled_ref == FALSE, predicted, newABUNDANCE)] + } else { + single_protein[, predicted := ifelse(censored, predicted, NA)] + single_protein[, newABUNDANCE := ifelse(censored, predicted, newABUNDANCE)] + } + survival = single_protein[, intersect(c(cols, "LABEL", "predicted"), colnames(single_protein)), with = FALSE] } else { - survival = single_protein[, cols, with = FALSE] + survival = single_protein[, intersect(c(cols, "LABEL"), colnames(single_protein)), with = FALSE] survival[, predicted := NA] } - + single_protein = .isSummarizable(single_protein, remove50missing) if (is.null(single_protein)) { return(list(NULL, NULL)) } else { single_protein = single_protein[!is.na(newABUNDANCE), ] - is_labeled_reference = "is_labeled_ref" %in% colnames(single_protein) && - any(single_protein$is_labeled_ref, na.rm = TRUE) result = .runTukey(single_protein, is_labeled_reference, censored_symbol, remove50missing) } diff --git a/R/utils_output.R b/R/utils_output.R index 50ef3263..7b748482 100644 --- a/R/utils_output.R +++ b/R/utils_output.R @@ -40,9 +40,9 @@ MSstatsSummarizationOutput = function(input, summarized, processed, input = .finalizeInput(input, summarized, method, impute, censored_symbol) summarized = lapply(summarized, function(x) x[[1]]) - summarized = data.table::rbindlist(summarized) + summarized = data.table::rbindlist(summarized, fill = TRUE) if (inherits(summarized, "try-error")) { - msg = paste("*** error : can't summarize per subplot with ", + msg = paste("*** error : can't summarize per subplot with ", method, ".") getOption("MSstatsLog")("ERROR", msg) getOption("MSstatsMsg")("ERROR", msg) @@ -50,24 +50,21 @@ MSstatsSummarizationOutput = function(input, summarized, processed, rqmodelqc = NULL workpred = NULL } else { - input[LABEL == "L", TotalGroupMeasurements := uniqueN(.SD), - by = c("PROTEIN", "GROUP"), + input[, TotalGroupMeasurements := uniqueN(.SD), + by = c("PROTEIN", "GROUP", "LABEL"), .SDcols = c("FEATURE", "originalRUN")] cols = intersect(c("PROTEIN", "originalRUN", "RUN", "GROUP", - "GROUP_ORIGINAL", "SUBJECT_ORIGINAL", + "GROUP_ORIGINAL", "SUBJECT_ORIGINAL", "LABEL", "TotalGroupMeasurements", - "NumMeasuredFeature", "MissingPercentage", + "NumMeasuredFeature", "MissingPercentage", "more50missing", "NumImputedFeature"), colnames(input)) - merge_col = ifelse(is.element("RUN", colnames(summarized)), + merge_col = ifelse(is.element("RUN", colnames(summarized)), "RUN", "SUBJECT_ORIGINAL") - lab = unique(input[LABEL == "L", cols, with = FALSE]) - if (nlevels(input$LABEL) > 1) { - lab = lab[GROUP != 0] - } + lab = unique(input[, cols, with = FALSE]) lab = lab[, colnames(lab) != "GROUP", with = FALSE] - rqall = merge(summarized, lab, by.x = c(merge_col, "Protein"), - by.y = c(merge_col, "PROTEIN")) + rqall = merge(summarized, lab, by.x = c(merge_col, "Protein", "LABEL"), + by.y = c(merge_col, "PROTEIN", "LABEL")) data.table::setnames(rqall, c("GROUP_ORIGINAL", "SUBJECT_ORIGINAL"), c("GROUP", "SUBJECT"), skip_absent = TRUE) @@ -83,8 +80,8 @@ MSstatsSummarizationOutput = function(input, summarized, processed, output_cols = intersect(c("PROTEIN", "PEPTIDE", "TRANSITION", "FEATURE", "LABEL", "GROUP", "RUN", "SUBJECT", "FRACTION", "originalRUN", "censored", "INTENSITY", "ABUNDANCE", - "newABUNDANCE", "predicted", "feature_quality", - "is_outlier", "remove"), colnames(input)) + "newABUNDANCE", "predicted", "feature_quality", + "is_outlier", "remove", "is_labeled_ref"), colnames(input)) input = input[, output_cols, with = FALSE] if (is.element("remove", colnames(processed))) { @@ -126,31 +123,31 @@ MSstatsSummarizationOutput = function(input, summarized, processed, INTENSITY = newABUNDANCE = NumImputedFeature = NULL survival_predictions = lapply(summarized, function(x) x[[2]]) - predicted_survival = data.table::rbindlist(survival_predictions) + predicted_survival = data.table::rbindlist(survival_predictions, fill = TRUE) if (impute) { - cols = intersect(colnames(input), c("newABUNDANCE", + cols = intersect(colnames(input), c("newABUNDANCE", "cen", "RUN", - "FEATURE", "ref_covariate")) - input = merge(input[, colnames(input) != "newABUNDANCE", with = FALSE], + "FEATURE", "ref_covariate", "LABEL")) + input = merge(input[, colnames(input) != "newABUNDANCE", with = FALSE], predicted_survival, by = setdiff(cols, "newABUNDANCE"), all.x = TRUE) } input[, NonMissingStats := .getNonMissingFilterStats(.SD, censored_symbol)] - input[, NumMeasuredFeature := sum(NonMissingStats), - by = c("PROTEIN", "RUN")] + input[, NumMeasuredFeature := sum(NonMissingStats), + by = c("PROTEIN", "RUN", "LABEL")] input[, MissingPercentage := 1 - (NumMeasuredFeature / total_features)] input[, more50missing := MissingPercentage >= 0.5] if (!is.null(censored_symbol)) { if (is.element("censored", colnames(input))) { - input[, nonmissing_orig := LABEL == "L" & !censored] + input[, nonmissing_orig := !censored] } else { - input[, nonmissing_orig := LABEL == "L" & !is.na(INTENSITY)] + input[, nonmissing_orig := !is.na(INTENSITY)] } input[, nonmissing_orig := ifelse(is.na(newABUNDANCE), TRUE, nonmissing_orig)] if (impute) { - input[, NumImputedFeature := sum(LABEL == "L" & !nonmissing_orig), - by = c("PROTEIN", "RUN")] + input[, NumImputedFeature := sum(!nonmissing_orig), + by = c("PROTEIN", "RUN", "LABEL")] } else { input[, NumImputedFeature := 0] } @@ -168,15 +165,15 @@ MSstatsSummarizationOutput = function(input, summarized, processed, censored = INTENSITY = newABUNDANCE = NumImputedFeature = NULL input[, NonMissingStats := .getNonMissingFilterStats(.SD, censored_symbol)] - input[, NumMeasuredFeature := sum(NonMissingStats), - by = c("PROTEIN", "RUN")] + input[, NumMeasuredFeature := sum(NonMissingStats), + by = c("PROTEIN", "RUN", "LABEL")] input[, MissingPercentage := 1 - (NumMeasuredFeature / total_features)] input[, more50missing := MissingPercentage >= 0.5] if (!is.null(censored_symbol)) { if (is.element("censored", colnames(input))) { - input[, nonmissing_orig := LABEL == "L" & !censored] + input[, nonmissing_orig := !censored] } else { - input[, nonmissing_orig := LABEL == "L" & !is.na(INTENSITY)] + input[, nonmissing_orig := !is.na(INTENSITY)] } input[, nonmissing_orig := ifelse(is.na(newABUNDANCE), TRUE, nonmissing_orig)] input[, NumImputedFeature := 0] diff --git a/R/utils_summarization.R b/R/utils_summarization.R index 395a5036..3b762ae7 100644 --- a/R/utils_summarization.R +++ b/R/utils_summarization.R @@ -62,21 +62,25 @@ #' Fit Tukey median polish #' @param input data.table with data for a single protein -#' @param is_labeled logical, if TRUE, data is coming from an SRM experiment +#' @param is_labeled_reference logical, if TRUE, H channel is used as a +#' normalization reference (SRM experiment): L abundances are adjusted by +#' subtracting the H value and adding back the H median, and only L results +#' are returned. If FALSE (e.g. protein turnover), each label is summarized +#' independently and results for all labels are returned. #' @inheritParams MSstatsSummarizeWithSingleCore #' @return data.table #' @keywords internal -.runTukey = function(input, is_labeled, censored_symbol, remove50missing) { +.runTukey = function(input, is_labeled_reference, censored_symbol, remove50missing) { Protein = RUN = newABUNDANCE = NULL - + if (nlevels(input$FEATURE) > 1) { - tmp_result = .fitTukey(input) - } else { - if (is_labeled) { + tmp_result = .fitTukey(input, is_labeled_reference) + } else { + if (is_labeled_reference) { tmp_result = .adjustLRuns(input, TRUE) + tmp_result[, LABEL := "L"] } else { - tmp_result = input[input$LABEL == "L", - list(RUN, LogIntensities = newABUNDANCE)] + tmp_result = input[, list(LABEL, RUN, LogIntensities = newABUNDANCE)] } } tmp_result[, Protein := unique(input$PROTEIN)] @@ -88,20 +92,22 @@ #' @inheritParams .runTukey #' @return data.table #' @keywords internal -.fitTukey = function(input) { +.fitTukey = function(input, is_labeled_reference) { LABEL = RUN = newABUNDANCE = NULL - + features = as.character(unique(input$FEATURE)) wide = data.table::dcast(LABEL + RUN ~ FEATURE, data = input, value.var = "newABUNDANCE", keep = TRUE) tmp_fitted = median_polish_summary(as.matrix(wide[, features, with = FALSE])) wide[, newABUNDANCE := tmp_fitted] tmp_result = wide[, list(LABEL, RUN, newABUNDANCE)] - - if (data.table::uniqueN(input$LABEL) == 2) { + + if (is_labeled_reference) { tmp_result = .adjustLRuns(tmp_result) + tmp_result[, list(LABEL = "L", RUN, LogIntensities = newABUNDANCE)] + } else { + tmp_result[, list(LABEL, RUN, LogIntensities = newABUNDANCE)] } - tmp_result[, list(RUN, LogIntensities = newABUNDANCE)] } @@ -133,13 +139,9 @@ #' @keywords internal .getNonMissingFilterStats = function(input, censored_symbol) { if (!is.null(censored_symbol)) { - if (censored_symbol == "NA") { - nonmissing_filter = input$LABEL == "L" & !is.na(input$newABUNDANCE) & !input$censored - } else { - nonmissing_filter = input$LABEL == "L" & !is.na(input$newABUNDANCE) & !input$censored - } + nonmissing_filter = !is.na(input$newABUNDANCE) & !input$censored } else { - nonmissing_filter = input$LABEL == "L" & !is.na(input$INTENSITY) + nonmissing_filter = !is.na(input$INTENSITY) } nonmissing_filter = nonmissing_filter & input$n_obs_run > 0 & input$n_obs > 1 nonmissing_filter diff --git a/R/utils_summarization_prepare.R b/R/utils_summarization_prepare.R index b1325c82..e07165cd 100644 --- a/R/utils_summarization_prepare.R +++ b/R/utils_summarization_prepare.R @@ -50,7 +50,7 @@ MSstatsPrepareForSummarization = function(input, method, impute, censored_symbol getOption("MSstatsMsg")("INFO", msg) } - input = .prepareSummary(input, method, impute, censored_symbol) + input = .prepareSummary(input, impute, censored_symbol, add_ref_covariate) input[, PROTEIN := factor(PROTEIN)] input } @@ -101,52 +101,22 @@ getProcessed = function(input) { #' Prepare feature-level data for summarization #' @param input data.table -#' @param method "TMP" / "linear" #' @param impute logical #' @param censored_symbol "0"/"NA" +#' @param is_labeled_reference logical, if TRUE the H channel is a normalization +#' reference (SRM) and grouping keys do not include LABEL; if FALSE (e.g. +#' protein turnover) LABEL is added to grouping keys so each label is +#' processed independently. #' @return data.table #' @keywords internal -.prepareSummary = function(input, method, impute, censored_symbol) { - # if (method == "TMP") { - input = .prepareTMP(input, impute, censored_symbol) - # } else { - # input = .prepareLinear(input, FALSE, censored_symbol) - # } - input -} - - -#' Prepare feature-level data for linear summarization -#' @inheritParams .prepareSummary -#' @return data.table -#' @keywords internal -.prepareLinear = function(input, impute, censored_symbol) { - newABUNDANCE = ABUNDANCE = nonmissing = n_obs = n_obs_run = NULL - total_features = FEATURE = prop_features = NULL - - input[, newABUNDANCE := ABUNDANCE] - input[, nonmissing := .getNonMissingFilter(.SD, impute, censored_symbol)] - input[, n_obs := sum(nonmissing), by = c("PROTEIN", "FEATURE")] - # remove feature with 1 measurement - input[, nonmissing := ifelse(n_obs <= 1, FALSE, nonmissing)] - input[, n_obs_run := sum(nonmissing), by = c("PROTEIN", "RUN")] - - input[, total_features := uniqueN(FEATURE), by = "PROTEIN"] - input[, prop_features := sum(nonmissing) / total_features, - by = c("PROTEIN", "RUN")] - input -} - - -#' Prepare feature-level data for TMP summarization -#' @inheritParams .prepareSummary -#' @return data.table -#' @keywords internal -.prepareTMP = function(input, impute, censored_symbol) { +.prepareSummary = function(input, impute, censored_symbol, + is_labeled_reference = FALSE) { censored = feature_quality = newABUNDANCE = cen = nonmissing = n_obs = NULL n_obs_run = total_features = FEATURE = prop_features = NULL remove50missing = ABUNDANCE = NULL + label_by = if (is_labeled_reference) character(0) else "LABEL" + if (impute & !is.null(censored_symbol)) { if (is.element("feature_quality", colnames(input))) { input[, censored := ifelse(feature_quality == "Informative", @@ -163,19 +133,19 @@ getProcessed = function(input) { } input[, nonmissing := .getNonMissingFilter(input, impute, censored_symbol)] - input[, n_obs := sum(nonmissing), by = c("PROTEIN", "FEATURE")] - input[, nonmissing := ifelse(n_obs <= 1, FALSE, nonmissing)] - input[, n_obs_run := sum(nonmissing), by = c("PROTEIN", "RUN")] + input[, n_obs := sum(nonmissing), by = c("PROTEIN", "FEATURE", label_by)] + input[, nonmissing := ifelse(n_obs <= 1, FALSE, nonmissing)] + input[, n_obs_run := sum(nonmissing), by = c("PROTEIN", "RUN", label_by)] - input[, total_features := uniqueN(FEATURE), by = "PROTEIN"] + input[, total_features := uniqueN(FEATURE), by = c("PROTEIN", label_by)] input[, prop_features := sum(nonmissing) / total_features, - by = c("PROTEIN", "RUN")] + by = c("PROTEIN", "RUN", label_by)] if (is.element("cen", colnames(input))) { if (any(input[["cen"]] == 0)) { .setCensoredByThreshold(input, censored_symbol, remove50missing) } } - + input } diff --git a/inst/tinytest/processed_data/quant_data_dda.rds b/inst/tinytest/processed_data/quant_data_dda.rds new file mode 100644 index 00000000..0653a27f Binary files /dev/null and b/inst/tinytest/processed_data/quant_data_dda.rds differ diff --git a/inst/tinytest/test_dataProcess.R b/inst/tinytest/test_dataProcess.R index a105d181..e0113862 100644 --- a/inst/tinytest/test_dataProcess.R +++ b/inst/tinytest/test_dataProcess.R @@ -17,18 +17,62 @@ expect_equal(nrow(QuantDataDefaultLinear$FeatureLevelData), # SRMRawData is a label-based experiment: heavy ("H") rows must be preserved # in FeatureLevelData after dataProcess + +# Helper for near-equality comparison of data.tables with numeric tolerance +expect_dt_equal <- function(dt1, dt2, cols, tol = 1e-6, info = "") { + # Check non-numeric columns exactly + num_cols <- cols[sapply(dt1[, ..cols], is.numeric)] + key_cols <- cols[!cols %in% num_cols] + + # Sort both tables the same way before comparing + if (length(key_cols) > 0) { + data.table::setkeyv(dt1, key_cols) + data.table::setkeyv(dt2, key_cols) + } + + # Check row count + if (nrow(dt1) != nrow(dt2)) { + return(tinytest::expect_true(FALSE, info = paste(info, "(row count mismatch)"))) + } + + # Non-numeric exact match + if (length(key_cols) > 0) { + tinytest::expect_true( + data.table::fsetequal(dt1[, ..key_cols], dt2[, ..key_cols]), + info = paste(info, "(non-numeric columns)") + ) + } + + # Numeric approximate match + for (col in num_cols) { + tinytest::expect_true( + all(abs(dt1[[col]] - dt2[[col]]) < tol, na.rm = TRUE) && + identical(is.na(dt1[[col]]), is.na(dt2[[col]])), + info = paste(info, sprintf("(column: %s)", col)) + ) + } +} + quant_data_srm = readRDS( system.file("tinytest/processed_data/quant_data_srm.rds", package = "MSstats") ) + +dt1 <- data.table::as.data.table(quant_data_srm$ProteinLevelData) +dt2 <- data.table::as.data.table(QuantDataDefault$ProteinLevelData) +cols <- sort(intersect(names(dt1), names(dt2))) expect_true( - identical(QuantDataDefault, quant_data_srm), - info = "dataProcess output for SRMRawData should be identical to previously saved output" + data.table::fsetequal(dt1[, ..cols], dt2[, ..cols]), + info = "dataProcess ProteinLevelData for SRMRawData should be identical to previously saved output" ) +dt1 <- data.table::as.data.table(quant_data_srm$FeatureLevelData) +dt2 <- data.table::as.data.table(QuantDataDefault$FeatureLevelData) +cols <- sort(intersect(names(dt1), names(dt2))) expect_true( - "H" %in% QuantDataDefault$FeatureLevelData$LABEL, - info = "FeatureLevelData should contain heavy-label (H) rows for label-based SRM data" + data.table::fsetequal(dt1[, ..cols], dt2[, ..cols]), + info = "dataProcess FeatureLevelData for SRMRawData should be identical to previously saved output" ) + expect_true( "L" %in% QuantDataDefault$FeatureLevelData$LABEL, info = "SRMRawData FeatureLevelData must contain L rows" @@ -47,6 +91,28 @@ expect_true( ) +# DDARawData Regression Testing +quant_data_dda = readRDS( + system.file("tinytest/processed_data/quant_data_dda.rds", + package = "MSstats") +) +QuantDataDefaultDDA = dataProcess(DDARawData, use_log_file = FALSE) + +dt1 <- data.table::as.data.table(quant_data_dda$ProteinLevelData) +dt2 <- data.table::as.data.table(QuantDataDefaultDDA$ProteinLevelData) +cols <- sort(intersect(names(dt1), names(dt2))) +expect_dt_equal(dt1[, ..cols], dt2[, ..cols], cols, + info = "dataProcess ProteinLevelData for DDARawData should be identical to previously saved output" +) + +dt1 <- data.table::as.data.table(quant_data_dda$FeatureLevelData) +dt2 <- data.table::as.data.table(QuantDataDefaultDDA$FeatureLevelData) +cols <- sort(intersect(names(dt1), names(dt2))) +expect_dt_equal(dt1[, ..cols], dt2[, ..cols], cols, + info = "dataProcess FeatureLevelData for DDARawData should be identical to previously saved output" +) + + # Test dataProcess with technical replicates & fractions ------------------ msstats_input_fractions_techreps = data.table::fread( system.file("tinytest/processed_data/input_techreps_fractions.csv", @@ -191,11 +257,13 @@ feature_level_summary = summarized[[2]] ## Basic tests expect_equal(nrow(protein_level_summary), 5) -expect_equal(ncol(protein_level_summary), 4) +expect_equal(ncol(protein_level_summary), 5) # Protein, RUN, LogIntensities, Variance, LABEL +expect_true("LABEL" %in% colnames(protein_level_summary), + info = "MSstatsSummarizeSingleLinear: result must include LABEL column") expect_true(all(protein_level_summary$Protein == "Q96S19")) expect_equal(as.numeric(protein_level_summary$RUN), 1:5) expect_equal(nrow(feature_level_summary), 10) -expect_equal(ncol(feature_level_summary), 5) +expect_equal(ncol(feature_level_summary), 6) # newABUNDANCE, cen, RUN, FEATURE, LABEL, predicted expect_equal(length(unique(feature_level_summary$FEATURE)), 2) expect_equal(length(unique(feature_level_summary$RUN)), 5) @@ -278,3 +346,80 @@ expect_equal( "not the raw L values (14, 16) that the unlabeled path would return" ) ) + +# MSstatsSummarizeSingleTMP: SRM imputation — H rows must NOT be imputed ------ +# For SRM experiments, H is the normalization reference and must never be +# imputed. Only censored L rows (is_labeled_ref=FALSE) should receive a +# predicted value from the survival model. + +make_srm_impute_input <- function() { + runs <- paste0("R", 1:4) + levels_rc <- c("0", runs) + f1 <- data.table::data.table( + PROTEIN = "P1", + FEATURE = "F1", + LABEL = c("H","H","H","H", "L","L","L","L"), + RUN = c(runs, runs), + # F1-H-R1 censored (H reference — must NOT be imputed) + # F1-L-R2 censored (light peptide — MUST be imputed) + newABUNDANCE = c(NA, 10.5, 11.0, 11.5, 14.0, NA, 15.0, 15.5), + censored = c(TRUE, FALSE,FALSE,FALSE, FALSE,TRUE, FALSE,FALSE), + cen = c(0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L), + is_labeled_ref = c(TRUE,TRUE,TRUE,TRUE, FALSE,FALSE,FALSE,FALSE) + ) + f2 <- data.table::data.table( + PROTEIN = "P1", + FEATURE = "F2", + LABEL = c("H","H","H","H", "L","L","L","L"), + RUN = c(runs, runs), + newABUNDANCE = c(10.0,10.5,11.0,11.5, 14.0,14.5,15.0,15.5), + censored = rep(FALSE, 8), + cen = rep(1L, 8), + is_labeled_ref = c(TRUE,TRUE,TRUE,TRUE, FALSE,FALSE,FALSE,FALSE) + ) + dt <- data.table::rbindlist(list(f1, f2)) + dt[, ref_covariate := factor( + ifelse(is_labeled_ref == FALSE, as.character(RUN), "0"), + levels = levels_rc + )] + dt[, FEATURE := factor(FEATURE)] + dt[, RUN := factor(RUN)] + dt[, n_obs := 4L] + dt[, n_obs_run := 2L] + dt[, ANOMALYSCORES := NA_real_] + dt +} + +result_srm_imp <- MSstatsSummarizeSingleTMP( + make_srm_impute_input(), + impute = TRUE, + censored_symbol = "NA", + remove50missing = FALSE, + aft_iterations = 90 +) + +survival_srm <- result_srm_imp[[2]] + +# Censored H reference row: predicted must remain NA (not imputed) +h_cens_pred <- survival_srm[ + as.character(FEATURE) == "F1" & + as.character(LABEL) == "H" & + as.character(RUN) == "R1", + predicted +] +expect_true( + length(h_cens_pred) > 0 && all(is.na(h_cens_pred)), + info = "MSstatsSummarizeSingleTMP SRM: censored H rows must NOT receive an imputed predicted value" +) + +# Censored L row: predicted must be a finite imputed value +l_cens_pred <- survival_srm[ + as.character(FEATURE) == "F1" & + as.character(LABEL) == "L" & + as.character(RUN) == "R2", + predicted +] +expect_true( + length(l_cens_pred) > 0 && all(is.finite(l_cens_pred)), + info = "MSstatsSummarizeSingleTMP SRM: censored L rows must receive a finite imputed predicted value" +) diff --git a/inst/tinytest/test_utils_summarization.R b/inst/tinytest/test_utils_summarization.R index 959c8aca..a6858fae 100644 --- a/inst/tinytest/test_utils_summarization.R +++ b/inst/tinytest/test_utils_summarization.R @@ -1,3 +1,4 @@ +# --- .fitLinearModel tests -------------------------------------------------- single_protein <- data.table::data.table( PROTEIN = c( "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19" ), PEPTIDE = c( "AFPLAEWQPSDVDQR_2", "ASGLLLER_2", "AFPLAEWQPSDVDQR_2", "ASGLLLER_2", "AFPLAEWQPSDVDQR_2", "ASGLLLER_2", "AFPLAEWQPSDVDQR_2", "ASGLLLER_2", "AFPLAEWQPSDVDQR_2", "ASGLLLER_2" ), @@ -76,3 +77,93 @@ expect_true( any(grepl("ref_covariate", names(coef(lm_fit)))), info = "ref_covariate: model coefficients must include 'ref_covariate' terms" ) + +# --- .fitTukey(is_labeled_reference=FALSE): result includes LABEL column ----- + +# Multi-feature input — exercises .fitTukey +tukey_mf <- data.table::data.table( + PROTEIN = rep("P1", 16), + FEATURE = factor(rep(c("F1","F2"), each = 8)), + LABEL = rep(rep(c("L","L","L","L","H","H","H","H"), 1), 2), + RUN = factor(rep(c("R1","R2","R3","R4","R1","R2","R3","R4"), 2)), + newABUNDANCE = c(10,11,12,13, 8,9,10,11, 10.2,11.2,12.2,13.2, 8.2,9.2,10.2,11.2) +) + +result_tukey_turnover <- MSstats:::.fitTukey(tukey_mf, is_labeled_reference = FALSE) + +expect_true( + "LABEL" %in% colnames(result_tukey_turnover), + info = ".fitTukey(is_labeled_reference=FALSE): result must include LABEL column" +) +expect_true( + "LogIntensities" %in% colnames(result_tukey_turnover), + info = ".fitTukey(is_labeled_reference=FALSE): result must have LogIntensities column" +) + +result_tukey_srm <- MSstats:::.fitTukey(tukey_mf, is_labeled_reference = TRUE) +expect_true( + "LABEL" %in% colnames(result_tukey_srm) && all(result_tukey_srm$LABEL == "L"), + info = ".fitTukey(is_labeled_reference=TRUE): SRM path must have label column" +) + + +# Single-feature input — exercises the non-.fitTukey branch +tukey_sf <- data.table::data.table( + PROTEIN = rep("P1", 8), + FEATURE = factor(rep("F1", 8)), + LABEL = rep(c("L","H"), each = 4), + RUN = factor(rep(c("R1","R2","R3","R4"), 2)), + newABUNDANCE = c(10, 11, 12, 13, 9, 10, 11, 12) +) + +result_turnover <- MSstats:::.runTukey( + tukey_sf, is_labeled_reference = FALSE, + censored_symbol = NULL, remove50missing = FALSE +) + +expect_true( + "LABEL" %in% colnames(result_turnover), + info = ".runTukey(is_labeled_reference=FALSE): result must have LABEL column" +) +expect_true( + "L" %in% as.character(result_turnover$LABEL), + info = ".runTukey(is_labeled_reference=FALSE): result must contain L rows" +) +expect_true( + "H" %in% as.character(result_turnover$LABEL), + info = ".runTukey(is_labeled_reference=FALSE): result must contain H rows" +) +expect_true( + "LogIntensities" %in% colnames(result_turnover), + info = ".runTukey(is_labeled_reference=FALSE): result must have LogIntensities column" +) + +result_srm <- MSstats:::.runTukey( + tukey_sf, is_labeled_reference = TRUE, + censored_symbol = NULL, remove50missing = FALSE +) +expect_true( + "LABEL" %in% colnames(result_srm) && all(result_srm$LABEL == "L"), + info = ".runTukey(is_labeled_reference=TRUE): SRM path must not return LABEL column" +) + +# --- .getNonMissingFilterStats ----------------------- + +nonmiss_input <- data.table::data.table( + LABEL = c("L","L","H","H"), + newABUNDANCE = c(10, NA, 8, NA), + INTENSITY = c(100L, NA_integer_, 80L, NA_integer_), + censored = c(FALSE, TRUE, FALSE, TRUE), + n_obs_run = c(2L, 2L, 2L, 2L), + n_obs = c(4L, 4L, 4L, 4L) +) + +filter_result <- MSstats:::.getNonMissingFilterStats(nonmiss_input, censored_symbol = "NA") +expect_true( + filter_result[nonmiss_input$LABEL == "L" & !is.na(nonmiss_input$newABUNDANCE)], + info = ".getNonMissingFilterStats: L non-missing row must be TRUE" +) +expect_true( + filter_result[nonmiss_input$LABEL == "H" & !is.na(nonmiss_input$newABUNDANCE)], + info = ".getNonMissingFilterStats: H non-missing row must also be TRUE" +) diff --git a/inst/tinytest/test_utils_summarization_prepare.R b/inst/tinytest/test_utils_summarization_prepare.R index e4f70821..bbaedeed 100644 --- a/inst/tinytest/test_utils_summarization_prepare.R +++ b/inst/tinytest/test_utils_summarization_prepare.R @@ -72,3 +72,65 @@ expect_false( info = "ref_covariate: must NOT be created for label-free (single LABEL) data" ) +# --- .prepareLinear: n_obs computed per PROTEIN+FEATURE+LABEL ---------------- + +make_two_label_input <- function(is_labeled_ref = FALSE) { + data.table::data.table( + PROTEIN = rep("P1", 8), + FEATURE = factor(rep(c("F1", "F2"), each = 4)), + LABEL = rep(c("L","L","H","H"), 2), + RUN = factor(rep(c("R1","R2","R1","R2"), 2)), + ABUNDANCE = c(10, 11, 8, 9, 10.5, 11.5, 8.5, 9.5), + INTENSITY = rep(100L, 8), + ANOMALYSCORES = rep(NA_real_, 8), + is_labeled_ref = is_labeled_ref + ) +} + +result_tmp_unlabeled <- MSstats:::.prepareSummary( + make_two_label_input(), + impute = FALSE, censored_symbol = NULL, + is_labeled_reference = FALSE +) + +# Each FEATURE+LABEL combination has 2 runs → n_obs per label = 2 +expect_equal( + unique(result_tmp_unlabeled[LABEL == "L" & FEATURE == "F1", n_obs]), + 2L, + info = ".prepareTMP(is_labeled_reference=FALSE): n_obs must be per-label (2 L runs)" +) +expect_equal( + unique(result_tmp_unlabeled[LABEL == "H" & FEATURE == "F1", n_obs]), + 2L, + info = ".prepareTMP(is_labeled_reference=FALSE): H n_obs must be counted independently" +) + +# total_features per PROTEIN+LABEL +expect_equal( + unique(result_tmp_unlabeled[LABEL == "L", total_features]), + 2L, + info = ".prepareTMP(is_labeled_reference=FALSE): total_features for L must be 2" +) +expect_equal( + unique(result_tmp_unlabeled[LABEL == "H", total_features]), + 2L, + info = ".prepareTMP(is_labeled_reference=FALSE): total_features for H must be 2" +) + +result_tmp_srm <- MSstats:::.prepareSummary( + make_two_label_input(rep(c(FALSE, FALSE, TRUE, TRUE), 2)), + impute = FALSE, censored_symbol = NULL, + is_labeled_reference = TRUE +) + +# H rows share the L-derived n_obs for their feature (= 2); they must not get 0 +expect_equal( + unique(result_tmp_srm[LABEL == "H", n_obs]), + 2L, + info = ".prepareTMP(is_labeled_reference=TRUE): H rows must share n_obs with L rows (not 0)" +) +expect_true( + all(result_tmp_srm[LABEL == "H", n_obs_run] > 0), + info = ".prepareTMP(is_labeled_reference=TRUE): H rows must have n_obs_run > 0" +) + diff --git a/man/dot-fitTukey.Rd b/man/dot-fitTukey.Rd index 651f63bf..097890a8 100644 --- a/man/dot-fitTukey.Rd +++ b/man/dot-fitTukey.Rd @@ -4,10 +4,16 @@ \alias{.fitTukey} \title{Fit tukey median polish for a data matrix} \usage{ -.fitTukey(input) +.fitTukey(input, is_labeled_reference) } \arguments{ \item{input}{data.table with data for a single protein} + +\item{is_labeled_reference}{logical, if TRUE, H channel is used as a +normalization reference (SRM experiment): L abundances are adjusted by +subtracting the H value and adding back the H median, and only L results +are returned. If FALSE (e.g. protein turnover), each label is summarized +independently and results for all labels are returned.} } \value{ data.table diff --git a/man/dot-prepareLinear.Rd b/man/dot-prepareLinear.Rd deleted file mode 100644 index 4a4f3b4e..00000000 --- a/man/dot-prepareLinear.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils_summarization_prepare.R -\name{.prepareLinear} -\alias{.prepareLinear} -\title{Prepare feature-level data for linear summarization} -\usage{ -.prepareLinear(input, impute, censored_symbol) -} -\arguments{ -\item{input}{data.table} - -\item{impute}{logical} - -\item{censored_symbol}{"0"/"NA"} -} -\value{ -data.table -} -\description{ -Prepare feature-level data for linear summarization -} -\keyword{internal} diff --git a/man/dot-prepareSummary.Rd b/man/dot-prepareSummary.Rd index 7be7b47f..710961c4 100644 --- a/man/dot-prepareSummary.Rd +++ b/man/dot-prepareSummary.Rd @@ -4,16 +4,19 @@ \alias{.prepareSummary} \title{Prepare feature-level data for summarization} \usage{ -.prepareSummary(input, method, impute, censored_symbol) +.prepareSummary(input, impute, censored_symbol, is_labeled_reference = FALSE) } \arguments{ \item{input}{data.table} -\item{method}{"TMP" / "linear"} - \item{impute}{logical} \item{censored_symbol}{"0"/"NA"} + +\item{is_labeled_reference}{logical, if TRUE the H channel is a normalization +reference (SRM) and grouping keys do not include LABEL; if FALSE (e.g. +protein turnover) LABEL is added to grouping keys so each label is +processed independently.} } \value{ data.table diff --git a/man/dot-prepareTMP.Rd b/man/dot-prepareTMP.Rd deleted file mode 100644 index 506f083f..00000000 --- a/man/dot-prepareTMP.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils_summarization_prepare.R -\name{.prepareTMP} -\alias{.prepareTMP} -\title{Prepare feature-level data for TMP summarization} -\usage{ -.prepareTMP(input, impute, censored_symbol) -} -\arguments{ -\item{input}{data.table} - -\item{impute}{logical} - -\item{censored_symbol}{"0"/"NA"} -} -\value{ -data.table -} -\description{ -Prepare feature-level data for TMP summarization -} -\keyword{internal} diff --git a/man/dot-runTukey.Rd b/man/dot-runTukey.Rd index 038d2911..76b6ab1b 100644 --- a/man/dot-runTukey.Rd +++ b/man/dot-runTukey.Rd @@ -4,12 +4,16 @@ \alias{.runTukey} \title{Fit Tukey median polish} \usage{ -.runTukey(input, is_labeled, censored_symbol, remove50missing) +.runTukey(input, is_labeled_reference, censored_symbol, remove50missing) } \arguments{ \item{input}{data.table with data for a single protein} -\item{is_labeled}{logical, if TRUE, data is coming from an SRM experiment} +\item{is_labeled_reference}{logical, if TRUE, H channel is used as a +normalization reference (SRM experiment): L abundances are adjusted by +subtracting the H value and adding back the H median, and only L results +are returned. If FALSE (e.g. protein turnover), each label is summarized +independently and results for all labels are returned.} \item{censored_symbol}{Missing values are censored or at random. 'NA' (default) assumes that all 'NA's in 'Intensity' column are censored.