Vitek-Lab · devonjkohler · Sep 10, 2025 · Apr 4, 2025 · Apr 13, 2025 · Apr 13, 2025
diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -9,8 +9,8 @@ make_contrast_run_quant <- function(input, coefs, contrast_matrix, counts, is_la
     .Call(`_MSstats_make_contrast_run_quant`, input, coefs, contrast_matrix, counts, is_labeled, is_reference)
 }
 
-get_linear_summary <- function(input, coefs, counts, is_labeled) {
-    .Call(`_MSstats_get_linear_summary`, input, coefs, counts, is_labeled)
+get_linear_summary <- function(input, coefs, counts, is_labeled, cov_mat) {
+    .Call(`_MSstats_get_linear_summary`, input, coefs, counts, is_labeled, cov_mat)
 }
 
 median_polish_summary <- function(x, eps = 0.01, maxiter = 10L) {

diff --git a/R/dataProcess.R b/R/dataProcess.R
@@ -33,7 +33,8 @@
 #' Smaller numbers can be set to improve processing times. This option is by default on 
 #' at a high number (100) to improve processing times without affecting differential analysis.
 #' @param summaryMethod "TMP" (default) means Tukey's median polish, 
-#' which is robust estimation method. "linear" uses linear mixed model.
+#' which is a robust estimation method. "linear" uses a linear mixed model. If 
+#' the anomaly detection algorithm is performed, "linear" must be used.
 #' @param equalFeatureVar only for summaryMethod = "linear". default is TRUE. 
 #' Logical variable for whether the model should account for heterogeneous variation 
 #' among intensities from different features. Default is TRUE, which assume equal 
@@ -58,6 +59,7 @@
 #' @param numberOfCores Number of cores for parallel processing. When > 1, 
 #' a logfile named `MSstats_dataProcess_log_progress.log` is created to 
 #' track progress. Only works for Linux & Mac OS. Default is 1.
+#' @param aft_iterations Number of iterations for AFT model fitting. Default is 90.
 #' @inheritParams .documentFunction
 #' 
 #' @importFrom utils sessionInfo
@@ -126,7 +128,7 @@ dataProcess = function(
     equalFeatureVar = TRUE, censoredInt = "NA", MBimpute = TRUE, 
     remove50missing = FALSE, fix_missing = NULL, maxQuantileforCensored = 0.999, 
     use_log_file = TRUE, append = FALSE, verbose = TRUE, log_file_path = NULL,
-    numberOfCores = 1
+    numberOfCores = 1, aft_iterations=90
 ) {
     MSstatsConvert::MSstatsLogsSettings(use_log_file, append, verbose, 
                                         log_file_path,
@@ -137,7 +139,8 @@ dataProcess = function(
         list(method = featureSubset, n_top = n_top_feature,
              remove_uninformative = remove_uninformative_feature_outlier),
         list(method = summaryMethod, equal_var = equalFeatureVar),
-        list(symbol = censoredInt, MB = MBimpute))
+        list(symbol = censoredInt, MB = MBimpute),
+        colnames(raw))
 
     peptides_dict = makePeptidesDictionary(as.data.table(unclass(raw)), normalization)
     input = MSstatsPrepareForDataProcess(raw, logTrans, fix_missing)
@@ -159,7 +162,7 @@ dataProcess = function(
     summarized = tryCatch(MSstatsSummarizeWithMultipleCores(input, summaryMethod,
                                            MBimpute, censoredInt, 
                                            remove50missing, equalFeatureVar, 
-                                           numberOfCores),
+                                           numberOfCores, aft_iterations),
                           error = function(e) {
                               print(e)
                               NULL
@@ -197,13 +200,15 @@ dataProcess = function(
 #' @param numberOfCores Number of cores for parallel processing. When > 1, 
 #' a logfile named `MSstats_dataProcess_log_progress.log` is created to 
 #' track progress. Only works for Linux & Mac OS. Default is 1.
+#' @param aft_iterations Number of iterations for AFT model fitting. Default is 90.
 #' 
-#' @importFrom parallel makeCluster parLapply stopCluster clusterExport  
+#' @importFrom parallel makeCluster parLapply stopCluster clusterExport
 #' 
 #' @return list of length one with run-level data.
 #' 
 MSstatsSummarizeWithMultipleCores = function(input, method, impute, censored_symbol,
-                              remove50missing, equal_variance, numberOfCores = 1) {
+                              remove50missing, equal_variance, numberOfCores = 1,
+                              aft_iterations = 90) {
     if (numberOfCores > 1) {
         protein_indices = split(seq_len(nrow(input)), list(input$PROTEIN))
         num_proteins = length(protein_indices)
@@ -227,7 +232,8 @@ MSstatsSummarizeWithMultipleCores = function(input, method, impute, censored_sym
                 }
                 single_protein = input[protein_indices[[i]],]
                 MSstatsSummarizeSingleTMP(
-                    single_protein, impute, censored_symbol, remove50missing)
+                    single_protein, impute, censored_symbol, remove50missing,
+                    aft_iterations)
             })
         } else {
             summarized_results = parallel::parLapply(cl, seq_len(num_proteins), function(i) {
@@ -236,20 +242,29 @@ MSstatsSummarizeWithMultipleCores = function(input, method, impute, censored_sym
                         sep = "\n", file = "MSstats_dataProcess_log_progress.log", append = TRUE)
                 }
                 single_protein = input[protein_indices[[i]],]
-                MSstatsSummarizeSingleLinear(single_protein, equal_variance)
+                # Forward equal_variance; without it the linear summary
+                # always falls back to its default of equal variances.
+                MSstatsSummarizeSingleLinear(
+                    single_protein, impute, censored_symbol, 
+                    remove50missing, aft_iterations,
+                    equal_variances = equal_variance)
             })
         }
         parallel::stopCluster(cl)
         return(summarized_results)
     } else {
-        return(MSstatsSummarizeWithSingleCore(input, method, impute, censored_symbol, 
-                                remove50missing, equal_variance))
+        return(MSstatsSummarizeWithSingleCore(input, method, impute, 
+                                              censored_symbol, 
+                                              remove50missing, 
+                                              equal_variance,
+                                              aft_iterations))
     }
 }
 
 #' Feature-level data summarization with 1 core
 #' 
 #' @inheritParams MSstatsSummarizeWithMultipleCores
+#' @param aft_iterations Number of iterations for AFT model fitting. Default is 90.
 #' 
 #' @importFrom data.table uniqueN
 #' @importFrom utils setTxtProgressBar
@@ -271,12 +286,12 @@ MSstatsSummarizeWithMultipleCores = function(input, method, impute, censored_sym
 #' input = MSstatsSelectFeatures(input, "all")
 #' processed = getProcessed(input)
 #' input = MSstatsPrepareForSummarization(input, method, impute, cens, FALSE)
-#' summarized = MSstatsSummarizeWithSingleCore(input, method, impute, cens, FALSE, TRUE)
+#' summarized = MSstatsSummarizeWithSingleCore(input, method, impute, cens, FALSE, TRUE, 100)
 #' length(summarized) # list of summarization outputs for each protein
 #' head(summarized[[1]][[1]]) # run-level summary
 #' 
 MSstatsSummarizeWithSingleCore = function(input, method, impute, censored_symbol,
-                            remove50missing, equal_variance) {
+                            remove50missing, equal_variance, aft_iterations = 90) {
 
 
     protein_indices = split(seq_len(nrow(input)), list(input$PROTEIN))
@@ -287,16 +302,19 @@ MSstatsSummarizeWithSingleCore = function(input, method, impute, censored_symbol
         for (protein_id in seq_len(num_proteins)) {
             single_protein = input[protein_indices[[protein_id]],]
             summarized_results[[protein_id]] = MSstatsSummarizeSingleTMP(
-                single_protein, impute, censored_symbol, remove50missing)
+                single_protein, impute, censored_symbol, remove50missing, 
+                aft_iterations)
             setTxtProgressBar(pb, protein_id)
         }
         close(pb)
     } else {
         pb = utils::txtProgressBar(min = 0, max = num_proteins, style = 3)
         for (protein_id in seq_len(num_proteins)) {
             single_protein = input[protein_indices[[protein_id]],]
-            summarized_result = MSstatsSummarizeSingleLinear(single_protein,
-                                                             equal_variance)
+            # Pass equal_variance through; otherwise it is silently ignored.
+            summarized_result = MSstatsSummarizeSingleLinear(
+                single_protein, impute, censored_symbol, remove50missing, 
+                aft_iterations, equal_variances = equal_variance)
             summarized_results[[protein_id]] = summarized_result
             setTxtProgressBar(pb, protein_id)
         }
@@ -308,6 +326,10 @@ MSstatsSummarizeWithSingleCore = function(input, method, impute, censored_symbol
 #' Linear model-based summarization for a single protein
 #' 
 #' @param single_protein feature-level data for a single protein
+#' @param impute boolean for whether imputation should be performed
+#' @param censored_symbol Character string indicating how censored values are represented 
+#' @param remove50missing if TRUE, proteins with more than 50\% missing values in each run are removed
+#' @param aft_iterations number of iterations for AFT model fitting
 #' @param equal_variances if TRUE, observation are assumed to be homoskedastic
 #' 
 #' @return list with protein-level data
@@ -330,14 +352,74 @@ MSstatsSummarizeWithSingleCore = function(input, method, impute, censored_symbol
 #' input = MSstatsSelectFeatures(input, "all")
 #' input = MSstatsPrepareForSummarization(input, method, impute, cens, FALSE)
 #' input_split = split(input, input$PROTEIN)
-#' single_protein_summary = MSstatsSummarizeSingleLinear(input_split[[1]])
+#' single_protein_summary = MSstatsSummarizeSingleLinear(input_split[[1]], impute, cens, TRUE, 100)
 #' head(single_protein_summary[[1]])
 #' 
-MSstatsSummarizeSingleLinear = function(single_protein, equal_variances = TRUE) {
+MSstatsSummarizeSingleLinear = function(single_protein, 
+                                        impute,
+                                        censored_symbol, 
+                                        remove50missing, 
+                                        aft_iterations = 90,
+                                        equal_variances = TRUE) {
     ABUNDANCE = RUN = FEATURE = PROTEIN = LogIntensities = NULL
 
+    cols = intersect(colnames(single_protein), c("newABUNDANCE", "cen", "RUN",
+                                                 "FEATURE", "ref"))
+    single_protein = single_protein[(n_obs > 1 & !is.na(n_obs)) &
+                                        (n_obs_run > 0 & !is.na(n_obs_run))]
+    if (nrow(single_protein) == 0) {
+        return(list(NULL, NULL))
+    }
+    single_protein[, RUN := factor(RUN)]
+    single_protein[, FEATURE := factor(FEATURE)]
+    # Replace censored light-label values with survival-model predictions.
+    if (impute & any(single_protein[["censored"]])) {
+        survival_fit = .fitSurvival(single_protein[LABEL == "L", cols,
+                                                   with = FALSE],
+                                    aft_iterations)
+        if (!is.null(survival_fit)) {
+            sigma2 = survival_fit$scale^2
+            single_protein[, c("predicted", "imputation_var") := {
+                pred = predict(survival_fit, newdata = .SD, se.fit = TRUE)
+                list(pred$fit, pred$se.fit^2 + sigma2)
+            }]
+        } else {
+            # .fitSurvival returned NULL: log it and leave predictions as NA.
+            msg = paste("*** warning: Survival model fitting failed for protein",
+                        unique(single_protein$PROTEIN))
+            getOption("MSstatsLog")("WARN", msg)
+            single_protein[, c("predicted", "imputation_var") := list(NA_real_, NA_real_)]
+        }
+        single_protein[, predicted := ifelse(censored & (LABEL == "L"), 
+                                             predicted, NA)]
+        single_protein[, newABUNDANCE := ifelse(censored & LABEL == "L",
+                                                predicted, newABUNDANCE)]
+        survival = single_protein[, c(cols, "predicted"), with = FALSE]
+    } else {
+        survival = single_protein[, cols, with = FALSE]
+        survival[, predicted := NA]
+    }
+
+    # Weights are 1 / ANOMALYSCORES, so scores must be positive and finite.
+    if (all(!is.na(single_protein$ANOMALYSCORES))) {
+        if (any(single_protein$ANOMALYSCORES <= 0 |
+                is.infinite(single_protein$ANOMALYSCORES))) {
+            msg = paste(
+                "*** warning: Invalid anomaly scores detected for protein",
+                unique(single_protein$PROTEIN),
+                "- weights will not be applied"
+            )
+            getOption("MSstatsLog")("WARN", msg)
+            single_protein$weights = NA
+        } else {
+            single_protein$weights = 1 / single_protein$ANOMALYSCORES
+        }
+    } else {
+        single_protein$weights = NA
+    }
+
     label = data.table::uniqueN(single_protein$LABEL) > 1
-    single_protein = single_protein[!is.na(ABUNDANCE)]
+    single_protein = single_protein[!is.na(newABUNDANCE)]
     single_protein[, RUN := factor(RUN)]
     single_protein[, FEATURE := factor(FEATURE)]
 
@@ -346,28 +407,39 @@ MSstatsSummarizeSingleLinear = function(single_protein, equal_variances = TRUE)
     counts = as.matrix(counts)
     is_single_feature = .checkSingleFeature(single_protein)
 
-    fit = try(.fitLinearModel(single_protein, is_single_feature, is_labeled = label, 
-                              equal_variances), silent = TRUE)
+    # Keep try(): the branch below relies on a "try-error" result so a failed
+    # fit is logged and returned as NULL instead of raising an error here.
+    fit = try(
+        .fitLinearModel(single_protein, is_single_feature, 
+                        is_labeled = label, equal_variances),
+        silent = TRUE
+    )
 
     if (inherits(fit, "try-error")) {
-        msg = paste("*** error : can't fit the model for ", unique(single_protein$PROTEIN))
+        msg = paste("*** error : can't fit the model for ", 
+                    unique(single_protein$PROTEIN))
         getOption("MSstatsLog")("WARN", msg)
         getOption("MSstatsMsg")("WARN", msg)
         result = NULL
     } else {
         cf = summary(fit)$coefficients[, 1]
+        cov_mat = vcov(fit)
+
         result = unique(single_protein[, list(Protein = PROTEIN, RUN = RUN)])
-        log_intensities = get_linear_summary(single_protein, cf,
-                                             counts, label)
-        result[, LogIntensities := log_intensities]
+        extracted_values = get_linear_summary(single_protein, cf,
+                                             counts, label, cov_mat)
+        # extracted_values = get_run_estimates_simple(fit, single_protein, counts)
+
+        result = cbind(result, extracted_values)
     }
-    list(result)
+    list(result, survival)
 }
 
 
 #' Tukey Median Polish summarization for a single protein
 #' 
 #' @param single_protein feature-level data for a single protein
+#' @param aft_iterations number of iterations for AFT model fitting
 #' @inheritParams MSstatsSummarizeWithSingleCore
 #' 
 #' @return list of two data.tables: one with fitted survival model,
@@ -392,11 +460,11 @@ MSstatsSummarizeSingleLinear = function(single_protein, equal_variances = TRUE)
 #' input = MSstatsPrepareForSummarization(input, method, impute, cens, FALSE)
 #' input_split = split(input, input$PROTEIN)
 #' single_protein_summary = MSstatsSummarizeSingleTMP(input_split[[1]],
-#'                                                    impute, cens, FALSE)
+#'                                                    impute, cens, FALSE, 100)
 #' head(single_protein_summary[[1]])
 #' 
 MSstatsSummarizeSingleTMP = function(single_protein, impute, censored_symbol, 
-                                     remove50missing) {
+                                     remove50missing, aft_iterations = 90) {
     newABUNDANCE = n_obs = n_obs_run = RUN = FEATURE = LABEL = NULL
     predicted = censored = NULL
     cols = intersect(colnames(single_protein), c("newABUNDANCE", "cen", "RUN",
@@ -409,10 +477,27 @@ MSstatsSummarizeSingleTMP = function(single_protein, impute, censored_symbol,
     single_protein[, RUN := factor(RUN)]
     single_protein[, FEATURE := factor(FEATURE)]
     if (impute & any(single_protein[["censored"]])) {
-        survival_fit = .fitSurvival(single_protein[LABEL == "L", cols,
-                                                   with = FALSE])
-        single_protein[, predicted := predict(survival_fit,
-                                              newdata = .SD)]
+
+        # Flag to track convergence warning
+        converged = TRUE
+
+        # Try to fit survival model and catch convergence warnings
+        survival_fit = withCallingHandlers({
+            .fitSurvival(single_protein[LABEL == "L", cols, with = FALSE], 
+                         aft_iterations)
+        }, warning = function(w) {
+            if (grepl("converge", conditionMessage(w), ignore.case = TRUE)) {
+                message("Convergence warning caught: ", conditionMessage(w))
+                converged <<- FALSE
+            }
+        })
+
+        if (converged) {
+            single_protein[, predicted := predict(survival_fit, newdata = .SD)]
+        } else {
+            single_protein[, predicted := NA_real_]
+        }
+
         single_protein[, predicted := ifelse(censored & (LABEL == "L"), predicted, NA)]
         single_protein[, newABUNDANCE := ifelse(censored & LABEL == "L",
                                                 predicted, newABUNDANCE)]

diff --git a/R/groupComparison.R b/R/groupComparison.R
@@ -99,12 +99,13 @@ groupComparison = function(contrast.matrix, data,
     samples_info = getSamplesInfo(data)
     groups = unique(data$ProteinLevelData$GROUP)
     contrast_matrix = MSstatsContrastMatrix(contrast.matrix, groups)
-    getOption("MSstatsLog")("INFO",
-                            "== Start to test and get inference in whole plot")
-    getOption("MSstatsMsg")("INFO",
-                            " == Start to test and get inference in whole plot ...")
+    getOption("MSstatsLog")(
+        "INFO", "== Start to test and get inference in whole plot")
+    getOption("MSstatsMsg")(
+        "INFO", " == Start to test and get inference in whole plot ...")
     testing_results = MSstatsGroupComparison(split_summarized, contrast_matrix,
-                                             save_fitted_models, repeated, samples_info, 
+                                             save_fitted_models, 
+                                             repeated, samples_info, 
                                              numberOfCores)
     getOption("MSstatsLog")("INFO",
                             "== Comparisons for all proteins are done.")
@@ -179,13 +180,14 @@ MSstatsGroupComparison = function(summarized_list, contrast_matrix,
                                   save_fitted_models, repeated, samples_info, 
                                   numberOfCores = 1) {
     if (numberOfCores > 1) {
-        return(.groupComparisonWithMultipleCores(summarized_list, contrast_matrix, 
+        return(.groupComparisonWithMultipleCores(summarized_list, 
+                                                 contrast_matrix,
                                                  save_fitted_models, repeated, 
                                                  samples_info, numberOfCores))
     } else {
         return(.groupComparisonWithSingleCore(summarized_list, contrast_matrix, 
-                                              save_fitted_models, repeated, 
-                                              samples_info))
+                                              save_fitted_models, 
+                                              repeated, samples_info))
     }
 }
 
@@ -276,16 +278,17 @@ MSstatsGroupComparisonOutput = function(input, summarization_output, log_base =
 #' single_output # same as a single element of MSstatsGroupComparison output
 #' 
 MSstatsGroupComparisonSingleProtein = function(single_protein, contrast_matrix,
-                                               repeated, groups, samples_info,
-                                               save_fitted_models,
+                                               repeated, groups, 
+                                               samples_info, save_fitted_models,
                                                has_imputed) {
     single_protein = .prepareSingleProteinForGC(single_protein)
     is_single_subject = .checkSingleSubject(single_protein)
     has_tech_reps = .checkTechReplicate(single_protein)
 
     fitted_model = try(.fitModelSingleProtein(single_protein, contrast_matrix,
-                                              has_tech_reps, is_single_subject,
-                                              repeated, groups, samples_info,
+                                              has_tech_reps, 
+                                              is_single_subject, repeated, 
+                                              groups, samples_info,
                                               save_fitted_models, has_imputed),
                        silent = TRUE)
     if (inherits(fitted_model, "try-error")) {