Vitek-Lab · devonjkohler · Sep 9, 2025 · Aug 26, 2025 · Sep 5, 2025 · Sep 8, 2025
diff --git a/R/dataProcess.R b/R/dataProcess.R
@@ -136,7 +136,8 @@ dataProcess = function(
         list(method = featureSubset, n_top = n_top_feature,
              remove_uninformative = remove_uninformative_feature_outlier),
         list(method = summaryMethod, equal_var = equalFeatureVar),
-        list(symbol = censoredInt, MB = MBimpute))
+        list(symbol = censoredInt, MB = MBimpute),
+        colnames(raw))
 
     peptides_dict = makePeptidesDictionary(as.data.table(unclass(raw)), normalization)
     input = MSstatsPrepareForDataProcess(raw, logTrans, fix_missing)
@@ -203,7 +204,7 @@ dataProcess = function(
 #' 
 MSstatsSummarizeWithMultipleCores = function(input, method, impute, censored_symbol,
                               remove50missing, equal_variance, numberOfCores = 1,
-                              aft_iterations) {
+                              aft_iterations = 90) {
     if (numberOfCores > 1) {
         protein_indices = split(seq_len(nrow(input)), list(input$PROTEIN))
         num_proteins = length(protein_indices)
@@ -285,7 +286,7 @@ MSstatsSummarizeWithMultipleCores = function(input, method, impute, censored_sym
 #' head(summarized[[1]][[1]]) # run-level summary
 #' 
 MSstatsSummarizeWithSingleCore = function(input, method, impute, censored_symbol,
-                            remove50missing, equal_variance, aft_iterations) {
+                            remove50missing, equal_variance, aft_iterations = 90) {
 
 
     protein_indices = split(seq_len(nrow(input)), list(input$PROTEIN))
@@ -419,7 +420,7 @@ MSstatsSummarizeSingleLinear = function(single_protein,
                                         impute,
                                         censored_symbol, 
                                         remove50missing, 
-                                        aft_iterations,
+                                        aft_iterations = 90,
                                         equal_variances = TRUE) {
     ABUNDANCE = RUN = FEATURE = PROTEIN = LogIntensities = NULL
 
@@ -523,7 +524,7 @@ MSstatsSummarizeSingleLinear = function(single_protein,
 #' head(single_protein_summary[[1]])
 #' 
 MSstatsSummarizeSingleTMP = function(single_protein, impute, censored_symbol, 
-                                     remove50missing, aft_iterations) {
+                                     remove50missing, aft_iterations = 90) {
     newABUNDANCE = n_obs = n_obs_run = RUN = FEATURE = LABEL = NULL
     predicted = censored = NULL
     cols = intersect(colnames(single_protein), c("newABUNDANCE", "cen", "RUN",

diff --git a/R/utils_checks.R b/R/utils_checks.R
@@ -86,15 +86,23 @@ MSstatsPrepareForDataProcess = function(input, log_base, fix_missing) {
 #' @param feature_selection list with elements: remove_uninformative
 #' @param summarization list with elements: method.
 #' @param imputation list with elements: cutoff, symbol.
+#' @param input_columns character vector of input columns
 #' @keywords internal
 .checkDataProcessParams = function(log_base, normalization_method,
                                    standards_names, feature_selection, 
-                                   summarization, imputation) {
+                                   summarization, imputation, input_columns) {
     checkmate::assertChoice(log_base, c(2, 10), .var.name = "logTrans")
     checkmate::assertChoice(summarization$method, c("linear", "TMP"),
-                            .var.name = "summaryMethod") 
+                            .var.name = "summaryMethod")
     getOption("MSstatsLog")("INFO", paste("Summary method:", 
                                           summarization$method))
+    if ("AnomalyScores" %in% input_columns) {
+        if (summarization$method != "linear") {
+            stop("AnomalyScores column detected in your input columns.  ",
+                 "Please set summaryMethod to 'linear' to use anomaly scores for protein summarization, ",
+                 "or remove the AnomalyScores column from your input data.")
+        }
+    }
     checkmate::assertChoice(imputation$symbol, c("0", "NA"), 
                             null.ok = TRUE, .var.name = "censoredInt")
     getOption("MSstatsLog")("INFO", paste("censoredInt:", imputation$symbol))

diff --git a/inst/tinytest/test_dataProcess.R b/inst/tinytest/test_dataProcess.R
@@ -24,4 +24,187 @@ msstats_input_fractions_techreps = data.table::fread(
 QuantDataTechRepsFractions = dataProcess(msstats_input_fractions_techreps, 
                                          use_log_file = FALSE)
 expect_true(!is.null(QuantDataTechRepsFractions))
-expect_true(nrow(QuantDataTechRepsFractions$FeatureLevelData) > 0)
+expect_true(nrow(QuantDataTechRepsFractions$FeatureLevelData) > 0)
+
+# Test dataProcess with linear summary method and anomaly scores --------------
+msstats_input = data.table::data.table(
+    ProteinName = c(rep("Q9UFW8", 10), rep("Q96S19", 15)),
+    PeptideSequence = c(rep("AEFEEQNVR", 5), rep("TALYVTPLDR", 5),
+                        rep("AFPLAEWQPSDVDQR", 5), rep("ASGLLLER", 5),
+                        rep("LowAbundancePeptide", 5)),
+    PrecursorCharge = rep(2, 25),
+    FragmentIon = rep("y3", 25),
+    ProductCharge = rep(1, 25),
+    IsotopeLabelType = rep("L", 25),
+    Condition = rep(c("A", "A", "A", "B", "B"), 5),
+    BioReplicate = rep(seq(1:5), 5),
+    Run = rep(paste0("Run", seq(1:5)), 5),
+    Intensity = c(1000, 1500, 2000, 2500, 3000,
+                  1100, 1600, 2100, 2600, 3100,
+                  1200, 1700, 2200, 2000, 1700,
+                  1300, 1800, 1200, 2800, 1800,
+                  100, 200, 300, 400, 500),
+    AnomalyScores = c(rep(0.03, 10), c(0.01,0.01,0.01,0.9,0.8), c(0.01,0.01,0.01,0.7,0.5), rep(0.10, 5))
+)
+
+# Must run with linear summarization
+expect_error(
+    dataProcess(msstats_input, summaryMethod="TMP"),
+    pattern = "AnomalyScores column detected in your input columns.*set summaryMethod to 'linear'",
+    info = "Should error when AnomalyScores is present but summaryMethod is not 'linear'"
+)
+
+# Process data with linear summarization
+QuantDataLinearAnomaly = dataProcess(msstats_input,
+                                     normalization=FALSE, # no normalization
+                                     MBimpute=TRUE, # Use imputation
+                                     summaryMethod="linear" # Key MSstats+ parameter
+)
+
+expect_true("Variance" %in% colnames(QuantDataLinearAnomaly$ProteinLevelData),
+            info = "Variance column should be present in ProteinLevelData")
+protein_data = QuantDataLinearAnomaly$ProteinLevelData
+expect_true(all(protein_data$Variance > 0, na.rm = TRUE),
+            info = "All variance values should be positive")
+expect_true(all(is.finite(protein_data$Variance), na.rm = TRUE),
+            info = "All variance values should be finite")
+variance_values = protein_data[protein_data$Protein == "Q9UFW8"]$Variance
+expect_true(all(variance_values == variance_values[1]),
+            info = paste("Not all variance values are equal for Q9UFW8"))
+
+# Create comparison dataset with low anomaly scores for contrast
+msstats_input_low_anomaly = data.table::data.table(
+    ProteinName = c(rep("Q9UFW8", 10), rep("Q96S19", 15)),
+    PeptideSequence = c(rep("AEFEEQNVR", 5), rep("TALYVTPLDR", 5),
+                        rep("AFPLAEWQPSDVDQR", 5), rep("ASGLLLER", 5),
+                        rep("LowAbundancePeptide", 5)),
+    PrecursorCharge = rep(2, 25),
+    FragmentIon = rep("y3", 25),
+    ProductCharge = rep(1, 25),
+    IsotopeLabelType = rep("L", 25),
+    Condition = rep(c("A", "A", "A", "B", "B"), 5),
+    BioReplicate = rep(seq(1:5), 5),
+    Run = rep(paste0("Run", seq(1:5)), 5),
+    Intensity = c(1000, 1500, 2000, 2500, 3000,
+                  1100, 1600, 2100, 2600, 3100,
+                  1200, 1700, 2200, 2000, 1700,
+                  1300, 1800, 1200, 2800, 1800,
+                  100, 200, 300, 400, 500),
+    AnomalyScores = rep(0.01, 25) # All low anomaly scores
+)
+
+QuantDataLowAnomaly = dataProcess(msstats_input_low_anomaly,
+                                  normalization=FALSE,
+                                  MBimpute=TRUE,
+                                  summaryMethod="linear")
+high_anomaly_variance = protein_data$Variance
+low_anomaly_variance = QuantDataLowAnomaly$ProteinLevelData$Variance
+expect_true(mean(high_anomaly_variance, na.rm = TRUE) > mean(low_anomaly_variance, na.rm = TRUE),
+            info = "Mean variance should be higher for dataset with high anomaly scores")
+
+
+# Verify that without AnomalyScores, different behavior occurs
+msstats_input_no_anomaly = msstats_input[, !c("AnomalyScores")]
+QuantDataNoAnomaly = dataProcess(msstats_input_no_anomaly,
+                                 normalization=FALSE,
+                                 MBimpute=TRUE,
+                                 summaryMethod="linear")
+no_anomaly_variance = QuantDataNoAnomaly$ProteinLevelData$Variance
+expect_false(identical(high_anomaly_variance, no_anomaly_variance),
+             info = "Variance should be different when AnomalyScores are present vs absent")
+
+
+
+# Test MSstatsSummarizeSingleLinear function ------------------------------
+
+single_protein = data.table::data.table(
+    PROTEIN = c( "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19" ),
+    PEPTIDE = c( "AFPLAEWQPSDVDQR_2", "ASGLLLER_2", "LowAbundancePeptide_2", "AFPLAEWQPSDVDQR_2", "ASGLLLER_2", "LowAbundancePeptide_2", "AFPLAEWQPSDVDQR_2", "ASGLLLER_2", "LowAbundancePeptide_2", "AFPLAEWQPSDVDQR_2", "ASGLLLER_2", "LowAbundancePeptide_2", "AFPLAEWQPSDVDQR_2", "ASGLLLER_2", "LowAbundancePeptide_2" ),
+    TRANSITION = c( "y3_1", "y3_1", "y3_1", "y3_1", "y3_1", "y3_1", "y3_1", "y3_1", "y3_1", "y3_1", "y3_1", "y3_1", "y3_1", "y3_1", "y3_1" ),
+    FEATURE = c( "AFPLAEWQPSDVDQR_2_y3_1", "ASGLLLER_2_y3_1", "LowAbundancePeptide_2_y3_1", "AFPLAEWQPSDVDQR_2_y3_1", "ASGLLLER_2_y3_1", "LowAbundancePeptide_2_y3_1", "AFPLAEWQPSDVDQR_2_y3_1", "ASGLLLER_2_y3_1", "LowAbundancePeptide_2_y3_1", "AFPLAEWQPSDVDQR_2_y3_1", "ASGLLLER_2_y3_1", "LowAbundancePeptide_2_y3_1", "AFPLAEWQPSDVDQR_2_y3_1", "ASGLLLER_2_y3_1", "LowAbundancePeptide_2_y3_1" ),
+    LABEL = c( "L", "L", "L", "L", "L", "L", "L", "L", "L", "L", "L", "L", "L", "L", "L" ),
+    GROUP_ORIGINAL = c( "A", "A", "A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "B", "B" ),
+    SUBJECT_ORIGINAL = c( 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5 ),
+    RUN = c( "1", "1", "1", "2", "2", "2", "3", "3", "3", "4", "4", "4", "5", "5", "5" ),
+    GROUP = c( "A", "A", "A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "B", "B" ),
+    SUBJECT = c( "1", "1", "1", "2", "2", "2", "3", "3", "3", "4", "4", "4", "5", "5", "5" ),
+    FRACTION = c( 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ),
+    INTENSITY = c( 1200, 1300, 100, 1700, 1800, 200, 2200, 1200, 300, 2000, 2800, 400, 1700, 1800, 500 ),
+    ANOMALYSCORES = c( 0.01, 0.01, 0.1, 0.01, 0.01, 0.1, 0.01, 0.01, 0.1, 0.9, 0.7, 0.1, 0.8, 0.5, 0.1 ),
+    ABUNDANCE = c( 10.229, 10.344, 6.644, 10.731, 10.814, 7.644, 11.103, 10.229, 8.229, 10.966, 11.451, 8.644, 10.731, 10.814, 8.966 ),
+    originalRUN = c( "Run1", "Run1", "Run1", "Run2", "Run2", "Run2", "Run3", "Run3", "Run3", "Run4", "Run4", "Run4", "Run5", "Run5", "Run5" ),
+    censored = c( FALSE, FALSE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, TRUE ),
+    newABUNDANCE = c( 10.229, 10.344, NA, 10.731, 10.814, NA, 11.103, 10.229, NA, 10.966, 11.451, NA, 10.731, 10.814, NA ),
+    cen = c( 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0 ),
+    nonmissing = c( TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, FALSE ),
+    n_obs = c( 5, 5, 0, 5, 5, 0, 5, 5, 0, 5, 5, 0, 5, 5, 0 ),
+    n_obs_run = c( 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 ),
+    total_features = c( 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 ),
+    prop_features = c( 0.667, 0.667, 0.667, 0.667, 0.667, 0.667, 0.667, 0.667, 0.667, 0.667, 0.667, 0.667, 0.667, 0.667, 0.667 ),
+    nonmissing_all = c( TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, FALSE ),
+    ABUNDANCE_cut = c( 10.127, 10.127, NA, 10.127, 10.127, NA, 10.127, 10.127, NA, 10.127, 10.127, NA, 10.127, 10.127, NA ),
+    any_censored = c( FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE )
+)
+
+summarized = MSstats::MSstatsSummarizeSingleLinear(
+    single_protein,
+    impute = FALSE,
+    censored_symbol = "NA", 
+    remove50missing = FALSE, 
+    aft_iterations = 90,
+    equal_variances = TRUE
+)
+
+protein_level_summary = summarized[[1]]
+feature_level_summary = summarized[[2]]
+
+## Basic tests
+expect_equal(nrow(protein_level_summary), 5)
+expect_equal(ncol(protein_level_summary), 4)
+expect_true(all(protein_level_summary$Protein == "Q96S19"))
+expect_equal(as.numeric(protein_level_summary$RUN), 1:5)
+expect_equal(nrow(feature_level_summary), 10)
+expect_equal(ncol(feature_level_summary), 5)
+expect_equal(length(unique(feature_level_summary$FEATURE)), 2)
+expect_equal(length(unique(feature_level_summary$RUN)), 5)
+
+## Verify that variance is higher in runs 4-5 with high anomaly scores
+variance_runs_1_3 = protein_level_summary[RUN %in% 1:3, Variance]
+variance_runs_4_5 = protein_level_summary[RUN %in% 4:5, Variance]
+
+for (var_45 in variance_runs_4_5) {
+    for (var_13 in variance_runs_1_3) {
+        expect_true(var_45 > var_13, 
+                  info = paste("Variance in runs 4-5 should be > variance in runs 1-3:",
+                               var_45, ">", var_13))
+    }
+}
+
+mean_variance_1_3 = mean(variance_runs_1_3)
+mean_variance_4_5 = mean(variance_runs_4_5)
+
+expect_true(mean_variance_4_5 > 10 * mean_variance_1_3,
+          info = paste("Mean variance of runs 4-5 should be at least 10x higher:",
+                       mean_variance_4_5, "vs", 10 * mean_variance_1_3))
+
+## Verify that protein levels in runs 4-5 are closer to ASGLLLER than AFPLAEWQPSDVDQR
+protein_runs_4_5 = protein_level_summary[RUN %in% 4:5]
+afpla_runs_4_5 = feature_level_summary[FEATURE == "AFPLAEWQPSDVDQR_2_y3_1" & RUN %in% 4:5]
+asgll_runs_4_5 = feature_level_summary[FEATURE == "ASGLLLER_2_y3_1" & RUN %in% 4:5]
+for (i in 1:nrow(protein_runs_4_5)) {
+    run_num = protein_runs_4_5$RUN[i]
+    protein_logint = protein_runs_4_5$LogIntensities[i]
+
+    afpla_abundance = afpla_runs_4_5[RUN == run_num, newABUNDANCE]
+    asgll_abundance = asgll_runs_4_5[RUN == run_num, newABUNDANCE]
+
+    dist_to_afpla = abs(protein_logint - afpla_abundance)
+    dist_to_asgll = abs(protein_logint - asgll_abundance)
+
+    expect_true(dist_to_asgll < dist_to_afpla,
+              info = paste("Run", run_num, ": Protein LogIntensity", protein_logint,
+                           "should be closer to ASGLLLER", asgll_abundance,
+                           "(dist =", round(dist_to_asgll, 4), ")",
+                           "than to AFPLAEWQPSDVDQR", afpla_abundance,
+                           "(dist =", round(dist_to_afpla, 4), ")"))
+}
diff --git a/inst/tinytest/test_groupComparison.R b/inst/tinytest/test_groupComparison.R
@@ -17,4 +17,47 @@ testResultParallelComparison = groupComparison(contrast.matrix=comparison,
                                            numberOfCores = 2)
 
 expect_equal(nrow(testResultDefaultComparison$ComparisonResult),
-             nrow(testResultParallelComparison$ComparisonResult))
+             nrow(testResultParallelComparison$ComparisonResult))
+
+# Test groupComparison with weights
+protein_level_data_with_anomalies = data.table::data.table(
+    RUN = c( "1", "2", "3", "4", "5", "1", "2", "3", "4", "5" ),
+    Protein = c( "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q9UFW8", "Q9UFW8", "Q9UFW8", "Q9UFW8", "Q9UFW8" ),
+    LogIntensities = c(10.28, 10.39, 10.63, 11.26, 11.25, 10.03, 10.59, 11.00, 11.31, 11.57),
+    Variance = c( 0.0020, 0.0020, 0.15, 0.13, 0.0020, rep(0.00032, 5)),
+    originalRUN = c( "Run1", "Run2", "Run3", "Run4", "Run5", "Run1", "Run2", "Run3", "Run4", "Run5" ),
+    GROUP = c( "A", "A", "A", "B", "B", "A", "A", "A", "B", "B" ),
+    SUBJECT = c( 1, 2, 3, 4, 5, 1, 2, 3, 4, 5 ),
+    TotalGroupMeasurements = c( 9, 9, 9, 6, 6, 6, 6, 6, 4, 4 ),
+    NumMeasuredFeature = c( 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 ),
+    MissingPercentage = c( 0.333333333333333, 0.333333333333333, 0.333333333333333, 0.333333333333333, 0.333333333333333, 0, 0, 0, 0, 0 ),
+    more50missing = c( FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE ),
+    NumImputedFeature = c( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 )
+)
+protein_level_data_no_anomalies = protein_level_data_with_anomalies
+protein_level_data_no_anomalies$Variance = NULL
+QuantDataNoAnomaly = list(ProteinLevelData = protein_level_data_no_anomalies)
+QuantDataWithAnomaly = list(ProteinLevelData = protein_level_data_with_anomalies)
+model_no_anomaly = groupComparison("pairwise", QuantDataNoAnomaly,
+                        use_log_file = FALSE)
+model_with_anomaly = groupComparison("pairwise", QuantDataWithAnomaly,
+                         use_log_file = FALSE)
+
+no_anomaly_Q9UFW8 = model_no_anomaly$ComparisonResult[model_no_anomaly$ComparisonResult$Protein == "Q9UFW8", ]
+with_anomaly_Q9UFW8 = model_with_anomaly$ComparisonResult[model_with_anomaly$ComparisonResult$Protein == "Q9UFW8", ]
+
+expect_true(nrow(no_anomaly_Q9UFW8) > 0, info = "Q9UFW8 not found in model_no_anomaly")
+expect_true(nrow(with_anomaly_Q9UFW8) > 0, info = "Q9UFW8 not found in model_with_anomaly")
+expect_equal(no_anomaly_Q9UFW8$SE, with_anomaly_Q9UFW8$SE, 
+             info = "SE values for Q9UFW8 should be the same")
+expect_equal(no_anomaly_Q9UFW8$pvalue, with_anomaly_Q9UFW8$pvalue,
+             info = "pvalue values for Q9UFW8 should be the same")
+no_anomaly_Q96S19 = model_no_anomaly$ComparisonResult[model_no_anomaly$ComparisonResult$Protein == "Q96S19", ]
+with_anomaly_Q96S19 = model_with_anomaly$ComparisonResult[model_with_anomaly$ComparisonResult$Protein == "Q96S19", ]
+expect_true(nrow(no_anomaly_Q96S19) > 0, info = "Q96S19 not found in model_no_anomaly")
+expect_true(nrow(with_anomaly_Q96S19) > 0, info = "Q96S19 not found in model_with_anomaly")
+expect_true(with_anomaly_Q96S19$SE < no_anomaly_Q96S19$SE,
+            info = "SE for Q96S19 should be lower in model_with_anomaly")
+expect_true(with_anomaly_Q96S19$pvalue < no_anomaly_Q96S19$pvalue,
+            info = "pvalue for Q96S19 should be lower in model_with_anomaly")
+
diff --git a/inst/tinytest/test_utils_groupComparison_model.R b/inst/tinytest/test_utils_groupComparison_model.R
@@ -0,0 +1,18 @@
+# Test for .fitModelForGroupComparison function
+test_input_with_variance = data.frame(
+    ABUNDANCE = c(1.2, 1.5, 1.8, 2.1, 1.9, 2.3),
+    GROUP = factor(c("A", "A", "A", "B", "B", "B")),
+    SUBJECT = factor(c("S1", "S2", "S3", "S1", "S2", "S3")),
+    Variance = c(0.1, 0.15, 0.08, 0.12, 0.09, 0.11)
+)
+result = MSstats:::.fitModelForGroupComparison(
+    input = test_input_with_variance,
+    repeated = FALSE,
+    is_single_subject = FALSE,
+    has_tech_replicates = FALSE
+)
+expect_false(is.null(result$full_fit$weights),
+             "Fitted model should contain weights when Variance is provided")
+expected_weights = 1/test_input_with_variance$Variance
+expect_equal(result$full_fit$weights, expected_weights,
+             info = "Model weights should match 1/Variance")