Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions R/dataProcess.R
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,8 @@ dataProcess = function(
list(method = featureSubset, n_top = n_top_feature,
remove_uninformative = remove_uninformative_feature_outlier),
list(method = summaryMethod, equal_var = equalFeatureVar),
list(symbol = censoredInt, MB = MBimpute))
list(symbol = censoredInt, MB = MBimpute),
colnames(raw))

peptides_dict = makePeptidesDictionary(as.data.table(unclass(raw)), normalization)
input = MSstatsPrepareForDataProcess(raw, logTrans, fix_missing)
Expand Down Expand Up @@ -203,7 +204,7 @@ dataProcess = function(
#'
MSstatsSummarizeWithMultipleCores = function(input, method, impute, censored_symbol,
remove50missing, equal_variance, numberOfCores = 1,
aft_iterations) {
aft_iterations = 90) {
if (numberOfCores > 1) {
protein_indices = split(seq_len(nrow(input)), list(input$PROTEIN))
num_proteins = length(protein_indices)
Expand Down Expand Up @@ -285,7 +286,7 @@ MSstatsSummarizeWithMultipleCores = function(input, method, impute, censored_sym
#' head(summarized[[1]][[1]]) # run-level summary
#'
MSstatsSummarizeWithSingleCore = function(input, method, impute, censored_symbol,
remove50missing, equal_variance, aft_iterations) {
remove50missing, equal_variance, aft_iterations = 90) {


protein_indices = split(seq_len(nrow(input)), list(input$PROTEIN))
Expand Down Expand Up @@ -419,7 +420,7 @@ MSstatsSummarizeSingleLinear = function(single_protein,
impute,
censored_symbol,
remove50missing,
aft_iterations,
aft_iterations = 90,
equal_variances = TRUE) {
ABUNDANCE = RUN = FEATURE = PROTEIN = LogIntensities = NULL

Expand Down Expand Up @@ -523,7 +524,7 @@ MSstatsSummarizeSingleLinear = function(single_protein,
#' head(single_protein_summary[[1]])
#'
MSstatsSummarizeSingleTMP = function(single_protein, impute, censored_symbol,
remove50missing, aft_iterations) {
remove50missing, aft_iterations = 90) {
newABUNDANCE = n_obs = n_obs_run = RUN = FEATURE = LABEL = NULL
predicted = censored = NULL
cols = intersect(colnames(single_protein), c("newABUNDANCE", "cen", "RUN",
Expand Down
12 changes: 10 additions & 2 deletions R/utils_checks.R
Original file line number Diff line number Diff line change
Expand Up @@ -86,15 +86,23 @@ MSstatsPrepareForDataProcess = function(input, log_base, fix_missing) {
#' @param feature_selection list with elements: remove_uninformative
#' @param summarization list with elements: method.
#' @param imputation list with elements: cutoff, symbol.
#' @param input_columns character vector of input columns
#' @keywords internal
.checkDataProcessParams = function(log_base, normalization_method,
standards_names, feature_selection,
summarization, imputation) {
summarization, imputation, input_columns) {
checkmate::assertChoice(log_base, c(2, 10), .var.name = "logTrans")
checkmate::assertChoice(summarization$method, c("linear", "TMP"),
.var.name = "summaryMethod")
.var.name = "summaryMethod")
getOption("MSstatsLog")("INFO", paste("Summary method:",
summarization$method))
if ("AnomalyScores" %in% input_columns) {
if (summarization$method != "linear") {
stop("AnomalyScores column detected in your input columns. ",
"Please set summaryMethod to 'linear' to use anomaly scores for protein summarization, ",
"or remove the AnomalyScores column from your input data.")
}
}
checkmate::assertChoice(imputation$symbol, c("0", "NA"),
null.ok = TRUE, .var.name = "censoredInt")
getOption("MSstatsLog")("INFO", paste("censoredInt:", imputation$symbol))
Expand Down
185 changes: 184 additions & 1 deletion inst/tinytest/test_dataProcess.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,187 @@ msstats_input_fractions_techreps = data.table::fread(
QuantDataTechRepsFractions = dataProcess(msstats_input_fractions_techreps,
use_log_file = FALSE)
expect_true(!is.null(QuantDataTechRepsFractions))
expect_true(nrow(QuantDataTechRepsFractions$FeatureLevelData) > 0)
expect_true(nrow(QuantDataTechRepsFractions$FeatureLevelData) > 0)

# Test dataProcess with linear summary method and anomaly scores --------------
msstats_input = data.table::data.table(
ProteinName = c(rep("Q9UFW8", 10), rep("Q96S19", 15)),
PeptideSequence = c(rep("AEFEEQNVR", 5), rep("TALYVTPLDR", 5),
rep("AFPLAEWQPSDVDQR", 5), rep("ASGLLLER", 5),
rep("LowAbundancePeptide", 5)),
PrecursorCharge = rep(2, 25),
FragmentIon = rep("y3", 25),
ProductCharge = rep(1, 25),
IsotopeLabelType = rep("L", 25),
Condition = rep(c("A", "A", "A", "B", "B"), 5),
BioReplicate = rep(seq(1:5), 5),
Run = rep(paste0("Run", seq(1:5)), 5),
Intensity = c(1000, 1500, 2000, 2500, 3000,
1100, 1600, 2100, 2600, 3100,
1200, 1700, 2200, 2000, 1700,
1300, 1800, 1200, 2800, 1800,
100, 200, 300, 400, 500),
AnomalyScores = c(rep(0.03, 10), c(0.01,0.01,0.01,0.9,0.8), c(0.01,0.01,0.01,0.7,0.5), rep(0.10, 5))
)

# Must run with linear summarization
expect_error(
dataProcess(msstats_input, summaryMethod="TMP"),
pattern = "AnomalyScores column detected in your input columns.*set summaryMethod to 'linear'",
info = "Should error when AnomalyScores is present but summaryMethod is not 'linear'"
)

# Process data with linear summarization
QuantDataLinearAnomaly = dataProcess(msstats_input,
normalization=FALSE, # no normalization
MBimpute=TRUE, # Use imputation
summaryMethod="linear" # Key MSstats+ parameter
)

expect_true("Variance" %in% colnames(QuantDataLinearAnomaly$ProteinLevelData),
info = "Variance column should be present in ProteinLevelData")
protein_data = QuantDataLinearAnomaly$ProteinLevelData
expect_true(all(protein_data$Variance > 0, na.rm = TRUE),
info = "All variance values should be positive")
expect_true(all(is.finite(protein_data$Variance), na.rm = TRUE),
info = "All variance values should be finite")
variance_values = protein_data[protein_data$Protein == "Q9UFW8"]$Variance
expect_true(all(variance_values == variance_values[1]),
info = paste("Not all variance values are equal for Q9UFW8"))

# Create comparison dataset with low anomaly scores for contrast
msstats_input_low_anomaly = data.table::data.table(
ProteinName = c(rep("Q9UFW8", 10), rep("Q96S19", 15)),
PeptideSequence = c(rep("AEFEEQNVR", 5), rep("TALYVTPLDR", 5),
rep("AFPLAEWQPSDVDQR", 5), rep("ASGLLLER", 5),
rep("LowAbundancePeptide", 5)),
PrecursorCharge = rep(2, 25),
FragmentIon = rep("y3", 25),
ProductCharge = rep(1, 25),
IsotopeLabelType = rep("L", 25),
Condition = rep(c("A", "A", "A", "B", "B"), 5),
BioReplicate = rep(seq(1:5), 5),
Run = rep(paste0("Run", seq(1:5)), 5),
Intensity = c(1000, 1500, 2000, 2500, 3000,
1100, 1600, 2100, 2600, 3100,
1200, 1700, 2200, 2000, 1700,
1300, 1800, 1200, 2800, 1800,
100, 200, 300, 400, 500),
AnomalyScores = rep(0.01, 25) # All low anomaly scores
)

QuantDataLowAnomaly = dataProcess(msstats_input_low_anomaly,
normalization=FALSE,
MBimpute=TRUE,
summaryMethod="linear")
high_anomaly_variance = protein_data$Variance
low_anomaly_variance = QuantDataLowAnomaly$ProteinLevelData$Variance
expect_true(mean(high_anomaly_variance, na.rm = TRUE) > mean(low_anomaly_variance, na.rm = TRUE),
info = "Mean variance should be higher for dataset with high anomaly scores")


# Verify that without AnomalyScores, different behavior occurs
msstats_input_no_anomaly = msstats_input[, !c("AnomalyScores")]
QuantDataNoAnomaly = dataProcess(msstats_input_no_anomaly,
normalization=FALSE,
MBimpute=TRUE,
summaryMethod="linear")
no_anomaly_variance = QuantDataNoAnomaly$ProteinLevelData$Variance
expect_false(identical(high_anomaly_variance, no_anomaly_variance),
info = "Variance should be different when AnomalyScores are present vs absent")



# Test MSstatsSummarizeSingleLinear function ------------------------------

single_protein = data.table::data.table(
PROTEIN = c( "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19" ),
PEPTIDE = c( "AFPLAEWQPSDVDQR_2", "ASGLLLER_2", "LowAbundancePeptide_2", "AFPLAEWQPSDVDQR_2", "ASGLLLER_2", "LowAbundancePeptide_2", "AFPLAEWQPSDVDQR_2", "ASGLLLER_2", "LowAbundancePeptide_2", "AFPLAEWQPSDVDQR_2", "ASGLLLER_2", "LowAbundancePeptide_2", "AFPLAEWQPSDVDQR_2", "ASGLLLER_2", "LowAbundancePeptide_2" ),
TRANSITION = c( "y3_1", "y3_1", "y3_1", "y3_1", "y3_1", "y3_1", "y3_1", "y3_1", "y3_1", "y3_1", "y3_1", "y3_1", "y3_1", "y3_1", "y3_1" ),
FEATURE = c( "AFPLAEWQPSDVDQR_2_y3_1", "ASGLLLER_2_y3_1", "LowAbundancePeptide_2_y3_1", "AFPLAEWQPSDVDQR_2_y3_1", "ASGLLLER_2_y3_1", "LowAbundancePeptide_2_y3_1", "AFPLAEWQPSDVDQR_2_y3_1", "ASGLLLER_2_y3_1", "LowAbundancePeptide_2_y3_1", "AFPLAEWQPSDVDQR_2_y3_1", "ASGLLLER_2_y3_1", "LowAbundancePeptide_2_y3_1", "AFPLAEWQPSDVDQR_2_y3_1", "ASGLLLER_2_y3_1", "LowAbundancePeptide_2_y3_1" ),
LABEL = c( "L", "L", "L", "L", "L", "L", "L", "L", "L", "L", "L", "L", "L", "L", "L" ),
GROUP_ORIGINAL = c( "A", "A", "A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "B", "B" ),
SUBJECT_ORIGINAL = c( 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5 ),
RUN = c( "1", "1", "1", "2", "2", "2", "3", "3", "3", "4", "4", "4", "5", "5", "5" ),
GROUP = c( "A", "A", "A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "B", "B" ),
SUBJECT = c( "1", "1", "1", "2", "2", "2", "3", "3", "3", "4", "4", "4", "5", "5", "5" ),
FRACTION = c( 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ),
INTENSITY = c( 1200, 1300, 100, 1700, 1800, 200, 2200, 1200, 300, 2000, 2800, 400, 1700, 1800, 500 ),
ANOMALYSCORES = c( 0.01, 0.01, 0.1, 0.01, 0.01, 0.1, 0.01, 0.01, 0.1, 0.9, 0.7, 0.1, 0.8, 0.5, 0.1 ),
ABUNDANCE = c( 10.229, 10.344, 6.644, 10.731, 10.814, 7.644, 11.103, 10.229, 8.229, 10.966, 11.451, 8.644, 10.731, 10.814, 8.966 ),
originalRUN = c( "Run1", "Run1", "Run1", "Run2", "Run2", "Run2", "Run3", "Run3", "Run3", "Run4", "Run4", "Run4", "Run5", "Run5", "Run5" ),
censored = c( FALSE, FALSE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, TRUE ),
newABUNDANCE = c( 10.229, 10.344, NA, 10.731, 10.814, NA, 11.103, 10.229, NA, 10.966, 11.451, NA, 10.731, 10.814, NA ),
cen = c( 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0 ),
nonmissing = c( TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, FALSE ),
n_obs = c( 5, 5, 0, 5, 5, 0, 5, 5, 0, 5, 5, 0, 5, 5, 0 ),
n_obs_run = c( 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 ),
total_features = c( 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 ),
prop_features = c( 0.667, 0.667, 0.667, 0.667, 0.667, 0.667, 0.667, 0.667, 0.667, 0.667, 0.667, 0.667, 0.667, 0.667, 0.667 ),
nonmissing_all = c( TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, FALSE ),
ABUNDANCE_cut = c( 10.127, 10.127, NA, 10.127, 10.127, NA, 10.127, 10.127, NA, 10.127, 10.127, NA, 10.127, 10.127, NA ),
any_censored = c( FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE )
)

summarized = MSstats::MSstatsSummarizeSingleLinear(
single_protein,
impute = FALSE,
censored_symbol = "NA",
remove50missing = FALSE,
aft_iterations = 90,
equal_variances = TRUE
)

protein_level_summary = summarized[[1]]
feature_level_summary = summarized[[2]]

## Basic tests
expect_equal(nrow(protein_level_summary), 5)
expect_equal(ncol(protein_level_summary), 4)
expect_true(all(protein_level_summary$Protein == "Q96S19"))
expect_equal(as.numeric(protein_level_summary$RUN), 1:5)
expect_equal(nrow(feature_level_summary), 10)
expect_equal(ncol(feature_level_summary), 5)
expect_equal(length(unique(feature_level_summary$FEATURE)), 2)
expect_equal(length(unique(feature_level_summary$RUN)), 5)

## Verify that variance is higher in runs 4-5 with high anomaly scores
variance_runs_1_3 = protein_level_summary[RUN %in% 1:3, Variance]
variance_runs_4_5 = protein_level_summary[RUN %in% 4:5, Variance]

for (var_45 in variance_runs_4_5) {
for (var_13 in variance_runs_1_3) {
expect_true(var_45 > var_13,
info = paste("Variance in runs 4-5 should be > variance in runs 1-3:",
var_45, ">", var_13))
}
}

mean_variance_1_3 = mean(variance_runs_1_3)
mean_variance_4_5 = mean(variance_runs_4_5)

expect_true(mean_variance_4_5 > 10 * mean_variance_1_3,
info = paste("Mean variance of runs 4-5 should be at least 10x higher:",
mean_variance_4_5, "vs", 10 * mean_variance_1_3))

## Verify that protein levels in runs 4-5 are closer to ASGLLLER than AFPLAEWQPSDVDQR
protein_runs_4_5 = protein_level_summary[RUN %in% 4:5]
afpla_runs_4_5 = feature_level_summary[FEATURE == "AFPLAEWQPSDVDQR_2_y3_1" & RUN %in% 4:5]
asgll_runs_4_5 = feature_level_summary[FEATURE == "ASGLLLER_2_y3_1" & RUN %in% 4:5]
for (i in 1:nrow(protein_runs_4_5)) {
run_num = protein_runs_4_5$RUN[i]
protein_logint = protein_runs_4_5$LogIntensities[i]

afpla_abundance = afpla_runs_4_5[RUN == run_num, newABUNDANCE]
asgll_abundance = asgll_runs_4_5[RUN == run_num, newABUNDANCE]

dist_to_afpla = abs(protein_logint - afpla_abundance)
dist_to_asgll = abs(protein_logint - asgll_abundance)

expect_true(dist_to_asgll < dist_to_afpla,
info = paste("Run", run_num, ": Protein LogIntensity", protein_logint,
"should be closer to ASGLLLER", asgll_abundance,
"(dist =", round(dist_to_asgll, 4), ")",
"than to AFPLAEWQPSDVDQR", afpla_abundance,
"(dist =", round(dist_to_afpla, 4), ")"))
}
45 changes: 44 additions & 1 deletion inst/tinytest/test_groupComparison.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,47 @@ testResultParallelComparison = groupComparison(contrast.matrix=comparison,
numberOfCores = 2)

expect_equal(nrow(testResultDefaultComparison$ComparisonResult),
nrow(testResultParallelComparison$ComparisonResult))
nrow(testResultParallelComparison$ComparisonResult))

# Test groupComparison with weights
protein_level_data_with_anomalies = data.table::data.table(
RUN = c( "1", "2", "3", "4", "5", "1", "2", "3", "4", "5" ),
Protein = c( "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q96S19", "Q9UFW8", "Q9UFW8", "Q9UFW8", "Q9UFW8", "Q9UFW8" ),
LogIntensities = c(10.28, 10.39, 10.63, 11.26, 11.25, 10.03, 10.59, 11.00, 11.31, 11.57),
Variance = c( 0.0020, 0.0020, 0.15, 0.13, 0.0020, rep(0.00032, 5)),
originalRUN = c( "Run1", "Run2", "Run3", "Run4", "Run5", "Run1", "Run2", "Run3", "Run4", "Run5" ),
GROUP = c( "A", "A", "A", "B", "B", "A", "A", "A", "B", "B" ),
SUBJECT = c( 1, 2, 3, 4, 5, 1, 2, 3, 4, 5 ),
TotalGroupMeasurements = c( 9, 9, 9, 6, 6, 6, 6, 6, 4, 4 ),
NumMeasuredFeature = c( 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 ),
MissingPercentage = c( 0.333333333333333, 0.333333333333333, 0.333333333333333, 0.333333333333333, 0.333333333333333, 0, 0, 0, 0, 0 ),
more50missing = c( FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE ),
NumImputedFeature = c( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 )
)
protein_level_data_no_anomalies = protein_level_data_with_anomalies
protein_level_data_no_anomalies$Variance = NULL
QuantDataNoAnomaly = list(ProteinLevelData = protein_level_data_no_anomalies)
QuantDataWithAnomaly = list(ProteinLevelData = protein_level_data_with_anomalies)
model_no_anomaly = groupComparison("pairwise", QuantDataNoAnomaly,
use_log_file = FALSE)
model_with_anomaly = groupComparison("pairwise", QuantDataWithAnomaly,
use_log_file = FALSE)

no_anomaly_Q9UFW8 = model_no_anomaly$ComparisonResult[model_no_anomaly$ComparisonResult$Protein == "Q9UFW8", ]
with_anomaly_Q9UFW8 = model_with_anomaly$ComparisonResult[model_with_anomaly$ComparisonResult$Protein == "Q9UFW8", ]

expect_true(nrow(no_anomaly_Q9UFW8) > 0, info = "Q9UFW8 not found in model_no_anomaly")
expect_true(nrow(with_anomaly_Q9UFW8) > 0, info = "Q9UFW8 not found in model_with_anomaly")
expect_equal(no_anomaly_Q9UFW8$SE, with_anomaly_Q9UFW8$SE,
info = "SE values for Q9UFW8 should be the same")
expect_equal(no_anomaly_Q9UFW8$pvalue, with_anomaly_Q9UFW8$pvalue,
info = "pvalue values for Q9UFW8 should be the same")
no_anomaly_Q96S19 = model_no_anomaly$ComparisonResult[model_no_anomaly$ComparisonResult$Protein == "Q96S19", ]
with_anomaly_Q96S19 = model_with_anomaly$ComparisonResult[model_with_anomaly$ComparisonResult$Protein == "Q96S19", ]
expect_true(nrow(no_anomaly_Q96S19) > 0, info = "Q96S19 not found in model_no_anomaly")
expect_true(nrow(with_anomaly_Q96S19) > 0, info = "Q96S19 not found in model_with_anomaly")
expect_true(with_anomaly_Q96S19$SE < no_anomaly_Q96S19$SE,
info = "SE for Q96S19 should be lower in model_with_anomaly")
expect_true(with_anomaly_Q96S19$pvalue < no_anomaly_Q96S19$pvalue,
info = "pvalue for Q96S19 should be lower in model_with_anomaly")

18 changes: 18 additions & 0 deletions inst/tinytest/test_utils_groupComparison_model.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Test for .fitModelForGroupComparison function
test_input_with_variance = data.frame(
ABUNDANCE = c(1.2, 1.5, 1.8, 2.1, 1.9, 2.3),
GROUP = factor(c("A", "A", "A", "B", "B", "B")),
SUBJECT = factor(c("S1", "S2", "S3", "S1", "S2", "S3")),
Variance = c(0.1, 0.15, 0.08, 0.12, 0.09, 0.11)
)
result = MSstats:::.fitModelForGroupComparison(
input = test_input_with_variance,
repeated = FALSE,
is_single_subject = FALSE,
has_tech_replicates = FALSE
)
expect_false(is.null(result$full_fit$weights),
"Fitted model should contain weights when Variance is provided")
expected_weights = 1/test_input_with_variance$Variance
expect_equal(result$full_fit$weights, expected_weights,
info = "Model weights should match 1/Variance")
Loading