diff --git a/R/converters_SpectronauttoMSstatsFormat.R b/R/converters_SpectronauttoMSstatsFormat.R index 6a6add22..ff383d0c 100644 --- a/R/converters_SpectronauttoMSstatsFormat.R +++ b/R/converters_SpectronauttoMSstatsFormat.R @@ -8,7 +8,7 @@ #' @param qvalue_cutoff Cutoff for EG.Qvalue. default is 0.01. #' @param calculateAnomalyScores Default is FALSE. If TRUE, will run anomaly detection model and calculate anomaly scores for each feature. Used downstream to weigh measurements in differential analysis. #' @param anomalyModelFeatures character vector of quality metric column names to be used as features in the anomaly detection model. List must not be empty if calculateAnomalyScores=TRUE. -#' @param anomalyModelFeatureTemporal character vector of temporal direction corresponding to columns passed to anomalyModelFeatures. Values must be one of: `mean_decrease`, `mean_incrase`, `dispersion_increase`, or NULL (to perform no temporal feature engineering). Default is empty vector. If calculateAnomalyScores=TRUE, vector must have as many values as anomalyModelFeatures (even if all NULL). +#' @param anomalyModelFeatureTemporal character vector of temporal direction corresponding to columns passed to anomalyModelFeatures. Values must be one of: `mean_decrease`, `mean_increase`, `dispersion_increase`, or NULL (to perform no temporal feature engineering). Default is empty vector. If calculateAnomalyScores=TRUE, vector must have as many values as anomalyModelFeatures (even if all NULL). #' @param removeMissingFeatures Remove features with missing values in more than this fraction of runs. Default is 0.5. Only used if calculateAnomalyScores=TRUE. #' @param anomalyModelFeatureCount Feature selection for anomaly model. Anomaly detection works on the precursor-level and can be much slower if all features used. We will by default filter to the top-100 highest intensity features. This can be adjusted as necessary. To turn feature-selection off, set this value to a high number (e.g. 10000). Only used if calculateAnomalyScores=TRUE. #' @param runOrder Temporal order of MS runs. Should be a two column data.table with columns `Run` and `Order`, where `Run` matches the run name output by Spectronaut and `Order` is an integer. Used to engineer the temporal features defined in anomalyModelFeatureTemporal. @@ -45,6 +45,34 @@ SpectronauttoMSstatsFormat = function( use_log_file = TRUE, append = FALSE, verbose = TRUE, log_file_path = NULL, ... ) { + validation_config = list( + input = input, + annotation = annotation, + intensity = intensity, + excludedFromQuantificationFilter = excludedFromQuantificationFilter, + filter_with_Qvalue = filter_with_Qvalue, + qvalue_cutoff = qvalue_cutoff, + useUniquePeptide = useUniquePeptide, + removeFewMeasurements = removeFewMeasurements, + removeProtein_with1Feature = removeProtein_with1Feature, + summaryforMultipleRows = summaryforMultipleRows, + calculateAnomalyScores = calculateAnomalyScores, + anomalyModelFeatures = anomalyModelFeatures, + anomalyModelFeatureTemporal = anomalyModelFeatureTemporal, + removeMissingFeatures = removeMissingFeatures, + anomalyModelFeatureCount = anomalyModelFeatureCount, + runOrder = runOrder, + n_trees = n_trees, + max_depth = max_depth, + numberOfCores = numberOfCores, + use_log_file = use_log_file, + append = append, + verbose = verbose, + log_file_path = log_file_path + ) + + .validateMSstatsConverterParameters(validation_config) + MSstatsConvert::MSstatsLogsSettings(use_log_file, append, verbose, log_file_path) diff --git a/R/utils_MSstatsConvert.R b/R/utils_MSstatsConvert.R index a3e338df..da2bd1f5 100644 --- a/R/utils_MSstatsConvert.R +++ b/R/utils_MSstatsConvert.R @@ -11,7 +11,6 @@ #' \code{\link{MSstatsBalancedDesign}} for handling fractions and creating balanced data. #' #' @import data.table -#' @docType _PACKAGE #' @name MSstatsConvert -#' -NULL +#' @keywords internal +"_PACKAGE" diff --git a/R/utils_balanced_design.R b/R/utils_balanced_design.R index 81fe1ae8..89f8b971 100644 --- a/R/utils_balanced_design.R +++ b/R/utils_balanced_design.R @@ -1,10 +1,12 @@ #' Fill missing rows to create balanced design #' @param input output of `MSstatsPreprocess` #' @param fill_missing if TRUE, missing Intensities values will be added to data +#' @param anomaly_metrics character vector of quality metric column names to be +#' used as features in an anomaly detection model. #' and marked as NA #' @return data.table #' @keywords internal -.makeBalancedDesign = function(input, fill_missing, anomaly_metrics) { +.makeBalancedDesign = function(input, fill_missing, anomaly_metrics = c()) { feature = NULL is_tmt = is.element("Channel", colnames(input)) diff --git a/R/utils_checks.R b/R/utils_checks.R index c8879c95..7d1458e3 100644 --- a/R/utils_checks.R +++ b/R/utils_checks.R @@ -46,3 +46,208 @@ chosen } } + + +#' Generic parameter validation for all MSstats converters using configuration object +#' @param config A list containing all converter parameters. See details for required structure. +#' @details +#' The config list should contain the input and optionally other parameters: +#' - input: input data (required) +#' - annotation: annotation data (optional) +#' - intensity: intensity type (optional) +#' - filter_with_Qvalue: Q-value filter setting (default: FALSE) +#' - qvalue_cutoff: Q-value cutoff (default: 0.01) +#' - useUniquePeptide: unique peptide setting (default: TRUE) +#' - removeFewMeasurements: remove few measurements setting (default: TRUE) +#' - removeProtein_with1Feature: remove single feature proteins setting (default: FALSE) +#' - summaryforMultipleRows: aggregation function (default: max) +#' - calculateAnomalyScores: anomaly detection setting (default: FALSE) +#' - anomalyModelFeatures: anomaly model features (default: c()) +#' - anomalyModelFeatureTemporal: temporal features (default: c()) +#' - removeMissingFeatures: missing feature threshold (default: 0.5) +#' - anomalyModelFeatureCount: feature count for anomaly model (default: 100) +#' - runOrder: run order data (default: NULL) +#' - n_trees: number of trees (default: 100) +#' - max_depth: max tree depth (default: "auto") +#' - numberOfCores: number of cores (default: 1) +#' - use_log_file: logging setting (default: TRUE) +#' - append: append setting (default: FALSE) +#' - verbose: verbose setting (default: TRUE) +#' - log_file_path: log file path (default: NULL) +#' - excludedFromQuantificationFilter: filter setting (default: NULL) +#' @return NULL (throws error if validation fails) +#' @keywords internal +.validateMSstatsConverterParameters = function(config) { + + # Ensure config is a list + if (!is.list(config)) { + stop("Config must be a list") + } + + # Define all defaults in one place + defaults = list( + input = NULL, + annotation = NULL, + intensity = NULL, + excludedFromQuantificationFilter = NULL, + filter_with_Qvalue = FALSE, + qvalue_cutoff = 0.01, + useUniquePeptide = TRUE, + removeFewMeasurements = TRUE, + removeProtein_with1Feature = FALSE, + summaryforMultipleRows = max, + calculateAnomalyScores = FALSE, + anomalyModelFeatures = c(), + anomalyModelFeatureTemporal = c(), + removeMissingFeatures = 0.5, + anomalyModelFeatureCount = 100, + runOrder = NULL, + n_trees = 100, + max_depth = "auto", + numberOfCores = 1, + use_log_file = TRUE, + append = FALSE, + verbose = TRUE, + log_file_path = NULL + ) + + # Merge config with defaults (config values override defaults) + config = modifyList(defaults, config) + + # Input data validation (fail immediately if data is invalid) + if (is.null(config$input)) { + stop("Input data cannot be NULL") + } + + if (is.character(config$input)) { + if (!file.exists(config$input)) { + stop("Input file does not exist: ", config$input) + } + # Quick file size check for very large files + file_size_mb = file.size(config$input) / (1024^2) + if (file_size_mb > 1000) { # Warn for files > 1GB + warning("Large input file detected (", round(file_size_mb, 1), " MB). ", + "Consider validating parameters on a subset first.") + } + } else if (is.data.frame(config$input) || data.table::is.data.table(config$input)) { + # Quick structural validation + if (nrow(config$input) == 0) { + stop("Input data is empty (0 rows)") + } + if (ncol(config$input) == 0) { + stop("Input data has no columns") + } + } else { + stop("Input must be a file path, data.frame, or data.table") + } + + # Annotation validation + if (!is.null(config$annotation)) { + if (is.character(config$annotation)) { + if (!file.exists(config$annotation)) { + stop("Annotation file does not exist: ", config$annotation) + } + } else if (!is.data.frame(config$annotation) && !data.table::is.data.table(config$annotation)) { + stop("Annotation must be NULL, a file path, data.frame, or data.table") + } + } + + # Intensity validation (if provided) + if (!is.null(config$intensity)) { + checkmate::assertString(config$intensity) + } + + # Q-value filtering parameters + checkmate::assertLogical(config$filter_with_Qvalue, len = 1) + checkmate::assertNumber(config$qvalue_cutoff, lower = 0, upper = 1) + + # Common processing parameters + checkmate::assertLogical(config$useUniquePeptide, len = 1) + checkmate::assertLogical(config$removeFewMeasurements, len = 1) + checkmate::assertLogical(config$removeProtein_with1Feature, len = 1) + + # Aggregation function validation + if (!is.null(config$summaryforMultipleRows)) { + checkmate::assertFunction(config$summaryforMultipleRows) + } + + # Core system parameters + checkmate::assertInt(config$numberOfCores, lower = 1) + checkmate::assertLogical(config$use_log_file, len = 1) + checkmate::assertLogical(config$append, len = 1) + checkmate::assertLogical(config$verbose, len = 1) + + # Converter-specific boolean parameters (if provided) + if (!is.null(config$excludedFromQuantificationFilter)) { + checkmate::assertLogical(config$excludedFromQuantificationFilter, len = 1) + } + + checkmate::assertLogical(config$calculateAnomalyScores, len = 1) + + # Anomaly detection parameter validation (converter-specific) + if (config$calculateAnomalyScores) { + # These validations only matter if anomaly detection is enabled + if (length(config$anomalyModelFeatures) == 0) { + stop("anomalyModelFeatures cannot be empty when calculateAnomalyScores=TRUE") + } + checkmate::assertCharacter(config$anomalyModelFeatures, min.len = 1) + + if (length(config$anomalyModelFeatureTemporal) > 0) { + if (length(config$anomalyModelFeatureTemporal) != length(config$anomalyModelFeatures)) { + stop("anomalyModelFeatureTemporal must have same length as anomalyModelFeatures or be empty") + } + valid_temporal = c("mean_decrease", "mean_increase", "dispersion_increase") + invalid_temporal = config$anomalyModelFeatureTemporal[ + !is.null(config$anomalyModelFeatureTemporal) & + !config$anomalyModelFeatureTemporal %in% valid_temporal + ] + if (length(invalid_temporal) > 0) { + stop("Invalid temporal directions: ", paste(invalid_temporal, collapse = ", "), + ". Must be one of: ", paste(valid_temporal, collapse = ", "), " or NULL") + } + } + + checkmate::assertInt(config$n_trees, lower = 1) + if (is.character(config$max_depth)) { + checkmate::assertChoice(config$max_depth, choices = "auto") + } else { + checkmate::assertInt(config$max_depth, lower = 1) + } + checkmate::assertInt(config$anomalyModelFeatureCount, lower = 1) + checkmate::assertNumber(config$removeMissingFeatures, lower = 0, upper = 1) + + if (!is.null(config$runOrder)) { + if (!is.data.frame(config$runOrder) && !data.table::is.data.table(config$runOrder)) { + stop("runOrder must be a data.frame or data.table") + } + required_cols = c("Run", "Order") + missing_cols = setdiff(required_cols, colnames(config$runOrder)) + if (length(missing_cols) > 0) { + stop("runOrder is missing required columns: ", paste(missing_cols, collapse = ", ")) + } + if (!is.numeric(config$runOrder$Order)) { + stop("runOrder$Order must be numeric") + } + } + } else { + # When anomaly detection is disabled, these parameters are ignored but warn user + if (length(config$anomalyModelFeatures) > 0) { + warning("anomalyModelFeatures provided but calculateAnomalyScores=FALSE, ignoring") + } + if (!is.null(config$runOrder)) { + warning("runOrder provided but calculateAnomalyScores=FALSE, ignoring") + } + } + + # Log file validation + if (!is.null(config$log_file_path)) { + checkmate::assertString(config$log_file_path) + log_dir = dirname(config$log_file_path) + if (!dir.exists(log_dir)) { + stop("Log file directory does not exist: ", log_dir) + } + if (!file.access(log_dir, mode = 2) == 0) { + stop("No write permission for log file directory: ", log_dir) + } + } +} \ No newline at end of file diff --git a/R/utils_classes.R b/R/utils_classes.R index 2458fd08..edd3ff57 100644 --- a/R/utils_classes.R +++ b/R/utils_classes.R @@ -5,10 +5,11 @@ setOldClass("MSstatsValidated", S4Class = "MSstatsValidated") #' Output format for further analysis by MSstats #' @param input data.table +#' @param anomaly_metrics character vector of quality metric column names to be used as features in an anomaly detection model #' @importFrom methods new #' @return object of class MSstatsValidated that inherits from data.frame #' @keywords internal -.MSstatsFormat = function(input, anomaly_metrics) { +.MSstatsFormat = function(input, anomaly_metrics = c()) { input = .selectMSstatsColumns(input, anomaly_metrics) new("MSstatsValidated", as.data.frame(input)) } diff --git a/R/utils_clean_features.R b/R/utils_clean_features.R index e98ffe27..8ba1045f 100644 --- a/R/utils_clean_features.R +++ b/R/utils_clean_features.R @@ -3,10 +3,12 @@ #' @param feature_columns character vector of names of columns that define features. #' @param cleaning_control named list of two or three elements. #' See the documentation for `MSstatsImport` for details. +#' @param anomaly_metrics character vector of quality metric column names to be +#' used as features in an anomaly detection model. #' @return `data.table` #' @keywords internal .cleanByFeature = function(input, feature_columns, - cleaning_control, anomaly_metrics) { + cleaning_control, anomaly_metrics = c()) { if (is.element("Channel", colnames(input))) { input = .filterFewMeasurements( input, 0, @@ -86,7 +88,9 @@ #' Summarize multiple measurements per feature in a single run #' @param input `data.table` pre-processed by one of the .cleanRaw* functions. #' @param aggregator function that will be used to aggregate duplicated values. -#' @param feature_columns chr, vector of names of columns that define features. +#' @param feature_columns chr, vector of names of columns that define features. +#' @param anomaly_metrics character vector of quality metric column names +#' to be used as features in an anomaly detection model. #' @return `data.table` #' @keywords internal .summarizeMultipleMeasurements = function(input, aggregator, diff --git a/inst/tinytest/test_cleanRaw.R b/inst/tinytest/test_cleanRaw.R index 94a21fc9..a94889f8 100644 --- a/inst/tinytest/test_cleanRaw.R +++ b/inst/tinytest/test_cleanRaw.R @@ -193,22 +193,31 @@ expect_true(nrow(sm_cleaned) > 0) expect_error(MSstatsConvert::MSstatsClean(spectromine_import_error)) # Spectronaut spectronaut_input = data.table::fread("./raw_data/Spectronaut/spectronaut_input.csv") -spectronaut_input2 = data.table::copy(spectronaut_input) -spectronaut_input2$F.ExcludedFromQuantification = ifelse( - spectronaut_input2$F.ExcludedFromQuantification, - "True", "False" -) spectronaut_import = MSstatsConvert::MSstatsImport(list(input = spectronaut_input), "MSstats", "Spectronaut") -spectronaut_import2 = MSstatsConvert::MSstatsImport(list(input = spectronaut_input2), - "MSstats", "Spectronaut") sn_cleaned = MSstatsConvert::MSstatsClean(spectronaut_import, - intensity = "PeakArea") -sn_cleaned2 = MSstatsConvert::MSstatsClean(spectronaut_import2, - intensity = "PeakArea") -expect_equal(ncol(sn_cleaned), 11) + intensity = "PeakArea", + calculateAnomalyScores = FALSE, + anomalyModelFeatures = c()) +expect_equal(ncol(sn_cleaned), 12) expect_true(nrow(sn_cleaned) > 0) -expect_equal(sn_cleaned, sn_cleaned2) + +# Test new peak quality columns +spectronaut_input_2 = spectronaut_input +spectronaut_input_2$`FG.ShapeQualityScore (MS2)` = 1 +spectronaut_input_2$`FG.ShapeQualityScore (MS1)` = 1 +spectronaut_input_2$`EG.ApexRT` = 1 +spectronaut_input_2$`F.PossibleInterference` = TRUE +spectronaut_import_2 = MSstatsConvert::MSstatsImport(list(input = spectronaut_input_2), + "MSstats", "Spectronaut") +sn_cleaned_2 = MSstatsConvert::MSstatsClean(spectronaut_import_2, + intensity = "PeakArea", + calculateAnomalyScores = TRUE, + anomalyModelFeatures = c("FGShapeQualityScore(MS2)", + "FGShapeQualityScore(MS1)", + "EGApexRT")) +expect_equal(ncol(sn_cleaned_2), 16) +expect_true(nrow(sn_cleaned_2) > 0) # Metamorpheus metamorpheus_table = data.table::fread("./raw_data/Metamorpheus/QuantifiedPeaks.tsv") diff --git a/inst/tinytest/test_converters_SpectronauttoMSstatsFormat.R b/inst/tinytest/test_converters_SpectronauttoMSstatsFormat.R index 6aee95d0..09107563 100644 --- a/inst/tinytest/test_converters_SpectronauttoMSstatsFormat.R +++ b/inst/tinytest/test_converters_SpectronauttoMSstatsFormat.R @@ -55,3 +55,321 @@ expect_error( "The following columns are missing from the input data: FGCharge" ) + +# Test SpectronauttoMSstatsFormat Quality Metrics --------------------------- +spectronaut_raw = system.file("tinytest/raw_data/Spectronaut/spectronaut_input.csv", + package = "MSstatsConvert") +spectronaut_raw = data.table::fread(spectronaut_raw) +spectronaut_raw$`FG.ShapeQualityScore (MS2)` = 1 +spectronaut_raw$`FG.ShapeQualityScore (MS1)` = 1 +spectronaut_raw$`EG.ApexRT` = 1 +spectronaut_raw$`F.PossibleInterference` = TRUE +temporal = spectronaut_raw[, .(R.Replicate, R.Condition), by = R.FileName] +temporal = unique(temporal) +data.table::setnames(temporal, "R.FileName", "Run") +temporal$Order = seq(1:nrow(temporal)) +temporal = temporal[, c("Run", "Order")] + +msstats_format = SpectronauttoMSstatsFormat( + spectronaut_raw, + calculateAnomalyScores = TRUE, + anomalyModelFeatures = c("FG.ShapeQualityScore (MS2)", + "FG.ShapeQualityScore (MS1)", + "EG.ApexRT"), + anomalyModelFeatureTemporal = c("mean_decrease", + "mean_increase", + "dispersion_increase"), + removeMissingFeatures = 0.5, + anomalyModelFeatureCount = 100, + runOrder = temporal, + n_trees = 100, + max_depth = "auto" +) + +expect_true("Run" %in% colnames(msstats_format)) +expect_true("ProteinName" %in% colnames(msstats_format)) +expect_true("PeptideSequence" %in% colnames(msstats_format)) +expect_true("PrecursorCharge" %in% colnames(msstats_format)) +expect_true("Intensity" %in% colnames(msstats_format)) +expect_true("FragmentIon" %in% colnames(msstats_format)) +expect_true("ProductCharge" %in% colnames(msstats_format)) +expect_true("IsotopeLabelType" %in% colnames(msstats_format)) +expect_true("Condition" %in% colnames(msstats_format)) +expect_true("BioReplicate" %in% colnames(msstats_format)) +expect_true("Fraction" %in% colnames(msstats_format)) +expect_true("AnomalyScores" %in% colnames(msstats_format)) +expect_true("FGShapeQualityScore(MS2)" %in% colnames(msstats_format)) +expect_true("FGShapeQualityScore(MS1)" %in% colnames(msstats_format)) +expect_true("EGApexRT" %in% colnames(msstats_format)) +expect_true("FGShapeQualityScore(MS2).mean_decrease" %in% colnames(msstats_format)) +expect_true("FGShapeQualityScore(MS1).mean_increase" %in% colnames(msstats_format)) +expect_true("EGApexRT.dispersion_increase" %in% colnames(msstats_format)) + + +# Spectronaut quality features error handling ------------------------------- + +# runOrder not provided +expect_error(SpectronauttoMSstatsFormat( + spectronaut_raw, + annotation = annotation, + calculateAnomalyScores = TRUE, + anomalyModelFeatures = c("FG.ShapeQualityScore (MS2)", + "FG.ShapeQualityScore (MS1)", + "EG.ApexRT"), + anomalyModelFeatureTemporal = c("mean_decrease", + "mean_increase", + "dispersion_increase"), + removeMissingFeatures = 0.5, + anomalyModelFeatureCount = 100, + runOrder = NULL, + n_trees = 100, + max_depth = "auto" +)) + +# runOrder is not a data frame +expect_error(SpectronauttoMSstatsFormat( + spectronaut_raw, + annotation = annotation, + calculateAnomalyScores = TRUE, + anomalyModelFeatures = c("FG.ShapeQualityScore (MS2)", + "FG.ShapeQualityScore (MS1)", + "EG.ApexRT"), + anomalyModelFeatureTemporal = c("mean_decrease", + "mean_increase", + "dispersion_increase"), + removeMissingFeatures = 0.5, + anomalyModelFeatureCount = 100, + runOrder = seq(1:nrow(annotation)), + n_trees = 100, + max_depth = "auto" +)) + +# n_trees is negative +expect_error(SpectronauttoMSstatsFormat( + spectronaut_raw, + annotation = annotation, + calculateAnomalyScores = TRUE, + anomalyModelFeatures = c("FG.ShapeQualityScore (MS2)", + "FG.ShapeQualityScore (MS1)", + "EG.ApexRT"), + anomalyModelFeatureTemporal = c("mean_decrease", + "mean_increase", + "dispersion_increase"), + removeMissingFeatures = 0.5, + anomalyModelFeatureCount = 100, + runOrder = temporal, + n_trees = -3, + max_depth = "auto" +)) + +# FAIL n_trees is a decimal +expect_error(SpectronauttoMSstatsFormat( + spectronaut_raw, + annotation = annotation, + calculateAnomalyScores = TRUE, + anomalyModelFeatures = c("FG.ShapeQualityScore (MS2)", + "FG.ShapeQualityScore (MS1)", + "EG.ApexRT"), + anomalyModelFeatureTemporal = c("mean_decrease", + "mean_increase", + "dispersion_increase"), + removeMissingFeatures = 0.5, + anomalyModelFeatureCount = 100, + runOrder = temporal, + n_trees = 100.5, + max_depth = "auto" +)) + +# n_trees is a string +expect_error(SpectronauttoMSstatsFormat( + spectronaut_raw, + annotation = annotation, + calculateAnomalyScores = TRUE, + anomalyModelFeatures = c("FG.ShapeQualityScore (MS2)", + "FG.ShapeQualityScore (MS1)", + "EG.ApexRT"), + anomalyModelFeatureTemporal = c("mean_decrease", + "mean_increase", + "dispersion_increase"), + removeMissingFeatures = 0.5, + anomalyModelFeatureCount = 100, + runOrder = temporal, + n_trees = "string", + max_depth = "auto" +)) + +# max_depth is a string +expect_error(SpectronauttoMSstatsFormat( + spectronaut_raw, + annotation = annotation, + calculateAnomalyScores = TRUE, + anomalyModelFeatures = c("FG.ShapeQualityScore (MS2)", + "FG.ShapeQualityScore (MS1)", + "EG.ApexRT"), + anomalyModelFeatureTemporal = c("mean_decrease", + "mean_increase", + "dispersion_increase"), + removeMissingFeatures = 0.5, + anomalyModelFeatureCount = 100, + runOrder = temporal, + n_trees = 100, + max_depth = "string" +)) + +# FAIL max_depth is a decimal +expect_error(SpectronauttoMSstatsFormat( + spectronaut_raw, + annotation = annotation, + calculateAnomalyScores = TRUE, + anomalyModelFeatures = c("FG.ShapeQualityScore (MS2)", + "FG.ShapeQualityScore (MS1)", + "EG.ApexRT"), + anomalyModelFeatureTemporal = c("mean_decrease", + "mean_increase", + "dispersion_increase"), + removeMissingFeatures = 0.5, + anomalyModelFeatureCount = 100, + runOrder = temporal, + n_trees = 100, + max_depth = 5.5 +)) + +# FAIL max_depth is a negative number +expect_error(SpectronauttoMSstatsFormat( + spectronaut_raw, + annotation = annotation, + calculateAnomalyScores = TRUE, + anomalyModelFeatures = c("FG.ShapeQualityScore (MS2)", + "FG.ShapeQualityScore (MS1)", + "EG.ApexRT"), + anomalyModelFeatureTemporal = c("mean_decrease", + "mean_increase", + "dispersion_increase"), + removeMissingFeatures = 0.5, + anomalyModelFeatureCount = 100, + runOrder = temporal, + n_trees = 100, + max_depth = -5 +)) + +# anomalyModelFeatures is empty +expect_error(SpectronauttoMSstatsFormat( + spectronaut_raw, + annotation = annotation, + calculateAnomalyScores = TRUE, + anomalyModelFeatures = c(), + anomalyModelFeatureTemporal = c("mean_decrease", + "mean_increase", + "dispersion_increase"), + removeMissingFeatures = 0.5, + anomalyModelFeatureCount = 100, + runOrder = temporal, + n_trees = 100, + max_depth = "auto" +)) + +# anomalyModelFeatureTemporal contains unrecognized string +expect_error(SpectronauttoMSstatsFormat( + spectronaut_raw, + annotation = annotation, + calculateAnomalyScores = TRUE, + anomalyModelFeatures =c("FG.ShapeQualityScore (MS2)", + "FG.ShapeQualityScore (MS1)", + "EG.ApexRT"), + anomalyModelFeatureTemporal = c("string", + "mean_increase", + "dispersion_increase"), + removeMissingFeatures = 0.5, + anomalyModelFeatureCount = 100, + runOrder = temporal, + n_trees = 100, + max_depth = "auto" +)) + +# anomalyModelFeatureTemporal is empty +expect_error(SpectronauttoMSstatsFormat( + spectronaut_raw, + annotation = annotation, + calculateAnomalyScores = TRUE, + anomalyModelFeatures =c("FG.ShapeQualityScore (MS2)", + "FG.ShapeQualityScore (MS1)", + "EG.ApexRT"), + anomalyModelFeatureTemporal = c(), + removeMissingFeatures = 0.5, + anomalyModelFeatureCount = 100, + runOrder = temporal, + n_trees = 100, + max_depth = "auto" +)) + +# FAIL removeMissingFeatures is a percentage above 1 +expect_error(SpectronauttoMSstatsFormat( + spectronaut_raw, + annotation = annotation, + calculateAnomalyScores = TRUE, + anomalyModelFeatures =c("FG.ShapeQualityScore (MS2)", + "FG.ShapeQualityScore (MS1)", + "EG.ApexRT"), + anomalyModelFeatureTemporal = c("mean_decrease", + "mean_increase", + "dispersion_increase"), + removeMissingFeatures = 1.5, + anomalyModelFeatureCount = 100, + runOrder = temporal, + n_trees = 100, + max_depth = "auto" +)) + +# FAIL anomalyModelFeatureCount is not an integer +expect_error(SpectronauttoMSstatsFormat( + spectronaut_raw, + annotation = annotation, + calculateAnomalyScores = TRUE, + anomalyModelFeatures =c("FG.ShapeQualityScore (MS2)", + "FG.ShapeQualityScore (MS1)", + "EG.ApexRT"), + anomalyModelFeatureTemporal = c("mean_decrease", + "mean_increase", + "dispersion_increase"), + removeMissingFeatures = 0.5, + anomalyModelFeatureCount = 100.5, + runOrder = temporal, + n_trees = 100, + max_depth = "auto" +)) + +# FAIL anomalyModelFeatureCount is a string +expect_error(SpectronauttoMSstatsFormat( + spectronaut_raw, + annotation = annotation, + calculateAnomalyScores = TRUE, + anomalyModelFeatures =c("FG.ShapeQualityScore (MS2)", + "FG.ShapeQualityScore (MS1)", + "EG.ApexRT"), + anomalyModelFeatureTemporal = c("mean_decrease", + "mean_increase", + "dispersion_increase"), + removeMissingFeatures = 0.5, + anomalyModelFeatureCount = "string", + runOrder = temporal, + n_trees = 100, + max_depth = "auto" +)) + +# anomalyModelFeatureCount is a negative number +expect_error(SpectronauttoMSstatsFormat( + spectronaut_raw, + annotation = annotation, + calculateAnomalyScores = TRUE, + anomalyModelFeatures =c("FG.ShapeQualityScore (MS2)", + "FG.ShapeQualityScore (MS1)", + "EG.ApexRT"), + anomalyModelFeatureTemporal = c("mean_decrease", + "mean_increase", + "dispersion_increase"), + removeMissingFeatures = 0.5, + anomalyModelFeatureCount = -5, + runOrder = temporal, + n_trees = 100, + max_depth = "auto" +)) diff --git a/inst/tinytest/test_utils_anomaly_score.R b/inst/tinytest/test_utils_anomaly_score.R new file mode 100644 index 00000000..80dd0a91 --- /dev/null +++ b/inst/tinytest/test_utils_anomaly_score.R @@ -0,0 +1,369 @@ +# Comprehensive Unit Tests for Anomaly Detection Function using tinytest + +# Helper function to create test data with quality metrics +run_quality_metrics = function(df, mean_increase, mean_decrease, dispersion_increase) { + df$QualityMetric.mean_increase = mean_increase + df$QualityMetric.mean_decrease = mean_decrease + df$QualityMetric.dispersion_increase = dispersion_increase + anomaly_output = MSstatsConvert:::.runAnomalyModel(df, + n_trees=100, + max_depth="auto", + cores=1, + split_column="PSM", + quality_metrics=c("QualityMetric.mean_increase", + "QualityMetric.mean_decrease", + "QualityMetric.dispersion_increase")) + return(anomaly_output) +} + +# Create base dataframe for testing +create_base_df = function(n_rows) { + data.table::data.table( + PSM = rep("PSM1", n_rows) + ) +} + +# Test 1: Property-Based Testing - Monotonicity with Cumulative Sums +base_df_10 = create_base_df(10) + +# Baseline data with low values +baseline_scores = run_quality_metrics( + base_df_10, + rep(0.1, 10), # mean_increase + rep(0.1, 10), # mean_decrease + rep(0.1, 10) # dispersion_increase +) + +# Data with progressively higher cumulative sums +high_scores = run_quality_metrics( + base_df_10, + c(rep(0.1, 5), seq(2.0, 5.0, length.out = 5)), # mean_increase + c(rep(0.1, 5), seq(2.0, 5.0, length.out = 5)), # mean_decrease + c(rep(0.1, 5), seq(2.0, 5.0, length.out = 5)) # dispersion_increase +) + +# The last 5 rows (with high values) should have higher mean anomaly scores +expect_true(mean(high_scores$AnomalyScores[6:10]) > mean(high_scores$AnomalyScores[1:5]), + info = "Higher cumulative sum values should produce higher anomaly scores") + +# Test 2: Extreme Value Testing - Obvious Outliers +base_df_20 = create_base_df(20) + +extreme_scores = run_quality_metrics( + base_df_20, + c(rep(0.1, 19), 10.0), # Last value is extreme + c(rep(0.1, 19), 8.0), # Last value is extreme + c(rep(0.1, 19), 12.0) # Last value is extreme +) + +# The extreme outlier (last row) should have the highest anomaly score +expect_true(extreme_scores$AnomalyScores[20] == max(extreme_scores$AnomalyScores), + info = "Extreme outlier should have highest anomaly score") + +# The outlier should score significantly higher than the median +expect_true(extreme_scores$AnomalyScores[20] > median(extreme_scores$AnomalyScores[1:19]) * 2, + info = "Outlier should score significantly higher than median") + +# Test 3: Consistency/Reproducibility Testing +base_df_20_orig = create_base_df(20) + +# Run multiple times with original test data +scores1 = run_quality_metrics( + base_df_20_orig, + c(0.00, 0.00, 0.26, 0.00, 0.00, + 0.00, 0.50, 0.00, 0.00, 0.00, + 0.00, 0.00, 0.00, 0.00, 0.55, + 0.87, 1.11, 1.42, 1.71, 2.94), + c(0.00, 0.00, 0.00, 0.29, 0.50, + 0.56, 0.00, 1.16, 0.99, 0.15, + 0.00, 0.00, 1.27, 1.84, 0.29, + 0.00, 0.00, 0.00, 0.00, 0.00), + c(0.00, 0.00, 0.00, 0.00, 0.00, + 0.00, 0.01, 0.85, 0.00, 0.00, + 0.00, 0.00, 0.96, 1.07, 1.15, + 0.89, 0.50, 0.22, 0.00, 0.91) +) + +scores2 = run_quality_metrics( + base_df_20_orig, + c(0.00, 0.00, 0.26, 0.00, 0.00, + 0.00, 0.50, 0.00, 0.00, 0.00, + 0.00, 0.00, 0.00, 0.00, 0.55, + 0.87, 1.11, 1.42, 1.71, 2.94), + c(0.00, 0.00, 0.00, 0.29, 0.50, + 0.56, 0.00, 1.16, 0.99, 0.15, + 0.00, 0.00, 1.27, 1.84, 0.29, + 0.00, 0.00, 0.00, 0.00, 0.00), + c(0.00, 0.00, 0.00, 0.00, 0.00, + 0.00, 0.01, 0.85, 0.00, 0.00, + 0.00, 0.00, 0.96, 1.07, 1.15, + 0.89, 0.50, 0.22, 0.00, 0.91) +) + +# Correlation between runs should be high (>0.7) even if not identical +correlation = cor(scores1$AnomalyScores, scores2$AnomalyScores) +expect_true(correlation > 0.95, + info = "Multiple runs should produce correlated results") + +# Top anomalies should be largely consistent +top5_run1 = order(scores1$AnomalyScores, decreasing = TRUE)[1:5] +top5_run2 = order(scores2$AnomalyScores, decreasing = TRUE)[1:5] +overlap = length(intersect(top5_run1, top5_run2)) +expect_true(overlap >= 3, + info = "At least 3 of top 5 anomalies should overlap between runs") + +# Test 4: Expected Anomaly Identification from Original Data +original_anomaly_scores = run_quality_metrics( + base_df_20_orig, + c(0.00, 0.00, 0.26, 0.00, 0.00, + 0.00, 0.50, 0.00, 0.00, 0.00, + 0.00, 0.00, 0.00, 0.00, 0.55, + 0.87, 1.11, 1.42, 1.71, 2.94), + c(0.00, 0.00, 0.00, 0.29, 0.50, + 0.56, 0.00, 1.16, 0.99, 0.15, + 0.00, 0.00, 1.27, 1.84, 0.29, + 0.00, 0.00, 0.00, 0.00, 0.00), + c(0.00, 0.00, 0.00, 0.00, 0.00, + 0.00, 0.01, 0.85, 0.00, 0.00, + 0.00, 0.00, 0.96, 1.07, 1.15, + 0.89, 0.50, 0.22, 0.00, 0.91) +) + +# Row 20 has highest mean_increase (2.94) - should be highly anomalous +top_anomalies = order(original_anomaly_scores$AnomalyScores, decreasing = TRUE)[1:5] +expect_true(20 %in% top_anomalies, + info = "Row 20 with highest mean_increase should be in top anomalies") + +# Create test data vectors for comparison +mean_increase_vals = c(0.00, 0.00, 0.26, 0.00, 0.00, + 0.00, 0.50, 0.00, 0.00, 0.00, + 0.00, 0.00, 0.00, 0.00, 0.55, + 0.87, 1.11, 1.42, 1.71, 2.94) +mean_decrease_vals = c(0.00, 0.00, 0.00, 0.29, 0.50, + 0.56, 0.00, 1.16, 0.99, 0.15, + 0.00, 0.00, 1.27, 1.84, 0.29, + 0.00, 0.00, 0.00, 0.00, 0.00) +dispersion_vals = c(0.00, 0.00, 0.00, 0.00, 0.00, + 0.00, 0.01, 0.85, 0.00, 0.00, + 0.00, 0.00, 0.96, 1.07, 1.15, + 0.89, 0.50, 0.22, 0.00, 0.91) + +# Rows with all zeros should score lower than rows with high values +zero_rows = which(mean_increase_vals == 0 & mean_decrease_vals == 0 & dispersion_vals == 0) +high_value_rows = c(14, 16, 17, 18, 19, 20) # Rows with notably high values + +expect_true(mean(original_anomaly_scores$AnomalyScores[high_value_rows]) > + mean(original_anomaly_scores$AnomalyScores[zero_rows]), + info = "High value rows should score higher than zero rows") + +# Test 5: Edge Cases and Boundary Conditions +base_df_10_edge = create_base_df(10) + +# Test with all identical values +identical_scores = run_quality_metrics( + base_df_10_edge, + rep(1.0, 10), # All identical + rep(1.0, 10), # All identical + rep(1.0, 10) # All identical +) + +expect_true(is.data.frame(identical_scores) && nrow(identical_scores) == 10, + info = "Function should handle identical values gracefully") + +# Test with minimum viable dataset +base_df_3 = create_base_df(3) +minimal_scores = run_quality_metrics( + base_df_3, + c(0.1, 0.2, 5.0), # Clear outlier + c(0.1, 0.2, 0.1), + c(0.1, 0.1, 0.1) +) + +# The outlier (row 3) should have highest score +expect_equal(which.max(minimal_scores$AnomalyScores), 3, + info = "Clear outlier should have highest score in minimal dataset") + +# Test 6: Scale Invariance Testing +base_df_5 = create_base_df(5) + +# Original data +original_scale_scores = run_quality_metrics( + base_df_5, + c(0.1, 0.2, 0.3, 1.0, 2.0), + c(0.1, 0.1, 0.5, 0.2, 0.1), + c(0.1, 0.3, 0.1, 0.8, 0.2) +) + +# Scaled data (multiply by 10) +scaled_scale_scores = run_quality_metrics( + base_df_5, + c(0.1, 0.2, 0.3, 1.0, 2.0) * 10, + c(0.1, 0.1, 0.5, 0.2, 0.1) * 10, + c(0.1, 0.3, 0.1, 0.8, 0.2) * 10 +) + +# Relative ranking should be preserved under scaling +original_ranking = order(original_scale_scores$AnomalyScores, decreasing = TRUE) +scaled_ranking = order(scaled_scale_scores$AnomalyScores, decreasing = TRUE) + +# Rankings should be identical or very similar +expect_true(cor(original_ranking, scaled_ranking, method = "spearman") > 0.95, + info = "Relative rankings should be preserved under scaling") + +# Test 7: Regression Testing with Original Data +regression_scores = run_quality_metrics( + base_df_20_orig, + c(0.00, 0.00, 0.26, 0.00, 0.00, + 0.00, 0.50, 0.00, 0.00, 0.00, + 0.00, 0.00, 0.00, 0.00, 0.55, + 0.87, 1.11, 1.42, 1.71, 2.94), + c(0.00, 0.00, 0.00, 0.29, 0.50, + 0.56, 0.00, 1.16, 0.99, 0.15, + 0.00, 0.00, 1.27, 1.84, 0.29, + 0.00, 0.00, 0.00, 0.00, 0.00), + c(0.00, 0.00, 0.00, 0.00, 0.00, + 0.00, 0.01, 0.85, 0.00, 0.00, + 0.00, 0.00, 0.96, 1.07, 1.15, + 0.89, 0.50, 0.22, 0.00, 0.91) +) + +# Basic sanity checks +expect_equal(nrow(regression_scores), 20, + info = "Output should have same number of rows as input") +expect_true(all(regression_scores$AnomalyScores >= 0), + info = "All anomaly scores should be non-negative") +expect_true(all(is.finite(regression_scores$AnomalyScores)), + info = "All anomaly scores should be finite") + +# Row 20 (highest mean_increase = 2.94) should be highly anomalous +expect_true(regression_scores$AnomalyScores[20] > quantile(regression_scores$AnomalyScores, 0.8), + info = "Row with highest mean_increase should be highly anomalous") + +# Store results for regression testing (compare against future runs) +cat("Top 5 anomalous rows:", order(regression_scores$AnomalyScores, decreasing = TRUE)[1:5], "\n") +cat("Score range:", range(regression_scores$AnomalyScores), "\n") + +# Test 8: Zero/Null Data Handling +base_df_10_zero = create_base_df(10) + +# All zeros data +zero_scores = run_quality_metrics( + base_df_10_zero, + rep(0, 10), # All zeros + rep(0, 10), # All zeros + rep(0, 10) # All zeros +) + +expect_true(is.data.frame(zero_scores) && nrow(zero_scores) == 10, + info = "Function should handle all-zero data gracefully") + +# All scores should be similar/low for identical data +expect_true(sd(zero_scores$AnomalyScores) < 0.1, + info = "All-zero data should have low variance in scores") + +# Test 9: Relative Ranking Preservation +base_df_6_rank = create_base_df(6) + +# Create data with obvious ranking: Row 6 > Row 5 > Row 4 > Rows 1,2,3 +ranking_scores = run_quality_metrics( + base_df_6_rank, + c(0.1, 0.1, 0.1, 1.0, 2.0, 5.0), + c(0.1, 0.1, 0.1, 1.0, 2.0, 5.0), + c(0.1, 0.1, 0.1, 1.0, 2.0, 5.0) +) + +# Row 5 should have highest score, Row 4 second highest, etc. +expect_true(ranking_scores$AnomalyScores[6] > ranking_scores$AnomalyScores[5], + info = "Row 6 should score higher than Row 5") +expect_true(ranking_scores$AnomalyScores[5] > ranking_scores$AnomalyScores[4], + info = "Row 5 should score higher than Row 4") +expect_true(ranking_scores$AnomalyScores[4] > max(ranking_scores$AnomalyScores[1:3]), + info = "Row 4 should score higher than Rows 1-3") + +# Test 10: Original Quality Metrics Calculation Test (from the beginning of the file) +# Test add_increase, add_decrease, add_dispersion +quality_vector = c( + 1.31, 0.05, 0.76, -0.79, -0.71, + -0.56, 1.00, -1.66, -0.33, 0.34, + 0.00, -0.40, -1.77, -1.07, 1.05, + 0.82, 0.74, 0.81, 0.79, 1.73 +) + +mean_increase = MSstatsConvert:::.add_mean_increase(quality_vector) +mean_decrease = MSstatsConvert:::.add_mean_decrease(quality_vector) +dispersion_increase = MSstatsConvert:::.add_dispersion_increase(quality_vector) + +expect_equal( + mean_increase, + c(0.00, 0.00, 0.26, 0.00, 0.00, + 0.00, 0.50, 0.00, 0.00, 0.00, + 0.00, 0.00, 0.00, 0.00, 0.55, + 0.87, 1.11, 1.42, 1.71, 2.94), + info = "Mean increase calculation should match expected values" +) + +expect_equal( + mean_decrease, + c(0.00, 0.00, 0.00, 0.29, 0.50, + 0.56, 0.00, 1.16, 0.99, 0.15, + 0.00, 0.00, 1.27, 1.84, 0.29, + 0.00, 0.00, 0.00, 0.00, 0.00), + info = "Mean decrease calculation should match expected values" +) + +expect_equal( + dispersion_increase, + c(0.00, 0.00, 0.00, 0.00, 0.00, + 0.00, 0.01, 0.85, 0.00, 0.00, + 0.00, 0.00, 0.96, 1.07, 1.15, + 0.89, 0.50, 0.22, 0.00, 0.91), + tolerance = 1e-2, + info = "Dispersion increase calculation should match expected values" +) + +cat("All anomaly detection tests completed successfully!\n") + +# Test n_feat and missing_run_count parameters in .prepareSpectronautAnomalyInput +spectronaut_raw = data.table::data.table( + ProteinName = c(rep("Q9UFW8", 10), rep("Q96S19", 15)), + PeptideSequence = c(rep("AEFEEQNVR", 5), rep("TALYVTPLDR", 5), + rep("AFPLAEWQPSDVDQR", 5), rep("ASGLLLER", 5), + rep("LowAbundancePeptide", 5)), + PrecursorCharge = rep(2, 25), + FragmentIon = rep("y3", 25), + ProductCharge = rep(1, 25), + IsotopeLabelType = rep("L", 25), + Condition = rep(c("A", "A", "A", "B", "B"), 5), + BioReplicate = rep(seq(1:5), 5), + Run = rep(paste0("Run", seq(1:5)), 5), + Intensity = c(1000, 1500, 2000, 2500, 3000, + 1100, 1600, 2100, 2600, 3100, + 1200, NA, 2200, NA, NA, + 1300, 1800, NA, 2800, NA, + 100, 200, 300, 400, 500), + QualityMetric1 = rep(0.5, 25) +) + +temporal = spectronaut_raw[, .(Condition = unique(Condition), + Run = unique(Run)), + by = BioReplicate] +temporal$Order = seq(1:nrow(temporal)) +temporal = temporal[, c("Run", "Order")] + +missing_run_excluded = MSstatsConvert:::.prepareSpectronautAnomalyInput( + spectronaut_raw, + quality_metrics = c("QualityMetric1"), + run_order = temporal, + n_feat = 2, + missing_run_count = 0.5) +expect_false("AFPLAEWQPSDVDQR" %in% missing_run_excluded$PeptideSequence) +expect_true("LowAbundancePeptide" %in% missing_run_excluded$PeptideSequence) + +low_abundance_excluded = MSstatsConvert:::.prepareSpectronautAnomalyInput( + spectronaut_raw, + quality_metrics = c("QualityMetric1"), + run_order = temporal, + n_feat = 2, + missing_run_count = 0.95) +expect_true("AFPLAEWQPSDVDQR" %in% low_abundance_excluded$PeptideSequence) +expect_false("LowAbundancePeptide" %in% low_abundance_excluded$PeptideSequence) diff --git a/inst/tinytest/test_utils_data_health.R b/inst/tinytest/test_utils_data_health.R new file mode 100644 index 00000000..4feb54f4 --- /dev/null +++ b/inst/tinytest/test_utils_data_health.R @@ -0,0 +1,235 @@ +# Test data setup +create_test_data = function() { + data.table::data.table( + ProteinName = c(rep("Q9UFW8", 10), rep("Q96S19", 15)), + PeptideSequence = c(rep("AEFEEQNVR", 5), rep("TALYVTPLDR", 5), + rep("AFPLAEWQPSDVDQR", 5), rep("ASGLLLER", 5), + rep("LowAbundancePeptide", 5)), + PrecursorCharge = rep(2, 25), + FragmentIon = rep("y3", 25), + ProductCharge = rep(1, 25), + IsotopeLabelType = rep("L", 25), + Condition = rep(c("A", "A", "A", "B", "B"), 5), + BioReplicate = rep(seq(1:5), 5), + Run = rep(paste0("Run", seq(1:5)), 5), + Intensity = c(1000, 1500, 2000, 2500, 3000, + 1100, 1600, 2100, 2600, 3100, + 1200, 1700, 2200, 2000, 1700, + 1300, 1800, 1200, 2800, 1800, + 100, 200, 300, 400, 500), + AnomalyScores = rep(c(0.01,0.01,0.01,0.7,0.5), 5) + ) +} + +# Test .checkMissing function +test_data = create_test_data() + +# Test with no missing values +result = MSstatsConvert:::.checkMissing(test_data) +expect_equal(result, 0, info = ".checkMissing returns 0 for no missing values") +expect_true(is.numeric(result), info = ".checkMissing returns numeric value") + +# Test with some missing values +test_data_missing = test_data +test_data_missing$Intensity[1:5] = NA +result_missing = MSstatsConvert:::.checkMissing(test_data_missing) +expect_equal(result_missing, 5/25, info = ".checkMissing correctly calculates missing percentage") + +# Test with all missing values +test_data_all_missing = test_data +test_data_all_missing$Intensity = NA +result_all_missing = MSstatsConvert:::.checkMissing(test_data_all_missing) +expect_equal(result_all_missing, 1, info = ".checkMissing returns 1 for all missing values") + +# Test .checkIntensityDistribution function +test_data = create_test_data() + +# Test normal distribution +result = MSstatsConvert:::.checkIntensityDistribution(test_data) +expect_true(is.logical(result), info = ".checkIntensityDistribution returns logical value") + +# Test with low intensities that would trigger zero truncation warning +test_data_low = test_data +test_data_low$Intensity = c(rep(1, 10), rep(2, 15)) +result_low = MSstatsConvert:::.checkIntensityDistribution(test_data_low) +expect_true(is.logical(result_low), info = ".checkIntensityDistribution handles low intensity data") + +# Test .checkFeatureSD function +test_data = create_test_data() +test_data$Feature = paste(test_data$PeptideSequence, + test_data$PrecursorCharge, + test_data$FragmentIon, + test_data$ProductCharge, sep="_") + +result = MSstatsConvert:::.checkFeatureSD(test_data) + +# Check structure +expect_true(inherits(result, "data.table"), info = ".checkFeatureSD returns data.table") +expect_true("Feature" %in% names(result), info = ".checkFeatureSD includes Feature column") +expect_true("sd_Intensity" %in% names(result), info = ".checkFeatureSD includes sd_Intensity column") +expect_true("mean_Intensity" %in% names(result), info = ".checkFeatureSD includes mean_Intensity column") +expect_true("ratio" %in% names(result), info = ".checkFeatureSD includes ratio column") + +# Check that we get one row per unique feature +expected_features = unique(test_data$Feature) +expect_equal(nrow(result), length(expected_features), info = ".checkFeatureSD returns one row per feature") + +# Check that calculations make sense +expect_true(all(result$mean_Intensity > 0, na.rm = TRUE), info = ".checkFeatureSD mean intensities are positive") +expect_true(all(result$sd_Intensity >= 0, na.rm = TRUE), info = ".checkFeatureSD standard deviations are non-negative") +expect_true(all(result$ratio >= 0, na.rm = TRUE), info = ".checkFeatureSD ratios are non-negative") + +# Test .checkFeatureOutliers function +test_data = create_test_data() +test_data$Feature = paste(test_data$PeptideSequence, + test_data$PrecursorCharge, + test_data$FragmentIon, + test_data$ProductCharge, sep="_") + +feature_data = MSstatsConvert:::.checkFeatureSD(test_data) +result = MSstatsConvert:::.checkFeatureOutliers(test_data, feature_data) + +# Check structure +expect_true(is.list(result), info = ".checkFeatureOutliers returns list") +expect_equal(length(result), 2, info = ".checkFeatureOutliers returns list of length 2") + +enhanced_feature_data = result[[1]] +outlier_percent = result[[2]] + +expect_true("outliers" %in% names(enhanced_feature_data), info = ".checkFeatureOutliers adds outliers column") +expect_true(inherits(enhanced_feature_data, "data.table"), info = ".checkFeatureOutliers returns data.table as first element") +expect_true(is.numeric(outlier_percent), info = ".checkFeatureOutliers returns numeric outlier percentage") +expect_true(outlier_percent >= 0 && outlier_percent <= 1, info = ".checkFeatureOutliers outlier percentage is between 0 and 1") + +# Test .checkFeatureCoverage function +test_data = create_test_data() +test_data$Feature = paste(test_data$PeptideSequence, + test_data$PrecursorCharge, + test_data$FragmentIon, + test_data$ProductCharge, sep="_") + +feature_data = MSstatsConvert:::.checkFeatureSD(test_data) +result = MSstatsConvert:::.checkFeatureCoverage(test_data, feature_data) + +# Check structure +expect_true(inherits(result, "data.table"), info = ".checkFeatureCoverage returns data.table") +expect_true("percent_missing" %in% names(result), info = ".checkFeatureCoverage includes percent_missing column") + +# With no missing values, percent_missing should be 0 +expect_true(all(result$percent_missing == 0), info = ".checkFeatureCoverage shows 0% missing for complete data") + +# Test with missing values +test_data_missing = test_data +test_data_missing$Intensity[1:3] = NA +result_missing = MSstatsConvert:::.checkFeatureCoverage(test_data_missing, feature_data) + +# First feature should have some missing percentage +first_feature = result_missing$Feature[1] +first_feature_missing = result_missing[Feature == first_feature]$percent_missing +expect_true(first_feature_missing > 0, info = ".checkFeatureCoverage detects missing values") + +# Test pearson_skewness function +# Test symmetric distribution (should be near 0) +symmetric_data = c(1, 2, 3, 4, 4, 3, 2, 1, 5) +skew_symmetric = MSstatsConvert:::pearson_skewness(symmetric_data) +expect_true(abs(skew_symmetric) < 0.5, info = "pearson_skewness returns near-zero for symmetric data") + +# Test right-skewed distribution (should be positive) +right_skewed = c(1, 1, 1, 1, 2, 2, 3, 4, 10) +skew_right = MSstatsConvert:::pearson_skewness(right_skewed) +expect_true(skew_right > 0, info = "pearson_skewness returns positive for right-skewed data") + +# Test left-skewed distribution (should be negative) +left_skewed = c(10, 10, 10, 10, 9, 9, 8, 7, 1) +skew_left = MSstatsConvert:::pearson_skewness(left_skewed) +expect_true(skew_left < 0, info = "pearson_skewness returns negative for left-skewed data") + +# Test .checkAnomalySkew function +test_data = create_test_data() +result = MSstatsConvert:::.checkAnomalySkew(test_data) + +# Check structure +expect_true(inherits(result, "data.table"), info = ".checkAnomalySkew returns data.table") +expect_true("PSM" %in% names(result), info = ".checkAnomalySkew includes PSM column") +expect_true("skew" %in% names(result), info = ".checkAnomalySkew includes skew column") + +# Check that we get one row per unique PSM +expected_psms = unique(paste(test_data$PeptideSequence, + test_data$PrecursorCharge, sep="_")) +expect_equal(nrow(result), length(expected_psms), info = ".checkAnomalySkew returns one row per PSM") + +# Check that skewness values are numeric and finite (FAIL) +expect_true(all(is.numeric(result$skew)), info = ".checkAnomalySkew returns numeric skewness values") +expect_true(all(is.finite(result$skew)), info = ".checkAnomalySkew returns finite skewness values") + +# Verify PSM creation +expected_psm_1 = paste("AEFEEQNVR", "2", sep="_") +expect_true(expected_psm_1 %in% result$PSM, info = ".checkAnomalySkew creates correct PSM identifiers") + +# Test CheckDataHealth main function +test_data = create_test_data() +result = CheckDataHealth(test_data) + +# Check structure +expect_true(is.list(result), info = "CheckDataHealth returns list") +expect_equal(length(result), 2, info = "CheckDataHealth returns list of length 2") + +feature_data = result[[1]] +skew_results = result[[2]] + +# Check feature_data structure +expect_true(inherits(feature_data, "data.table"), info = "CheckDataHealth returns feature_data as data.table") +expected_cols = c("Feature", "sd_Intensity", "mean_Intensity", + "ratio", "outliers", "percent_missing") +expect_true(all(expected_cols %in% names(feature_data)), info = "CheckDataHealth feature_data has all expected columns") + +# Check skew_results structure +expect_true(inherits(skew_results, "data.table"), info = "CheckDataHealth returns skew_results as data.table") +expect_true(all(c("PSM", "skew") %in% names(skew_results)), info = "CheckDataHealth skew_results has expected columns") + +# Verify Feature creation in main function +expected_feature = paste("AEFEEQNVR", "2", "y3", "1", sep="_") +expect_true(expected_feature %in% feature_data$Feature, info = "CheckDataHealth creates correct Feature identifiers") + +# Test with all same intensities (zero variance) +test_data_same = create_test_data() +test_data_same$Intensity = 1000 +result_same = CheckDataHealth(test_data_same) +expect_true(is.list(result_same), info = "CheckDataHealth handles zero variance data") + +# Check that SD is 0 for features with same intensities +feature_data_same = result_same[[1]] +expect_true(all(feature_data_same$sd_Intensity == 0), info = "CheckDataHealth correctly calculates zero SD for identical intensities") + +# Test CheckDataHealth with different anomaly score patterns +test_data = create_test_data() + +# Test with highly skewed anomaly scores +test_data_skewed = test_data +test_data_skewed$AnomalyScores = c(rep(0.01, 20), rep(0.99, 5)) +result_skewed = CheckDataHealth(test_data_skewed) + +skew_results = result_skewed[[2]] +expect_true(inherits(skew_results, "data.table"), info = "CheckDataHealth handles skewed anomaly scores") +expect_true(all(c("PSM", "skew") %in% names(skew_results)), info = "CheckDataHealth skew results maintain structure with skewed data") + +# Test with uniform anomaly scores +test_data_uniform = test_data +test_data_uniform$AnomalyScores = 0.5 +result_uniform = CheckDataHealth(test_data_uniform) + +skew_uniform = result_uniform[[2]] +# Uniform distribution should have skewness near 0 (but will be NaN for truly uniform) +expect_true(all(is.finite(skew_uniform$skew) | is.nan(skew_uniform$skew)), info = "CheckDataHealth handles uniform anomaly scores appropriately") + +# Test additional edge case: different anomaly score distributions per PSM +test_data_mixed = create_test_data() +# Create more varied anomaly scores to test skewness calculation +test_data_mixed$AnomalyScores[1:5] = c(0.1, 0.2, 0.3, 0.8, 0.9) # Right skewed for first PSM +test_data_mixed$AnomalyScores[6:10] = c(0.9, 0.8, 0.3, 0.2, 0.1) # Left skewed for second PSM +result_mixed = CheckDataHealth(test_data_mixed) + +skew_mixed = result_mixed[[2]] +expect_equal(nrow(skew_mixed), 5, info = "CheckDataHealth calculates skewness for all unique PSMs") +expect_true(any(skew_mixed$skew > 0) || any(skew_mixed$skew < 0), info = "CheckDataHealth detects skewness in anomaly scores") + diff --git a/man/MSstatsConvert.Rd b/man/MSstatsConvert.Rd index ab9be157..387e6ff6 100644 --- a/man/MSstatsConvert.Rd +++ b/man/MSstatsConvert.Rd @@ -1,7 +1,8 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/utils_MSstatsConvert.R -\docType{_PACKAGE} +\docType{package} \name{MSstatsConvert} +\alias{MSstatsConvert-package} \alias{MSstatsConvert} \title{MSstatsConvert: An R Package to Convert Data from Mass Spectrometry Signal Processing Tools to MSstats Format} \description{ @@ -17,3 +18,17 @@ signal processing tools to a format suitable for statistical analysis with the M \code{\link{MSstatsBalancedDesign}} for handling fractions and creating balanced data. } +\author{ +\strong{Maintainer}: Mateusz Staniak \email{mtst@mstaniak.pl} + +Authors: +\itemize{ + \item Devon Kohler \email{kohler.d@northeastern.edu} + \item Anthony Wu \email{wu.anthon@northeastern.edu} + \item Meena Choi \email{mnchoi67@gmail.com} + \item Ting Huang \email{thuang0703@gmail.com} + \item Olga Vitek \email{o.vitek@northeastern.edu} +} + +} +\keyword{internal} diff --git a/man/SpectronauttoMSstatsFormat.Rd b/man/SpectronauttoMSstatsFormat.Rd index 9613465c..7e93e486 100644 --- a/man/SpectronauttoMSstatsFormat.Rd +++ b/man/SpectronauttoMSstatsFormat.Rd @@ -57,7 +57,7 @@ We assume to use unique peptide for each protein.} \item{anomalyModelFeatures}{character vector of quality metric column names to be used as features in the anomaly detection model. List must not be empty if calculateAnomalyScores=TRUE.} -\item{anomalyModelFeatureTemporal}{character vector of temporal direction corresponding to columns passed to anomalyModelFeatures. Values must be one of: \code{mean_decrease}, \code{mean_incrase}, \code{dispersion_increase}, or NULL (to perform no temporal feature engineering). Default is empty vector. If calculateAnomalyScores=TRUE, vector must have as many values as anomalyModelFeatures (even if all NULL).} +\item{anomalyModelFeatureTemporal}{character vector of temporal direction corresponding to columns passed to anomalyModelFeatures. Values must be one of: \code{mean_decrease}, \code{mean_increase}, \code{dispersion_increase}, or NULL (to perform no temporal feature engineering). Default is empty vector. If calculateAnomalyScores=TRUE, vector must have as many values as anomalyModelFeatures (even if all NULL).} \item{removeMissingFeatures}{Remove features with missing values in more than this fraction of runs. Default is 0.5. Only used if calculateAnomalyScores=TRUE.} diff --git a/man/dot-MSstatsFormat.Rd b/man/dot-MSstatsFormat.Rd index 784a4de8..33601830 100644 --- a/man/dot-MSstatsFormat.Rd +++ b/man/dot-MSstatsFormat.Rd @@ -4,10 +4,12 @@ \alias{.MSstatsFormat} \title{Output format for further analysis by MSstats} \usage{ -.MSstatsFormat(input, anomaly_metrics) +.MSstatsFormat(input, anomaly_metrics = c()) } \arguments{ \item{input}{data.table} + +\item{anomaly_metrics}{character vector of quality metric column names to be used as features in an anomaly detection model} } \value{ object of class MSstatsValidated that inherits from data.frame diff --git a/man/dot-cleanByFeature.Rd b/man/dot-cleanByFeature.Rd index 151ed40d..31aca239 100644 --- a/man/dot-cleanByFeature.Rd +++ b/man/dot-cleanByFeature.Rd @@ -4,7 +4,12 @@ \alias{.cleanByFeature} \title{Perform by-feature operations.} \usage{ -.cleanByFeature(input, feature_columns, cleaning_control, anomaly_metrics) +.cleanByFeature( + input, + feature_columns, + cleaning_control, + anomaly_metrics = c() +) } \arguments{ \item{input}{\code{data.table} preprocessed by one of the cleanRaw* functions.} @@ -13,6 +18,9 @@ \item{cleaning_control}{named list of two or three elements. See the documentation for \code{MSstatsImport} for details.} + +\item{anomaly_metrics}{character vector of quality metric column names to be +used as features in an anomaly detection model.} } \value{ \code{data.table} diff --git a/man/dot-makeBalancedDesign.Rd b/man/dot-makeBalancedDesign.Rd index b02f31c0..fc91a6a3 100644 --- a/man/dot-makeBalancedDesign.Rd +++ b/man/dot-makeBalancedDesign.Rd @@ -4,12 +4,15 @@ \alias{.makeBalancedDesign} \title{Fill missing rows to create balanced design} \usage{ -.makeBalancedDesign(input, fill_missing, anomaly_metrics) +.makeBalancedDesign(input, fill_missing, anomaly_metrics = c()) } \arguments{ \item{input}{output of \code{MSstatsPreprocess}} -\item{fill_missing}{if TRUE, missing Intensities values will be added to data +\item{fill_missing}{if TRUE, missing Intensities values will be added to data} + +\item{anomaly_metrics}{character vector of quality metric column names to be +used as features in an anomaly detection model. and marked as NA} } \value{ diff --git a/man/dot-summarizeMultipleMeasurements.Rd b/man/dot-summarizeMultipleMeasurements.Rd index 99f346de..3ffd104b 100644 --- a/man/dot-summarizeMultipleMeasurements.Rd +++ b/man/dot-summarizeMultipleMeasurements.Rd @@ -17,6 +17,9 @@ \item{aggregator}{function that will be used to aggregate duplicated values.} \item{feature_columns}{chr, vector of names of columns that define features.} + +\item{anomaly_metrics}{character vector of quality metric column names +to be used as features in an anomaly detection model.} } \value{ \code{data.table} diff --git a/man/dot-validateMSstatsConverterParameters.Rd b/man/dot-validateMSstatsConverterParameters.Rd new file mode 100644 index 00000000..58bffafc --- /dev/null +++ b/man/dot-validateMSstatsConverterParameters.Rd @@ -0,0 +1,46 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils_checks.R +\name{.validateMSstatsConverterParameters} +\alias{.validateMSstatsConverterParameters} +\title{Generic parameter validation for all MSstats converters using configuration object} +\usage{ +.validateMSstatsConverterParameters(config) +} +\arguments{ +\item{config}{A list containing all converter parameters. See details for required structure.} +} +\value{ +NULL (throws error if validation fails) +} +\description{ +Generic parameter validation for all MSstats converters using configuration object +} +\details{ +The config list should contain the input and optionally other parameters: +\itemize{ +\item input: input data (required) +\item annotation: annotation data (optional) +\item intensity: intensity type (optional) +\item filter_with_Qvalue: Q-value filter setting (default: FALSE) +\item qvalue_cutoff: Q-value cutoff (default: 0.01) +\item useUniquePeptide: unique peptide setting (default: TRUE) +\item removeFewMeasurements: remove few measurements setting (default: TRUE) +\item removeProtein_with1Feature: remove single feature proteins setting (default: FALSE) +\item summaryforMultipleRows: aggregation function (default: max) +\item calculateAnomalyScores: anomaly detection setting (default: FALSE) +\item anomalyModelFeatures: anomaly model features (default: c()) +\item anomalyModelFeatureTemporal: temporal features (default: c()) +\item removeMissingFeatures: missing feature threshold (default: 0.5) +\item anomalyModelFeatureCount: feature count for anomaly model (default: 100) +\item runOrder: run order data (default: NULL) +\item n_trees: number of trees (default: 100) +\item max_depth: max tree depth (default: "auto") +\item numberOfCores: number of cores (default: 1) +\item use_log_file: logging setting (default: TRUE) +\item append: append setting (default: FALSE) +\item verbose: verbose setting (default: TRUE) +\item log_file_path: log file path (default: NULL) +\item excludedFromQuantificationFilter: filter setting (default: NULL) +} +} +\keyword{internal} diff --git a/src/MSstatsConvert.so b/src/MSstatsConvert.so index d404a4d9..5b583b4b 100755 Binary files a/src/MSstatsConvert.so and b/src/MSstatsConvert.so differ diff --git a/src/RcppExports.o b/src/RcppExports.o index 4f7ce75c..6be4a24a 100644 Binary files a/src/RcppExports.o and b/src/RcppExports.o differ diff --git a/src/isolation_forest.o b/src/isolation_forest.o index 975f11e3..93693eac 100644 Binary files a/src/isolation_forest.o and b/src/isolation_forest.o differ