Vitek-Lab · devonjkohler · Sep 9, 2025 · Jul 29, 2025 · Aug 22, 2025 · Sep 8, 2025
diff --git a/R/converters_SpectronauttoMSstatsFormat.R b/R/converters_SpectronauttoMSstatsFormat.R
@@ -8,7 +8,7 @@
 #' @param qvalue_cutoff Cutoff for EG.Qvalue. default is 0.01.
 #' @param calculateAnomalyScores Default is FALSE. If TRUE, will run anomaly detection model and calculate anomaly scores for each feature. Used downstream to weigh measurements in differential analysis.
 #' @param anomalyModelFeatures character vector of quality metric column names to be used as features in the anomaly detection model. List must not be empty if calculateAnomalyScores=TRUE.
-#' @param anomalyModelFeatureTemporal character vector of temporal direction corresponding to columns passed to anomalyModelFeatures. Values must be one of: `mean_decrease`, `mean_incrase`, `dispersion_increase`, or NULL (to perform no temporal feature engineering). Default is empty vector. If calculateAnomalyScores=TRUE, vector must have as many values as anomalyModelFeatures (even if all NULL).
+#' @param anomalyModelFeatureTemporal character vector of temporal direction corresponding to columns passed to anomalyModelFeatures. Values must be one of: `mean_decrease`, `mean_increase`, `dispersion_increase`, or NULL (to perform no temporal feature engineering). Default is empty vector. If calculateAnomalyScores=TRUE, vector must have as many values as anomalyModelFeatures (even if all NULL).
 #' @param removeMissingFeatures Remove features with missing values in more than this fraction of runs. Default is 0.5. Only used if calculateAnomalyScores=TRUE.
 #' @param anomalyModelFeatureCount Feature selection for anomaly model. Anomaly detection works on the precursor-level and can be much slower if all features used. We will by default filter to the top-100 highest intensity features. This can be adjusted as necessary. To turn feature-selection off, set this value to a high number (e.g. 10000). Only used if calculateAnomalyScores=TRUE.
 #' @param runOrder Temporal order of MS runs. Should be a two column data.table with columns `Run` and `Order`, where `Run` matches the run name output by Spectronaut and `Order` is an integer. Used to engineer the temporal features defined in anomalyModelFeatureTemporal.
@@ -45,6 +45,34 @@ SpectronauttoMSstatsFormat = function(
         use_log_file = TRUE, append = FALSE, verbose = TRUE, 
         log_file_path = NULL, ...
 ) {
+    validation_config = list(
+        input = input, 
+        annotation = annotation, 
+        intensity = intensity, 
+        excludedFromQuantificationFilter = excludedFromQuantificationFilter,
+        filter_with_Qvalue = filter_with_Qvalue, 
+        qvalue_cutoff = qvalue_cutoff, 
+        useUniquePeptide = useUniquePeptide, 
+        removeFewMeasurements = removeFewMeasurements,
+        removeProtein_with1Feature = removeProtein_with1Feature, 
+        summaryforMultipleRows = summaryforMultipleRows, 
+        calculateAnomalyScores = calculateAnomalyScores,
+        anomalyModelFeatures = anomalyModelFeatures, 
+        anomalyModelFeatureTemporal = anomalyModelFeatureTemporal, 
+        removeMissingFeatures = removeMissingFeatures,
+        anomalyModelFeatureCount = anomalyModelFeatureCount, 
+        runOrder = runOrder, 
+        n_trees = n_trees, 
+        max_depth = max_depth, 
+        numberOfCores = numberOfCores,
+        use_log_file = use_log_file, 
+        append = append, 
+        verbose = verbose, 
+        log_file_path = log_file_path
+    )
+
+    .validateMSstatsConverterParameters(validation_config)
+
     MSstatsConvert::MSstatsLogsSettings(use_log_file, append, verbose, 
                                         log_file_path)
 

diff --git a/R/utils_MSstatsConvert.R b/R/utils_MSstatsConvert.R
@@ -11,7 +11,6 @@
 #' \code{\link{MSstatsBalancedDesign}} for handling fractions and creating balanced data.
 #'
 #' @import data.table
-#' @docType _PACKAGE
 #' @name MSstatsConvert
-#'
-NULL
+#' @keywords internal
+"_PACKAGE"
diff --git a/R/utils_balanced_design.R b/R/utils_balanced_design.R
@@ -1,10 +1,12 @@
 #' Fill missing rows to create balanced design
 #' @param input output of `MSstatsPreprocess`
 #' @param fill_missing if TRUE, missing Intensities values will be added to data 
 #' and marked as NA
+#' @param anomaly_metrics character vector of quality metric column names to be
+#' used as features in an anomaly detection model.
 #' @return data.table
 #' @keywords internal
-.makeBalancedDesign = function(input, fill_missing, anomaly_metrics) {
+.makeBalancedDesign = function(input, fill_missing, anomaly_metrics = c()) {
     feature = NULL
 
     is_tmt = is.element("Channel", colnames(input))

diff --git a/R/utils_checks.R b/R/utils_checks.R
@@ -46,3 +46,208 @@
         chosen
     }
 }
+
+
+#' Generic parameter validation for all MSstats converters using configuration object
+#'
+#' Merges `config` over a table of defaults and checks each entry in turn,
+#' stopping with an error at the first invalid value.
+#'
+#' @param config A named list containing all converter parameters. See details
+#' for the recognized entries.
+#' @details
+#' The config list should contain the input and optionally other parameters:
+#' - input: input data (required); a single file path or a non-empty
+#'   data.frame / data.table
+#' - annotation: annotation data (optional); a single file path or a data.frame
+#' - intensity: intensity type (optional)
+#' - filter_with_Qvalue: Q-value filter setting (default: FALSE)
+#' - qvalue_cutoff: Q-value cutoff (default: 0.01)
+#' - useUniquePeptide: unique peptide setting (default: TRUE)
+#' - removeFewMeasurements: remove few measurements setting (default: TRUE)
+#' - removeProtein_with1Feature: remove single feature proteins setting (default: FALSE)
+#' - summaryforMultipleRows: aggregation function (default: max)
+#' - calculateAnomalyScores: anomaly detection setting (default: FALSE)
+#' - anomalyModelFeatures: anomaly model features (default: c())
+#' - anomalyModelFeatureTemporal: temporal features (default: c()); either
+#'   empty or one entry per anomaly model feature. Each entry must be one of
+#'   `mean_decrease`, `mean_increase`, `dispersion_increase`, or NULL (NULL
+#'   entries require passing a list)
+#' - removeMissingFeatures: missing feature threshold (default: 0.5)
+#' - anomalyModelFeatureCount: feature count for anomaly model (default: 100)
+#' - runOrder: run order data (default: NULL)
+#' - n_trees: number of trees (default: 100)
+#' - max_depth: max tree depth (default: "auto")
+#' - numberOfCores: number of cores (default: 1)
+#' - use_log_file: logging setting (default: TRUE)
+#' - append: append setting (default: FALSE)
+#' - verbose: verbose setting (default: TRUE)
+#' - log_file_path: log file path (default: NULL)
+#' - excludedFromQuantificationFilter: filter setting (default: NULL)
+#'
+#' Entries missing from `config` take the defaults listed above. An entry that
+#' is explicitly NULL is validated as NULL; it does not fall back to its
+#' default. The anomaly model settings (features, temporal directions, tree
+#' parameters, feature count, missing-feature threshold and run order) are
+#' only validated when calculateAnomalyScores is TRUE.
+#' @return NULL, invisibly (throws error if validation fails)
+#' @keywords internal
+.validateMSstatsConverterParameters = function(config) {
+
+    # Ensure config is a list
+    if (!is.list(config)) {
+        stop("Config must be a list")
+    }
+
+    # Define all defaults in one place
+    defaults = list(
+        input = NULL,
+        annotation = NULL,
+        intensity = NULL,
+        excludedFromQuantificationFilter = NULL,
+        filter_with_Qvalue = FALSE,
+        qvalue_cutoff = 0.01,
+        useUniquePeptide = TRUE,
+        removeFewMeasurements = TRUE,
+        removeProtein_with1Feature = FALSE,
+        summaryforMultipleRows = max,
+        calculateAnomalyScores = FALSE,
+        anomalyModelFeatures = c(),
+        anomalyModelFeatureTemporal = c(),
+        removeMissingFeatures = 0.5,
+        anomalyModelFeatureCount = 100,
+        runOrder = NULL,
+        n_trees = 100,
+        max_depth = "auto",
+        numberOfCores = 1,
+        use_log_file = TRUE,
+        append = FALSE,
+        verbose = TRUE,
+        log_file_path = NULL
+    )
+
+    # Merge config with defaults (config values override defaults).
+    # modifyList() drops an entry whose config value is NULL, so such an
+    # entry reads back as NULL below instead of as its default.
+    config = modifyList(defaults, config)
+
+    # Input data validation (fail immediately if data is invalid)
+    if (is.null(config$input)) {
+        stop("Input data cannot be NULL")
+    }
+
+    if (is.character(config$input)) {
+        # A character vector of length != 1 would make the `if` conditions
+        # below non-scalar, so reject it with a readable message first.
+        if (length(config$input) != 1) {
+            stop("Input file path must be a single character string")
+        }
+        if (!file.exists(config$input)) {
+            stop("Input file does not exist: ", config$input)
+        }
+        # Quick file size check for very large files
+        file_size_mb = file.size(config$input) / (1024^2)
+        if (file_size_mb > 1000) {  # Warn for files > 1GB
+            warning("Large input file detected (", round(file_size_mb, 1), " MB). ",
+                    "Consider validating parameters on a subset first.")
+        }
+    } else if (is.data.frame(config$input)) {
+        # A data.table is also a data.frame, so one test covers both.
+        if (nrow(config$input) == 0) {
+            stop("Input data is empty (0 rows)")
+        }
+        if (ncol(config$input) == 0) {
+            stop("Input data has no columns")
+        }
+    } else {
+        stop("Input must be a file path, data.frame, or data.table")
+    }
+
+    # Annotation validation
+    if (!is.null(config$annotation)) {
+        if (is.character(config$annotation)) {
+            if (length(config$annotation) != 1) {
+                stop("Annotation file path must be a single character string")
+            }
+            if (!file.exists(config$annotation)) {
+                stop("Annotation file does not exist: ", config$annotation)
+            }
+        } else if (!is.data.frame(config$annotation)) {
+            stop("Annotation must be NULL, a file path, data.frame, or data.table")
+        }
+    }
+
+    # Intensity validation (if provided)
+    checkmate::assertString(config$intensity, null.ok = TRUE)
+
+    # Q-value filtering parameters.
+    # assertFlag() requires a single non-missing TRUE/FALSE; assertLogical()
+    # with len = 1 would also let NA through.
+    checkmate::assertFlag(config$filter_with_Qvalue)
+    checkmate::assertNumber(config$qvalue_cutoff, lower = 0, upper = 1)
+
+    # Common processing parameters
+    checkmate::assertFlag(config$useUniquePeptide)
+    checkmate::assertFlag(config$removeFewMeasurements)
+    checkmate::assertFlag(config$removeProtein_with1Feature)
+
+    # Aggregation function validation
+    checkmate::assertFunction(config$summaryforMultipleRows, null.ok = TRUE)
+
+    # Core system parameters
+    checkmate::assertInt(config$numberOfCores, lower = 1)
+    checkmate::assertFlag(config$use_log_file)
+    checkmate::assertFlag(config$append)
+    checkmate::assertFlag(config$verbose)
+
+    # Converter-specific boolean parameters (if provided)
+    checkmate::assertFlag(config$excludedFromQuantificationFilter, null.ok = TRUE)
+
+    # Must be a definite TRUE/FALSE because it drives the branch below.
+    checkmate::assertFlag(config$calculateAnomalyScores)
+
+    # Anomaly detection parameter validation (converter-specific)
+    if (config$calculateAnomalyScores) {
+        # These validations only matter if anomaly detection is enabled
+        if (length(config$anomalyModelFeatures) == 0) {
+            stop("anomalyModelFeatures cannot be empty when calculateAnomalyScores=TRUE")
+        }
+        checkmate::assertCharacter(config$anomalyModelFeatures, min.len = 1)
+
+        temporal = config$anomalyModelFeatureTemporal
+        if (length(temporal) > 0) {
+            if (length(temporal) != length(config$anomalyModelFeatures)) {
+                stop("anomalyModelFeatureTemporal must have same length as anomalyModelFeatures or be empty")
+            }
+            valid_temporal = c("mean_decrease", "mean_increase", "dispersion_increase")
+            # NULL entries (only possible in a list) mean "no temporal
+            # feature engineering" and are skipped; check entry by entry,
+            # because is.null() on the whole vector is always FALSE here.
+            is_unset = vapply(temporal, is.null, logical(1))
+            requested = unlist(temporal[!is_unset])
+            invalid_temporal = requested[!requested %in% valid_temporal]
+            if (length(invalid_temporal) > 0) {
+                stop("Invalid temporal directions: ", paste(invalid_temporal, collapse = ", "),
+                     ". Must be one of: ", paste(valid_temporal, collapse = ", "), " or NULL")
+            }
+        }
+
+        checkmate::assertInt(config$n_trees, lower = 1)
+        if (is.character(config$max_depth)) {
+            checkmate::assertChoice(config$max_depth, choices = "auto")
+        } else {
+            checkmate::assertInt(config$max_depth, lower = 1)
+        }
+        checkmate::assertInt(config$anomalyModelFeatureCount, lower = 1)
+        checkmate::assertNumber(config$removeMissingFeatures, lower = 0, upper = 1)
+
+        if (!is.null(config$runOrder)) {
+            if (!is.data.frame(config$runOrder)) {
+                stop("runOrder must be a data.frame or data.table")
+            }
+            required_cols = c("Run", "Order")
+            missing_cols = setdiff(required_cols, colnames(config$runOrder))
+            if (length(missing_cols) > 0) {
+                stop("runOrder is missing required columns: ", paste(missing_cols, collapse = ", "))
+            }
+            if (!is.numeric(config$runOrder$Order)) {
+                stop("runOrder$Order must be numeric")
+            }
+        }
+    } else {
+        # When anomaly detection is disabled, these parameters are ignored but warn user
+        if (length(config$anomalyModelFeatures) > 0) {
+            warning("anomalyModelFeatures provided but calculateAnomalyScores=FALSE, ignoring")
+        }
+        if (!is.null(config$runOrder)) {
+            warning("runOrder provided but calculateAnomalyScores=FALSE, ignoring")
+        }
+    }
+
+    # Log file validation
+    if (!is.null(config$log_file_path)) {
+        checkmate::assertString(config$log_file_path)
+        log_dir = dirname(config$log_file_path)
+        if (!dir.exists(log_dir)) {
+            stop("Log file directory does not exist: ", log_dir)
+        }
+        # file.access() returns 0 on success; mode = 2 tests write permission.
+        if (file.access(log_dir, mode = 2) != 0) {
+            stop("No write permission for log file directory: ", log_dir)
+        }
+    }
+
+    invisible(NULL)
+}
diff --git a/R/utils_classes.R b/R/utils_classes.R
@@ -5,10 +5,11 @@ setOldClass("MSstatsValidated", S4Class = "MSstatsValidated")
 
 #' Output format for further analysis by MSstats
 #' @param input data.table
+#' @param anomaly_metrics character vector of quality metric column names to be used as features in an anomaly detection model
 #' @importFrom methods new
 #' @return object of class MSstatsValidated that inherits from data.frame
 #' @keywords internal
-.MSstatsFormat = function(input, anomaly_metrics) {
+.MSstatsFormat = function(input, anomaly_metrics = c()) {
+    # Pass the data through .selectMSstatsColumns together with
+    # anomaly_metrics (presumably so the requested quality metric columns
+    # are kept next to the standard ones - confirm in that helper).
     input = .selectMSstatsColumns(input, anomaly_metrics)
+    # Convert to a plain data.frame and wrap it in the MSstatsValidated class.
     new("MSstatsValidated", as.data.frame(input))
 }

diff --git a/R/utils_clean_features.R b/R/utils_clean_features.R
@@ -3,10 +3,12 @@
 #' @param feature_columns character vector of names of columns that define features.
 #' @param cleaning_control named list of two or three elements. 
 #' See the documentation for `MSstatsImport` for details.
+#' @param anomaly_metrics character vector of quality metric column names to be 
+#' used as features in an anomaly detection model.
 #' @return `data.table`
 #' @keywords internal 
 .cleanByFeature = function(input, feature_columns, 
-                           cleaning_control, anomaly_metrics) {
+                           cleaning_control, anomaly_metrics = c()) {
     if (is.element("Channel", colnames(input))) {
         input = .filterFewMeasurements(
             input, 0, 
@@ -86,7 +88,9 @@
 #' Summarize multiple measurements per feature in a single run
 #' @param input `data.table` pre-processed by one of the .cleanRaw* functions.
 #' @param aggregator function that will be used to aggregate duplicated values.
-#' @param feature_columns chr, vector of names of columns that define features. 
+#' @param feature_columns chr, vector of names of columns that define features.
+#' @param anomaly_metrics character vector of quality metric column names 
+#' to be used as features in an anomaly detection model.
 #' @return `data.table`
 #' @keywords internal
 .summarizeMultipleMeasurements = function(input, aggregator, 

diff --git a/inst/tinytest/test_cleanRaw.R b/inst/tinytest/test_cleanRaw.R
@@ -193,22 +193,31 @@ expect_true(nrow(sm_cleaned) > 0)
 expect_error(MSstatsConvert::MSstatsClean(spectromine_import_error))
 # Spectronaut
 spectronaut_input = data.table::fread("./raw_data/Spectronaut/spectronaut_input.csv")
-spectronaut_input2 = data.table::copy(spectronaut_input)
-spectronaut_input2$F.ExcludedFromQuantification = ifelse(
-    spectronaut_input2$F.ExcludedFromQuantification,
-    "True", "False"
-)
 spectronaut_import = MSstatsConvert::MSstatsImport(list(input = spectronaut_input), 
                                                    "MSstats", "Spectronaut")
-spectronaut_import2 = MSstatsConvert::MSstatsImport(list(input = spectronaut_input2), 
-                                                    "MSstats", "Spectronaut")
 sn_cleaned = MSstatsConvert::MSstatsClean(spectronaut_import,
-                                          intensity = "PeakArea")
-sn_cleaned2 = MSstatsConvert::MSstatsClean(spectronaut_import2,
-                                           intensity = "PeakArea")
-expect_equal(ncol(sn_cleaned), 11)
+                                          intensity = "PeakArea", 
+                                          calculateAnomalyScores = FALSE, 
+                                          anomalyModelFeatures = c())
+expect_equal(ncol(sn_cleaned), 12)
 expect_true(nrow(sn_cleaned) > 0)
-expect_equal(sn_cleaned, sn_cleaned2)
+
+# Test new peak quality columns
+# Work on an explicit copy so the shared `spectronaut_input` fixture cannot be
+# altered by the column additions below (data.table objects are otherwise
+# shared by reference on plain assignment).
+spectronaut_input_2 = data.table::copy(spectronaut_input)
+spectronaut_input_2$`FG.ShapeQualityScore (MS2)` = 1
+spectronaut_input_2$`FG.ShapeQualityScore (MS1)` = 1
+spectronaut_input_2$`EG.ApexRT` = 1
+spectronaut_input_2$`F.PossibleInterference` = TRUE
+spectronaut_import_2 = MSstatsConvert::MSstatsImport(
+    list(input = spectronaut_input_2), "MSstats", "Spectronaut"
+)
+# NOTE(review): the feature names below omit the dots and spaces of the raw
+# column names - presumably they match the names after import; confirm.
+sn_cleaned_2 = MSstatsConvert::MSstatsClean(
+    spectronaut_import_2,
+    intensity = "PeakArea",
+    calculateAnomalyScores = TRUE,
+    anomalyModelFeatures = c("FGShapeQualityScore(MS2)",
+                             "FGShapeQualityScore(MS1)",
+                             "EGApexRT")
+)
+# The cleaned table is expected to have 16 columns and at least one row.
+expect_equal(ncol(sn_cleaned_2), 16)
+expect_true(nrow(sn_cleaned_2) > 0)
 
 # Metamorpheus
 metamorpheus_table = data.table::fread("./raw_data/Metamorpheus/QuantifiedPeaks.tsv")