Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 29 additions & 1 deletion R/converters_SpectronauttoMSstatsFormat.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
#' @param qvalue_cutoff Cutoff for EG.Qvalue. default is 0.01.
#' @param calculateAnomalyScores Default is FALSE. If TRUE, will run anomaly detection model and calculate anomaly scores for each feature. Used downstream to weigh measurements in differential analysis.
#' @param anomalyModelFeatures character vector of quality metric column names to be used as features in the anomaly detection model. List must not be empty if calculateAnomalyScores=TRUE.
#' @param anomalyModelFeatureTemporal character vector of temporal direction corresponding to columns passed to anomalyModelFeatures. Values must be one of: `mean_decrease`, `mean_incrase`, `dispersion_increase`, or NULL (to perform no temporal feature engineering). Default is empty vector. If calculateAnomalyScores=TRUE, vector must have as many values as anomalyModelFeatures (even if all NULL).
#' @param anomalyModelFeatureTemporal character vector of temporal direction corresponding to columns passed to anomalyModelFeatures. Values must be one of: `mean_decrease`, `mean_increase`, `dispersion_increase`, or NULL (to perform no temporal feature engineering). Default is empty vector. If calculateAnomalyScores=TRUE, vector must have as many values as anomalyModelFeatures (even if all NULL).
#' @param removeMissingFeatures Remove features with missing values in more than this fraction of runs. Default is 0.5. Only used if calculateAnomalyScores=TRUE.
#' @param anomalyModelFeatureCount Feature selection for anomaly model. Anomaly detection works on the precursor-level and can be much slower if all features used. We will by default filter to the top-100 highest intensity features. This can be adjusted as necessary. To turn feature-selection off, set this value to a high number (e.g. 10000). Only used if calculateAnomalyScores=TRUE.
#' @param runOrder Temporal order of MS runs. Should be a two column data.table with columns `Run` and `Order`, where `Run` matches the run name output by Spectronaut and `Order` is an integer. Used to engineer the temporal features defined in anomalyModelFeatureTemporal.
Expand Down Expand Up @@ -45,6 +45,34 @@ SpectronauttoMSstatsFormat = function(
use_log_file = TRUE, append = FALSE, verbose = TRUE,
log_file_path = NULL, ...
) {
validation_config = list(
input = input,
annotation = annotation,
intensity = intensity,
excludedFromQuantificationFilter = excludedFromQuantificationFilter,
filter_with_Qvalue = filter_with_Qvalue,
qvalue_cutoff = qvalue_cutoff,
useUniquePeptide = useUniquePeptide,
removeFewMeasurements = removeFewMeasurements,
removeProtein_with1Feature = removeProtein_with1Feature,
summaryforMultipleRows = summaryforMultipleRows,
calculateAnomalyScores = calculateAnomalyScores,
anomalyModelFeatures = anomalyModelFeatures,
anomalyModelFeatureTemporal = anomalyModelFeatureTemporal,
removeMissingFeatures = removeMissingFeatures,
anomalyModelFeatureCount = anomalyModelFeatureCount,
runOrder = runOrder,
n_trees = n_trees,
max_depth = max_depth,
numberOfCores = numberOfCores,
use_log_file = use_log_file,
append = append,
verbose = verbose,
log_file_path = log_file_path
)

.validateMSstatsConverterParameters(validation_config)

MSstatsConvert::MSstatsLogsSettings(use_log_file, append, verbose,
log_file_path)

Expand Down
5 changes: 2 additions & 3 deletions R/utils_MSstatsConvert.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
#' \code{\link{MSstatsBalancedDesign}} for handling fractions and creating balanced data.
#'
#' @import data.table
#' @docType _PACKAGE
#' @name MSstatsConvert
#'
NULL
#' @keywords internal
"_PACKAGE"
4 changes: 3 additions & 1 deletion R/utils_balanced_design.R
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
#' Fill missing rows to create balanced design
#' @param input output of `MSstatsPreprocess`
#' @param fill_missing if TRUE, missing Intensities values will be added to data
#' and marked as NA
#' @param anomaly_metrics character vector of quality metric column names to be
#' used as features in an anomaly detection model.
#' @return data.table
#' @keywords internal
.makeBalancedDesign = function(input, fill_missing, anomaly_metrics) {
.makeBalancedDesign = function(input, fill_missing, anomaly_metrics = c()) {
feature = NULL

is_tmt = is.element("Channel", colnames(input))
Expand Down
205 changes: 205 additions & 0 deletions R/utils_checks.R
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,208 @@
chosen
}
}


#' Generic parameter validation for all MSstats converters using configuration object
#'
#' Merges the supplied configuration with the converter defaults and checks
#' every parameter. Stops with an error on the first invalid value and issues
#' warnings for settings that are usable but probably unintended.
#'
#' @param config A named list containing all converter parameters. Only
#' `input` is required; entries that are absent fall back to the defaults
#' listed in details.
#' @details
#' The config list should contain the input and optionally other parameters:
#' - input: input data, a single file path or a data.frame/data.table (required)
#' - annotation: annotation data, a single file path or a data.frame/data.table (optional)
#' - intensity: intensity type (optional)
#' - filter_with_Qvalue: Q-value filter setting (default: FALSE)
#' - qvalue_cutoff: Q-value cutoff (default: 0.01)
#' - useUniquePeptide: unique peptide setting (default: TRUE)
#' - removeFewMeasurements: remove few measurements setting (default: TRUE)
#' - removeProtein_with1Feature: remove single feature proteins setting (default: FALSE)
#' - summaryforMultipleRows: aggregation function (default: max)
#' - calculateAnomalyScores: anomaly detection setting (default: FALSE)
#' - anomalyModelFeatures: anomaly model features (default: c())
#' - anomalyModelFeatureTemporal: temporal features (default: c()). Either
#'   empty or one entry per anomaly model feature; a list may be used so that
#'   individual entries can be NULL (no temporal feature engineering).
#' - removeMissingFeatures: missing feature threshold (default: 0.5)
#' - anomalyModelFeatureCount: feature count for anomaly model (default: 100)
#' - runOrder: run order data (default: NULL)
#' - n_trees: number of trees (default: 100)
#' - max_depth: max tree depth (default: "auto")
#' - numberOfCores: number of cores (default: 1)
#' - use_log_file: logging setting (default: TRUE)
#' - append: append setting (default: FALSE)
#' - verbose: verbose setting (default: TRUE)
#' - log_file_path: log file path (default: NULL)
#' - excludedFromQuantificationFilter: filter setting (default: NULL)
#'
#' All logical settings must be a single non-missing TRUE/FALSE value.
#' @return NULL, invisibly (throws error if validation fails)
#' @keywords internal
.validateMSstatsConverterParameters = function(config) {

    # Ensure config is a list
    if (!is.list(config)) {
        stop("Config must be a list")
    }

    # Define all defaults in one place
    defaults = list(
        input = NULL,
        annotation = NULL,
        intensity = NULL,
        excludedFromQuantificationFilter = NULL,
        filter_with_Qvalue = FALSE,
        qvalue_cutoff = 0.01,
        useUniquePeptide = TRUE,
        removeFewMeasurements = TRUE,
        removeProtein_with1Feature = FALSE,
        summaryforMultipleRows = max,
        calculateAnomalyScores = FALSE,
        anomalyModelFeatures = c(),
        anomalyModelFeatureTemporal = c(),
        removeMissingFeatures = 0.5,
        anomalyModelFeatureCount = 100,
        runOrder = NULL,
        n_trees = 100,
        max_depth = "auto",
        numberOfCores = 1,
        use_log_file = TRUE,
        append = FALSE,
        verbose = TRUE,
        log_file_path = NULL
    )

    # Merge config with defaults (config values override defaults).
    # modifyList() drops entries supplied as NULL; reading them back with `$`
    # still yields NULL, which the checks below treat as "not provided".
    config = modifyList(defaults, config)

    # Input data validation (fail immediately if data is invalid)
    if (is.null(config$input)) {
        stop("Input data cannot be NULL")
    }

    if (is.character(config$input)) {
        # A vector of paths (or character(0)) would otherwise fail inside
        # `if (!file.exists(...))` with an unrelated condition-length error.
        if (length(config$input) != 1L) {
            stop("Input file path must be a single character string")
        }
        if (!file.exists(config$input)) {
            stop("Input file does not exist: ", config$input)
        }
        # Quick file size check for very large files
        file_size_mb = file.size(config$input) / (1024^2)
        if (file_size_mb > 1000) { # Warn for files > 1GB
            warning("Large input file detected (", round(file_size_mb, 1), " MB). ",
                    "Consider validating parameters on a subset first.")
        }
    } else if (is.data.frame(config$input)) {
        # data.tables inherit from data.frame, so this branch covers both.
        if (nrow(config$input) == 0) {
            stop("Input data is empty (0 rows)")
        }
        if (ncol(config$input) == 0) {
            stop("Input data has no columns")
        }
    } else {
        stop("Input must be a file path, data.frame, or data.table")
    }

    # Annotation validation
    if (!is.null(config$annotation)) {
        if (is.character(config$annotation)) {
            if (length(config$annotation) != 1L) {
                stop("Annotation file path must be a single character string")
            }
            if (!file.exists(config$annotation)) {
                stop("Annotation file does not exist: ", config$annotation)
            }
        } else if (!is.data.frame(config$annotation)) {
            stop("Annotation must be NULL, a file path, data.frame, or data.table")
        }
    }

    # Intensity validation (if provided)
    if (!is.null(config$intensity)) {
        checkmate::assertString(config$intensity)
    }

    # Q-value filtering parameters.
    # Flags are checked with any.missing = FALSE: NA is never a usable
    # TRUE/FALSE setting and would otherwise slip through to `if ()`.
    checkmate::assertLogical(config$filter_with_Qvalue, len = 1,
                             any.missing = FALSE)
    checkmate::assertNumber(config$qvalue_cutoff, lower = 0, upper = 1)

    # Common processing parameters
    checkmate::assertLogical(config$useUniquePeptide, len = 1,
                             any.missing = FALSE)
    checkmate::assertLogical(config$removeFewMeasurements, len = 1,
                             any.missing = FALSE)
    checkmate::assertLogical(config$removeProtein_with1Feature, len = 1,
                             any.missing = FALSE)

    # Aggregation function validation
    if (!is.null(config$summaryforMultipleRows)) {
        checkmate::assertFunction(config$summaryforMultipleRows)
    }

    # Core system parameters
    checkmate::assertInt(config$numberOfCores, lower = 1)
    checkmate::assertLogical(config$use_log_file, len = 1,
                             any.missing = FALSE)
    checkmate::assertLogical(config$append, len = 1, any.missing = FALSE)
    checkmate::assertLogical(config$verbose, len = 1, any.missing = FALSE)

    # Converter-specific boolean parameters (if provided)
    if (!is.null(config$excludedFromQuantificationFilter)) {
        checkmate::assertLogical(config$excludedFromQuantificationFilter,
                                 len = 1, any.missing = FALSE)
    }

    # Must be a definite TRUE/FALSE because it drives the branch below.
    checkmate::assertLogical(config$calculateAnomalyScores, len = 1,
                             any.missing = FALSE)

    # Anomaly detection parameter validation (converter-specific)
    if (config$calculateAnomalyScores) {
        # These validations only matter if anomaly detection is enabled
        if (length(config$anomalyModelFeatures) == 0) {
            stop("anomalyModelFeatures cannot be empty when calculateAnomalyScores=TRUE")
        }
        checkmate::assertCharacter(config$anomalyModelFeatures, min.len = 1)

        temporal = config$anomalyModelFeatureTemporal
        if (length(temporal) > 0) {
            if (length(temporal) != length(config$anomalyModelFeatures)) {
                stop("anomalyModelFeatureTemporal must have same length as anomalyModelFeatures or be empty")
            }
            valid_temporal = c("mean_decrease", "mean_increase", "dispersion_increase")
            # is.null() is not vectorised, so NULL entries (only possible
            # when a list is supplied) have to be detected per element.
            is_unset = vapply(temporal, is.null, logical(1), USE.NAMES = FALSE)
            temporal_values = vapply(
                temporal[!is_unset],
                function(value) paste(as.character(value), collapse = " "),
                character(1), USE.NAMES = FALSE
            )
            invalid_temporal = temporal_values[!temporal_values %in% valid_temporal]
            if (length(invalid_temporal) > 0) {
                stop("Invalid temporal directions: ", paste(invalid_temporal, collapse = ", "),
                     ". Must be one of: ", paste(valid_temporal, collapse = ", "), " or NULL")
            }
        }

        checkmate::assertInt(config$n_trees, lower = 1)
        if (is.character(config$max_depth)) {
            checkmate::assertChoice(config$max_depth, choices = "auto")
        } else {
            checkmate::assertInt(config$max_depth, lower = 1)
        }
        checkmate::assertInt(config$anomalyModelFeatureCount, lower = 1)
        checkmate::assertNumber(config$removeMissingFeatures, lower = 0, upper = 1)

        if (!is.null(config$runOrder)) {
            if (!is.data.frame(config$runOrder)) {
                stop("runOrder must be a data.frame or data.table")
            }
            required_cols = c("Run", "Order")
            missing_cols = setdiff(required_cols, colnames(config$runOrder))
            if (length(missing_cols) > 0) {
                stop("runOrder is missing required columns: ", paste(missing_cols, collapse = ", "))
            }
            if (!is.numeric(config$runOrder$Order)) {
                stop("runOrder$Order must be numeric")
            }
        }
    } else {
        # When anomaly detection is disabled, these parameters are ignored but warn user
        if (length(config$anomalyModelFeatures) > 0) {
            warning("anomalyModelFeatures provided but calculateAnomalyScores=FALSE, ignoring")
        }
        if (!is.null(config$runOrder)) {
            warning("runOrder provided but calculateAnomalyScores=FALSE, ignoring")
        }
    }

    # Log file validation
    if (!is.null(config$log_file_path)) {
        checkmate::assertString(config$log_file_path)
        log_dir = dirname(config$log_file_path)
        if (!dir.exists(log_dir)) {
            stop("Log file directory does not exist: ", log_dir)
        }
        # file.access() returns 0 on success and -1 on failure.
        if (file.access(log_dir, mode = 2) != 0) {
            stop("No write permission for log file directory: ", log_dir)
        }
    }

    invisible(NULL)
}
3 changes: 2 additions & 1 deletion R/utils_classes.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@ setOldClass("MSstatsValidated", S4Class = "MSstatsValidated")

#' Output format for further analysis by MSstats
#' @param input data.table
#' @param anomaly_metrics character vector of quality metric column names to be used as features in an anomaly detection model
#' @importFrom methods new
#' @return object of class MSstatsValidated that inherits from data.frame
#' @keywords internal
.MSstatsFormat = function(input, anomaly_metrics) {
.MSstatsFormat = function(input, anomaly_metrics = c()) {
    # Keep only the columns chosen by .selectMSstatsColumns(), which also
    # receives the anomaly metric column names.
    selected_columns = .selectMSstatsColumns(input, anomaly_metrics)
    # Convert to a plain data.frame before wrapping it in the
    # MSstatsValidated class.
    validated_frame = as.data.frame(selected_columns)
    new("MSstatsValidated", validated_frame)
}
Expand Down
8 changes: 6 additions & 2 deletions R/utils_clean_features.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@
#' @param feature_columns character vector of names of columns that define features.
#' @param cleaning_control named list of two or three elements.
#' See the documentation for `MSstatsImport` for details.
#' @param anomaly_metrics character vector of quality metric column names to be
#' used as features in an anomaly detection model.
#' @return `data.table`
#' @keywords internal
.cleanByFeature = function(input, feature_columns,
cleaning_control, anomaly_metrics) {
cleaning_control, anomaly_metrics = c()) {
if (is.element("Channel", colnames(input))) {
input = .filterFewMeasurements(
input, 0,
Expand Down Expand Up @@ -86,7 +88,9 @@
#' Summarize multiple measurements per feature in a single run
#' @param input `data.table` pre-processed by one of the .cleanRaw* functions.
#' @param aggregator function that will be used to aggregate duplicated values.
#' @param feature_columns chr, vector of names of columns that define features.
#' @param feature_columns chr, vector of names of columns that define features.
#' @param anomaly_metrics character vector of quality metric column names
#' to be used as features in an anomaly detection model.
#' @return `data.table`
#' @keywords internal
.summarizeMultipleMeasurements = function(input, aggregator,
Expand Down
33 changes: 21 additions & 12 deletions inst/tinytest/test_cleanRaw.R
Original file line number Diff line number Diff line change
Expand Up @@ -193,22 +193,31 @@ expect_true(nrow(sm_cleaned) > 0)
expect_error(MSstatsConvert::MSstatsClean(spectromine_import_error))
# Spectronaut
spectronaut_input = data.table::fread("./raw_data/Spectronaut/spectronaut_input.csv")
spectronaut_input2 = data.table::copy(spectronaut_input)
spectronaut_input2$F.ExcludedFromQuantification = ifelse(
spectronaut_input2$F.ExcludedFromQuantification,
"True", "False"
)
spectronaut_import = MSstatsConvert::MSstatsImport(list(input = spectronaut_input),
"MSstats", "Spectronaut")
spectronaut_import2 = MSstatsConvert::MSstatsImport(list(input = spectronaut_input2),
"MSstats", "Spectronaut")
sn_cleaned = MSstatsConvert::MSstatsClean(spectronaut_import,
intensity = "PeakArea")
sn_cleaned2 = MSstatsConvert::MSstatsClean(spectronaut_import2,
intensity = "PeakArea")
expect_equal(ncol(sn_cleaned), 11)
intensity = "PeakArea",
calculateAnomalyScores = FALSE,
anomalyModelFeatures = c())
expect_equal(ncol(sn_cleaned), 12)
expect_true(nrow(sn_cleaned) > 0)
expect_equal(sn_cleaned, sn_cleaned2)

# Test new peak quality columns
spectronaut_input_2 = spectronaut_input
spectronaut_input_2$`FG.ShapeQualityScore (MS2)` = 1
spectronaut_input_2$`FG.ShapeQualityScore (MS1)` = 1
spectronaut_input_2$`EG.ApexRT` = 1
spectronaut_input_2$`F.PossibleInterference` = TRUE
spectronaut_import_2 = MSstatsConvert::MSstatsImport(list(input = spectronaut_input_2),
"MSstats", "Spectronaut")
sn_cleaned_2 = MSstatsConvert::MSstatsClean(spectronaut_import_2,
intensity = "PeakArea",
calculateAnomalyScores = TRUE,
anomalyModelFeatures = c("FGShapeQualityScore(MS2)",
"FGShapeQualityScore(MS1)",
"EGApexRT"))
expect_equal(ncol(sn_cleaned_2), 16)
expect_true(nrow(sn_cleaned_2) > 0)

# Metamorpheus
metamorpheus_table = data.table::fread("./raw_data/Metamorpheus/QuantifiedPeaks.tsv")
Expand Down
Loading