Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 96 additions & 20 deletions R/clean_DIANN.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,42 +4,49 @@
#' @return data.table
#' @importFrom stats na.omit
#' @keywords internal
.cleanRawDIANN <- function(msstats_object, MBR = TRUE,
.cleanRawDIANN <- function(msstats_object, MBR = TRUE,
quantificationColumn = "FragmentQuantCorrected",
global_qvalue_cutoff = 0.01,
qvalue_cutoff = 0.01,
qvalue_cutoff = 0.01,
pg_qvalue_cutoff = 0.01,
calculateAnomalyScores = FALSE,
anomalyModelFeatures = c()) {
calculateAnomalyScores = FALSE,
anomalyModelFeatures = c(),
labeledAminoAcids = NULL) {
dn_input <- getInputFile(msstats_object, "input")
dn_input <- data.table::as.data.table(dn_input)

# Process quantification columns
quantificationColumn <- .cleanDIANNProcessQuantificationColumns(dn_input, quantificationColumn)

# Add missing columns
dn_input <- .cleanDIANNAddMissingColumns(dn_input)


has_channel <- !is.null(labeledAminoAcids) && "Channel" %in% colnames(dn_input)

# Select required columns
dn_input <- .cleanDIANNSelectRequiredColumns(dn_input, quantificationColumn, MBR,
calculateAnomalyScores,
anomalyModelFeatures)

anomalyModelFeatures,
has_channel = has_channel)

# Split concatenated values
dn_input <- .cleanDIANNSplitConcatenatedValues(dn_input, quantificationColumn)

# Process fragment information
dn_input <- .cleanDIANNProcessFragmentInfo(dn_input, quantificationColumn)

# Clean and filter data
dn_input <- .cleanDIANNCleanAndFilterData(dn_input, MBR, quantificationColumn,
dn_input <- .cleanDIANNCleanAndFilterData(dn_input, MBR, quantificationColumn,
global_qvalue_cutoff,
qvalue_cutoff,
qvalue_cutoff,
pg_qvalue_cutoff)

# Rename columns
dn_input <- .cleanDIANNRenameColumns(dn_input, quantificationColumn)


# Assign IsotopeLabelType for protein turnover
dn_input <- .assignDIANNIsotopeLabelType(dn_input, labeledAminoAcids, has_channel)

.logSuccess("DIANN", "clean")
dn_input
}
Expand Down Expand Up @@ -84,17 +91,22 @@
#' @noRd
.cleanDIANNSelectRequiredColumns <- function(dn_input, quantificationColumn, MBR,
calculateAnomalyScores,
anomalyModelFeatures) {
base_cols <- c('ProteinNames', 'StrippedSequence', 'ModifiedSequence',
'PrecursorCharge', quantificationColumn, 'QValue',
anomalyModelFeatures,
has_channel = FALSE) {
base_cols <- c('ProteinNames', 'StrippedSequence', 'ModifiedSequence',
'PrecursorCharge', quantificationColumn, 'QValue',
'PrecursorMz', 'FragmentInfo', 'Run')


if (has_channel) {
base_cols <- c(base_cols, 'Channel')
}

mbr_cols <- if (MBR) {
c('LibQValue', 'LibPGQValue')
} else {
c('GlobalQValue', 'GlobalPGQValue')
}

qual_cols <- if (calculateAnomalyScores) {
anomalyModelFeatures
} else {
Expand Down Expand Up @@ -207,6 +219,70 @@
return(dn_input)
}

#' Assign IsotopeLabelType for DIANN protein turnover workflows.
#'
#' Dispatches to one of two classification paths depending on \code{has_channel}:
#'
#' \strong{Channel-based path} (\code{has_channel = TRUE}): \code{Channel}
#' values are mapped directly to \code{IsotopeLabelType} (\code{"H"} →
#' \code{"H"}, \code{"L"} → \code{"L"}, anything else → \code{NA}), and the
#' \code{Channel} column is then dropped. \code{labeledAminoAcids} acts solely
#' as the opt-in flag that enables this path; the amino acid codes are
#' \strong{not} used to validate or filter \code{ModifiedSequence}.
#'
#' \strong{ModifiedSequence-parsing path} (\code{has_channel = FALSE}):
#' \code{PeptideSequence} (the retained \code{ModifiedSequence}) is matched
#' against SILAC suffixes of the form \code{(SILAC-<AA>-H)} or
#' \code{(SILAC-<AA>-L)}, where \code{<AA>} is any code in
#' \code{labeledAminoAcids}. Sequences carrying neither suffix receive
#' \code{IsotopeLabelType = NA}. The SILAC suffix is stripped from
#' \code{PeptideSequence} after classification.
#'
#' @param dn_input \code{data.table} after column renaming.
#' @param labeledAminoAcids Character vector of single-letter amino acid codes
#' (e.g. \code{c("K")} or \code{c("K", "R")}), or \code{NULL} to skip
#' protein-turnover mode entirely (backwards-compatible default).
#' @param has_channel Logical; \code{TRUE} when the raw input contained a
#' \code{Channel} column that was retained through
#' \code{.cleanDIANNSelectRequiredColumns}.
#' @return \code{dn_input} with \code{IsotopeLabelType} column added (and
#' \code{Channel} removed when \code{has_channel} is \code{TRUE}).
#' @keywords internal
#' @noRd
.assignDIANNIsotopeLabelType <- function(dn_input, labeledAminoAcids, has_channel) {
IsotopeLabelType = Channel = PeptideSequence = NULL

if (is.null(labeledAminoAcids)) {
return(dn_input)
}

if (has_channel) {
dn_input[, IsotopeLabelType := data.table::fcase(
Channel == "H", "H",
Channel == "L", "L",
default = NA_character_
)]
dn_input[, Channel := NULL]
} else {
aa_pattern <- paste0(labeledAminoAcids, collapse = "|")
heavy_regex <- paste0("\\(SILAC-(?:", aa_pattern, ")-H\\)")
light_regex <- paste0("\\(SILAC-(?:", aa_pattern, ")-L\\)")
strip_regex <- paste0("\\(SILAC-(?:", aa_pattern, ")-[HL]\\)")

dn_input <- .classifyIsotopeLabelType(dn_input, heavy_regex, light_regex)
dn_input[, PeptideSequence := gsub(strip_regex, "", PeptideSequence, perl = TRUE)]
}

if (all(is.na(dn_input[["IsotopeLabelType"]]))) {
warning("labeledAminoAcids was provided but no rows were classified as H or L. ",
"Check that the input contains either a Channel column with H/L values ",
"or ModifiedSequence entries with (SILAC-<AA>-H)/(SILAC-<AA>-L) suffixes.")
}

dn_input
}
Comment thread
tonywu1999 marked this conversation as resolved.


#' Rename columns to standardized names
#' @param dn_input data.table input
#' @param quantificationColumn quantification column name
Expand Down
29 changes: 11 additions & 18 deletions R/clean_Spectronaut.R
Original file line number Diff line number Diff line change
Expand Up @@ -183,32 +183,25 @@
#' @keywords internal
#' @noRd
.assignSpectronautIsotopeLabelType = function(spec_input, heavyLabels) {
IsotopeLabelType = PeptideSequence = StrippedSequence = NULL
PeptideSequence = NULL
if (is.null(heavyLabels)) {
return(spec_input)
}

bare_amino_acids = sub("\\[.*\\]", "", heavyLabels)
bare_amino_acids_pattern = paste0(bare_amino_acids, collapse = "|")
heavy_pattern = paste0(heavyLabels, collapse = "|")
heavy_brackets_escaped_pattern = paste(
gsub("([\\[\\]])", "\\\\\\1", heavy_pattern, perl = TRUE),
labeled_aa_regex = paste0(bare_amino_acids, collapse = "|")
heavy_regex = paste(
gsub("([\\[\\]])", "\\\\\\1", heavyLabels, perl = TRUE),
collapse = "|"
)

spec_input[, StrippedSequence := gsub("\\[.*?\\]", "", PeptideSequence)]

spec_input[, IsotopeLabelType := data.table::fcase(
grepl(heavy_brackets_escaped_pattern, PeptideSequence, perl = TRUE), "H",
grepl(bare_amino_acids_pattern, StrippedSequence, perl = TRUE), "L",
default = NA_character_
)]

spec_input[, StrippedSequence := NULL]


spec_input = .classifyIsotopeLabelType(spec_input, heavy_regex,
labeled_aa_regex = labeled_aa_regex)

for (i in seq_along(heavyLabels)) {
escaped = gsub("([\\[\\]])", "\\\\\\1", heavyLabels[i], perl = TRUE)
spec_input[, PeptideSequence := gsub(escaped, bare_amino_acids[i], PeptideSequence, perl = TRUE)]
spec_input[, PeptideSequence := gsub(escaped, bare_amino_acids[i],
PeptideSequence, perl = TRUE)]
}
spec_input
}
72 changes: 51 additions & 21 deletions R/converters_DIANNtoMSstatsFormat.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,33 @@
#' @param removeFewMeasurements should proteins with few measurements be removed
#' @param removeOxidationMpeptides should peptides with oxidation be removed
#' @param removeProtein_with1Feature should proteins with a single feature be removed
#' @param labeledAminoAcids Character vector of single-letter amino acid codes
#' that carry the SILAC label in protein turnover experiments, e.g.
#' \code{c("K")} or \code{c("K", "R")}. Supplying this vector opts in to
#' protein-turnover mode; the exact amino acids determine behaviour only in the
#' \code{ModifiedSequence}-parsing path described below.
#'
#' \strong{Channel-based path} (DIA-NN 2.x exports that include a
#' \code{Channel} column): when \code{labeledAminoAcids} is non-\code{NULL}
#' \emph{and} the input contains a \code{Channel} column, \code{Channel} values
#' are mapped directly to \code{IsotopeLabelType} (\code{"H"} → \code{"H"},
#' \code{"L"} → \code{"L"}, anything else → \code{NA}). The amino acid codes
#' in \code{labeledAminoAcids} are \strong{not} used to validate or filter
#' \code{ModifiedSequence} in this path.
#'
#' \strong{ModifiedSequence-parsing path} (DIA-NN 1.x exports without a
#' \code{Channel} column): when \code{labeledAminoAcids} is non-\code{NULL}
#' and no \code{Channel} column is present, each \code{ModifiedSequence} is
#' inspected for SILAC suffixes of the form \code{(SILAC-<AA>-H)} or
#' \code{(SILAC-<AA>-L)}, where \code{<AA>} is one of the supplied amino acid
#' codes. Matching sequences are classified as \code{"H"} or \code{"L"};
#' sequences carrying neither suffix receive \code{IsotopeLabelType = NA}.
#' The SILAC suffix is then stripped from \code{PeptideSequence}.
#'
#' When \code{NULL} (default), protein-turnover mode is disabled and all
#' peptides receive \code{IsotopeLabelType = "Light"}.
#' @param quantificationColumn Use 'FragmentQuantCorrected'(default) column for quantified intensities for DIANN 1.8.x.
#' Use 'FragmentQuantRaw' for quantified intensities for DIANN 1.9.x.
#' Use 'FragmentQuantRaw' for quantified intensities for DIANN 1.9.x.
#' Use 'auto' for quantified intensities for DIANN 2.x where each fragment intensity is a separate column, e.g. Fr0Quantity.
#' @param calculateAnomalyScores Default is FALSE. If TRUE, will run anomaly detection model and calculate anomaly scores for each feature. Used downstream to weigh measurements in differential analysis.
#' @param anomalyModelFeatures character vector of quality metric column names to be used as features in the anomaly detection model. List must not be empty if calculateAnomalyScores=TRUE.
Expand Down Expand Up @@ -63,19 +88,20 @@
DIANNtoMSstatsFormat = function(
input, annotation = NULL,
global_qvalue_cutoff = 0.01,
qvalue_cutoff = 0.01,
qvalue_cutoff = 0.01,
pg_qvalue_cutoff = 0.01,
useUniquePeptide = TRUE,
useUniquePeptide = TRUE,
removeFewMeasurements = TRUE,
removeOxidationMpeptides = TRUE,
removeOxidationMpeptides = TRUE,
removeProtein_with1Feature = TRUE,
MBR = TRUE,
MBR = TRUE,
labeledAminoAcids = NULL,
quantificationColumn = "FragmentQuantCorrected",
calculateAnomalyScores=FALSE, anomalyModelFeatures=c(),
anomalyModelFeatureTemporal=c(), removeMissingFeatures=.5,
anomalyModelFeatureCount=100,
runOrder=NULL, n_trees=100, max_depth="auto", numberOfCores=1,
use_log_file = TRUE, append = FALSE,
runOrder=NULL, n_trees=100, max_depth="auto", numberOfCores=1,
use_log_file = TRUE, append = FALSE,
verbose = TRUE, log_file_path = NULL,
...) {
MSstatsConvert::MSstatsLogsSettings(use_log_file, append, verbose,
Expand All @@ -86,42 +112,46 @@ DIANNtoMSstatsFormat = function(
input = MSstatsConvert::MSstatsImport(list(input = input),
"MSstats", "DIANN")

input = MSstatsConvert::MSstatsClean(input, MBR = MBR,
input = MSstatsConvert::MSstatsClean(input, MBR = MBR,
quantificationColumn = quantificationColumn,
global_qvalue_cutoff = global_qvalue_cutoff,
qvalue_cutoff = qvalue_cutoff,
qvalue_cutoff = qvalue_cutoff,
pg_qvalue_cutoff = pg_qvalue_cutoff,
calculateAnomalyScores = calculateAnomalyScores,
anomalyModelFeatures = anomalyModelFeatures)
calculateAnomalyScores = calculateAnomalyScores,
anomalyModelFeatures = anomalyModelFeatures,
labeledAminoAcids = labeledAminoAcids)
annotation = MSstatsConvert::MSstatsMakeAnnotation(input, annotation)

decoy_filter = list(col_name = "ProteinName",
pattern = c("DECOY", "Decoys"),
filter = T,
filter = T,
drop_column = FALSE)
oxidation_filter = list(col_name = "PeptideSequence",
pattern = "\\(UniMod\\:35\\)",
filter = removeOxidationMpeptides,
drop_column = FALSE)

feature_columns = c("PeptideSequence", "PrecursorCharge",
"FragmentIon", "ProductCharge")
# browser()
preprocess_feature_columns = if ("IsotopeLabelType" %in% colnames(input))
c(feature_columns, "IsotopeLabelType") else feature_columns
fill_isotope_label_type = if ("IsotopeLabelType" %in% colnames(input))
list() else list("IsotopeLabelType" = "Light")

input = MSstatsConvert::MSstatsPreprocess(
input,
annotation,
feature_columns,
input,
annotation,
preprocess_feature_columns,
remove_shared_peptides = useUniquePeptide,
remove_single_feature_proteins = removeProtein_with1Feature,
exact_filtering = NULL,
pattern_filtering = list(decoy = decoy_filter,
pattern_filtering = list(decoy = decoy_filter,
oxidation = oxidation_filter),
aggregate_isotopic = FALSE,
feature_cleaning = list(
remove_features_with_few_measurements = removeFewMeasurements,
summarize_multiple_psms = max),
columns_to_fill = list(Fraction = 1,
IsotopeLabelType = "Light"),
columns_to_fill = c(list(Fraction = 1), fill_isotope_label_type),
anomaly_metrics = anomalyModelFeatures)
input[, Intensity := ifelse(Intensity == 0, NA, Intensity)]
# browser()
Expand Down
54 changes: 54 additions & 0 deletions R/utils_clean_features.R
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,60 @@
}


#' Classify IsotopeLabelType from peptide sequence patterns.
#'
#' Shared core logic for protein turnover workflows in both Spectronaut and
#' DIANN converters. Each peptide is classified as heavy (\code{"H"}), light
#' (\code{"L"}), or unlabeled (\code{NA}) by matching regex patterns against
#' the \code{PeptideSequence} column.
#'
#' Two modes are supported:
#' \describe{
#' \item{Spectronaut mode}{Pass \code{labeled_aa_regex}. The sequence is
#' first stripped of all bracket modifications; light is inferred when the
#' bare labeled amino acid is present but the heavy bracket form is absent.}
#' \item{DIANN mode}{Pass \code{light_regex}. Both heavy and light patterns
#' are matched directly against the modified sequence; absence of either
#' yields \code{NA}.}
#' }
#'
#' @param dt \code{data.table} with a \code{PeptideSequence} column.
#' @param heavy_regex Perl-compatible regex matching heavy-labeled sequences.
#' @param light_regex Perl-compatible regex explicitly matching light-labeled
#' sequences. Required for DIANN mode; must be non-\code{NULL}.
#' @param labeled_aa_regex Perl-compatible regex for bare labeled amino acids
#' applied to the bracket-stripped sequence to detect light peptides.
#' Required for Spectronaut mode; must be non-\code{NULL}.
#' Exactly one of \code{light_regex} and \code{labeled_aa_regex} must be
#' supplied.
#' @return \code{dt} with \code{IsotopeLabelType} column added or updated.
#' @keywords internal
#' @noRd
.classifyIsotopeLabelType = function(dt, heavy_regex,
light_regex = NULL,
labeled_aa_regex = NULL) {
IsotopeLabelType = PeptideSequence = StrippedSequence = NULL

if (!is.null(labeled_aa_regex)) {
dt[, StrippedSequence := gsub("\\[.*?\\]", "", PeptideSequence)]
dt[, IsotopeLabelType := data.table::fcase(
grepl(heavy_regex, PeptideSequence, perl = TRUE), "H",
grepl(labeled_aa_regex, StrippedSequence, perl = TRUE), "L",
default = NA_character_
)]
dt[, StrippedSequence := NULL]
} else if (!is.null(light_regex)) {
dt[, IsotopeLabelType := data.table::fcase(
grepl(heavy_regex, PeptideSequence, perl = TRUE), "H",
grepl(light_regex, PeptideSequence, perl = TRUE), "L",
default = NA_character_
)]
}

dt
}


#' Fix invalid intensities: infinite to NA, between 0 and 1 to 0
#' @param input data.table
#' @return data.table
Expand Down
Binary file added inst/tinytest/raw_data/DIANN/PXD055942.parquet
Binary file not shown.
Loading
Loading