Vitek-Lab · tonywu1999 · Apr 17, 2026 · Apr 16, 2026 · Apr 17, 2026 · Apr 17, 2026
diff --git a/R/clean_DIANN.R b/R/clean_DIANN.R
@@ -4,42 +4,49 @@
 #' @return data.table
 #' @importFrom stats na.omit
 #' @keywords internal
-.cleanRawDIANN <- function(msstats_object, MBR = TRUE, 
+.cleanRawDIANN <- function(msstats_object, MBR = TRUE,
                            quantificationColumn = "FragmentQuantCorrected",
                            global_qvalue_cutoff = 0.01,
-                           qvalue_cutoff = 0.01, 
+                           qvalue_cutoff = 0.01,
                            pg_qvalue_cutoff = 0.01,
-                           calculateAnomalyScores = FALSE, 
-                           anomalyModelFeatures = c()) {
+                           calculateAnomalyScores = FALSE,
+                           anomalyModelFeatures = c(),
+                           labeledAminoAcids = NULL) {
     dn_input <- getInputFile(msstats_object, "input")
     dn_input <- data.table::as.data.table(dn_input)
-    
+
     # Process quantification columns
     quantificationColumn <- .cleanDIANNProcessQuantificationColumns(dn_input, quantificationColumn)
-    
+
     # Add missing columns
     dn_input <- .cleanDIANNAddMissingColumns(dn_input)
-
+
+    has_channel <- !is.null(labeledAminoAcids) && "Channel" %in% colnames(dn_input)
+
     # Select required columns
     dn_input <- .cleanDIANNSelectRequiredColumns(dn_input, quantificationColumn, MBR,
                                                  calculateAnomalyScores,
-                                                 anomalyModelFeatures)
-
+                                                 anomalyModelFeatures,
+                                                 has_channel = has_channel)
+
     # Split concatenated values
     dn_input <- .cleanDIANNSplitConcatenatedValues(dn_input, quantificationColumn)
-    
+
     # Process fragment information
     dn_input <- .cleanDIANNProcessFragmentInfo(dn_input, quantificationColumn)
-    
+
     # Clean and filter data
-    dn_input <- .cleanDIANNCleanAndFilterData(dn_input, MBR, quantificationColumn, 
+    dn_input <- .cleanDIANNCleanAndFilterData(dn_input, MBR, quantificationColumn,
                                               global_qvalue_cutoff,
-                                              qvalue_cutoff, 
+                                              qvalue_cutoff,
                                               pg_qvalue_cutoff)
-    
+
     # Rename columns
     dn_input <- .cleanDIANNRenameColumns(dn_input, quantificationColumn)
-
+
+    # Assign IsotopeLabelType for protein turnover
+    dn_input <- .assignDIANNIsotopeLabelType(dn_input, labeledAminoAcids, has_channel)
+
     .logSuccess("DIANN", "clean")
     dn_input
 }
@@ -84,17 +91,22 @@
 #' @noRd
 .cleanDIANNSelectRequiredColumns <- function(dn_input, quantificationColumn, MBR,
                                              calculateAnomalyScores,
-                                             anomalyModelFeatures) {
-    base_cols <- c('ProteinNames', 'StrippedSequence', 'ModifiedSequence', 
-                   'PrecursorCharge', quantificationColumn, 'QValue', 
+                                             anomalyModelFeatures,
+                                             has_channel = FALSE) {
+    base_cols <- c('ProteinNames', 'StrippedSequence', 'ModifiedSequence',
+                   'PrecursorCharge', quantificationColumn, 'QValue',
                    'PrecursorMz', 'FragmentInfo', 'Run')
-
+
+    if (has_channel) {
+        base_cols <- c(base_cols, 'Channel')
+    }
+
     mbr_cols <- if (MBR) {
         c('LibQValue', 'LibPGQValue')
     } else {
         c('GlobalQValue', 'GlobalPGQValue')
     }
-    
+
     qual_cols <- if (calculateAnomalyScores) {
         anomalyModelFeatures
     } else {
@@ -207,6 +219,70 @@
     return(dn_input)
 }
 
+#' Assign IsotopeLabelType for DIANN protein turnover workflows.
+#'
+#' Dispatches to one of two classification paths depending on \code{has_channel}:
+#'
+#' \strong{Channel-based path} (\code{has_channel = TRUE}): \code{Channel}
+#' values are mapped directly to \code{IsotopeLabelType} (\code{"H"} →
+#' \code{"H"}, \code{"L"} → \code{"L"}, anything else → \code{NA}), and the
+#' \code{Channel} column is then dropped.  \code{labeledAminoAcids} acts solely
+#' as the opt-in flag that enables this path; the amino acid codes are
+#' \strong{not} used to validate or filter \code{ModifiedSequence}.
+#'
+#' \strong{ModifiedSequence-parsing path} (\code{has_channel = FALSE}):
+#' \code{PeptideSequence} (the retained \code{ModifiedSequence}) is matched
+#' against SILAC suffixes of the form \code{(SILAC-<AA>-H)} or
+#' \code{(SILAC-<AA>-L)}, where \code{<AA>} is any code in
+#' \code{labeledAminoAcids}.  Sequences carrying neither suffix receive
+#' \code{IsotopeLabelType = NA}.  The SILAC suffix is stripped from
+#' \code{PeptideSequence} after classification.
+#'
+#' @param dn_input \code{data.table} after column renaming.
+#' @param labeledAminoAcids Character vector of single-letter amino acid codes
+#'   (e.g. \code{c("K")} or \code{c("K", "R")}), or \code{NULL} to skip
+#'   protein-turnover mode entirely (backwards-compatible default).
+#' @param has_channel Logical; \code{TRUE} when the raw input contained a
+#'   \code{Channel} column that was retained through
+#'   \code{.cleanDIANNSelectRequiredColumns}.
+#' @return \code{dn_input} with \code{IsotopeLabelType} column added (and
+#'   \code{Channel} removed when \code{has_channel} is \code{TRUE}).
+#' @keywords internal
+#' @noRd
+.assignDIANNIsotopeLabelType <- function(dn_input, labeledAminoAcids, has_channel) {
+    IsotopeLabelType = Channel = PeptideSequence = NULL
+
+    if (is.null(labeledAminoAcids)) {
+        return(dn_input)
+    }
+
+    if (has_channel) {
+        dn_input[, IsotopeLabelType := data.table::fcase(
+            Channel == "H", "H",
+            Channel == "L", "L",
+            default = NA_character_
+        )]
+        dn_input[, Channel := NULL]
+    } else {
+        aa_pattern <- paste0(labeledAminoAcids, collapse = "|")
+        heavy_regex <- paste0("\\(SILAC-(?:", aa_pattern, ")-H\\)")
+        light_regex <- paste0("\\(SILAC-(?:", aa_pattern, ")-L\\)")
+        strip_regex <- paste0("\\(SILAC-(?:", aa_pattern, ")-[HL]\\)")
+
+        dn_input <- .classifyIsotopeLabelType(dn_input, heavy_regex, light_regex)
+        dn_input[, PeptideSequence := gsub(strip_regex, "", PeptideSequence, perl = TRUE)]
+    }
+
+    if (all(is.na(dn_input[["IsotopeLabelType"]]))) {
+        warning("labeledAminoAcids was provided but no rows were classified as H or L. ",
+                "Check that the input contains either a Channel column with H/L values ",
+                "or ModifiedSequence entries with (SILAC-<AA>-H)/(SILAC-<AA>-L) suffixes.")
+    }
+
+    dn_input
+}
+
+
 #' Rename columns to standardized names
 #' @param dn_input data.table input
 #' @param quantificationColumn quantification column name

diff --git a/R/clean_Spectronaut.R b/R/clean_Spectronaut.R
@@ -183,32 +183,25 @@
 #' @keywords internal
 #' @noRd
 .assignSpectronautIsotopeLabelType = function(spec_input, heavyLabels) {
-    IsotopeLabelType = PeptideSequence = StrippedSequence = NULL
+    PeptideSequence = NULL
     if (is.null(heavyLabels)) {
         return(spec_input)
     }
-    
+
     bare_amino_acids = sub("\\[.*\\]", "", heavyLabels)
-    bare_amino_acids_pattern = paste0(bare_amino_acids, collapse = "|")
-    heavy_pattern = paste0(heavyLabels, collapse = "|")
-    heavy_brackets_escaped_pattern = paste(
-        gsub("([\\[\\]])", "\\\\\\1", heavy_pattern, perl = TRUE),
+    labeled_aa_regex = paste0(bare_amino_acids, collapse = "|")
+    heavy_regex = paste(
+        gsub("([\\[\\]])", "\\\\\\1", heavyLabels, perl = TRUE),
         collapse = "|"
     )
-
-    spec_input[, StrippedSequence := gsub("\\[.*?\\]", "", PeptideSequence)]
-
-    spec_input[, IsotopeLabelType := data.table::fcase(
-        grepl(heavy_brackets_escaped_pattern, PeptideSequence, perl = TRUE), "H",
-        grepl(bare_amino_acids_pattern, StrippedSequence, perl = TRUE), "L",
-        default = NA_character_
-    )]
-
-    spec_input[, StrippedSequence := NULL]
-
+
+    spec_input = .classifyIsotopeLabelType(spec_input, heavy_regex,
+                                            labeled_aa_regex = labeled_aa_regex)
+
     for (i in seq_along(heavyLabels)) {
         escaped = gsub("([\\[\\]])", "\\\\\\1", heavyLabels[i], perl = TRUE)
-        spec_input[, PeptideSequence := gsub(escaped, bare_amino_acids[i], PeptideSequence, perl = TRUE)]
+        spec_input[, PeptideSequence := gsub(escaped, bare_amino_acids[i],
+                                             PeptideSequence, perl = TRUE)]
     }
     spec_input
 }
diff --git a/R/converters_DIANNtoMSstatsFormat.R b/R/converters_DIANNtoMSstatsFormat.R
@@ -19,8 +19,33 @@
 #' @param removeFewMeasurements should proteins with few measurements be removed
 #' @param removeOxidationMpeptides should peptides with oxidation be removed
 #' @param removeProtein_with1Feature should proteins with a single feature be removed
+#' @param labeledAminoAcids Character vector of single-letter amino acid codes
+#' that carry the SILAC label in protein turnover experiments, e.g.
+#' \code{c("K")} or \code{c("K", "R")}.  Supplying this vector opts in to
+#' protein-turnover mode; the exact amino acids determine behaviour only in the
+#' \code{ModifiedSequence}-parsing path described below.
+#'
+#' \strong{Channel-based path} (DIA-NN 2.x exports that include a
+#' \code{Channel} column): when \code{labeledAminoAcids} is non-\code{NULL}
+#' \emph{and} the input contains a \code{Channel} column, \code{Channel} values
+#' are mapped directly to \code{IsotopeLabelType} (\code{"H"} → \code{"H"},
+#' \code{"L"} → \code{"L"}, anything else → \code{NA}).  The amino acid codes
+#' in \code{labeledAminoAcids} are \strong{not} used to validate or filter
+#' \code{ModifiedSequence} in this path.
+#'
+#' \strong{ModifiedSequence-parsing path} (DIA-NN 1.x exports without a
+#' \code{Channel} column): when \code{labeledAminoAcids} is non-\code{NULL}
+#' and no \code{Channel} column is present, each \code{ModifiedSequence} is
+#' inspected for SILAC suffixes of the form \code{(SILAC-<AA>-H)} or
+#' \code{(SILAC-<AA>-L)}, where \code{<AA>} is one of the supplied amino acid
+#' codes.  Matching sequences are classified as \code{"H"} or \code{"L"};
+#' sequences carrying neither suffix receive \code{IsotopeLabelType = NA}.
+#' The SILAC suffix is then stripped from \code{PeptideSequence}.
+#'
+#' When \code{NULL} (default), protein-turnover mode is disabled and all
+#' peptides receive \code{IsotopeLabelType = "Light"}.
 #' @param quantificationColumn Use 'FragmentQuantCorrected'(default) column for quantified intensities for DIANN 1.8.x.
-#' Use 'FragmentQuantRaw' for quantified intensities for DIANN 1.9.x. 
+#' Use 'FragmentQuantRaw' for quantified intensities for DIANN 1.9.x.
 #' Use 'auto' for quantified intensities for DIANN 2.x where each fragment intensity is a separate column, e.g. Fr0Quantity.
 #' @param calculateAnomalyScores Default is FALSE. If TRUE, will run anomaly detection model and calculate anomaly scores for each feature. Used downstream to weigh measurements in differential analysis.
 #' @param anomalyModelFeatures character vector of quality metric column names to be used as features in the anomaly detection model. List must not be empty if calculateAnomalyScores=TRUE.
@@ -63,19 +88,20 @@
 DIANNtoMSstatsFormat = function(
     input, annotation = NULL,
     global_qvalue_cutoff = 0.01,
-    qvalue_cutoff = 0.01, 
+    qvalue_cutoff = 0.01,
     pg_qvalue_cutoff = 0.01,
-    useUniquePeptide = TRUE, 
+    useUniquePeptide = TRUE,
     removeFewMeasurements = TRUE,
-    removeOxidationMpeptides = TRUE, 
+    removeOxidationMpeptides = TRUE,
     removeProtein_with1Feature = TRUE,
-    MBR = TRUE, 
+    MBR = TRUE,
+    labeledAminoAcids = NULL,
     quantificationColumn = "FragmentQuantCorrected",
     calculateAnomalyScores=FALSE, anomalyModelFeatures=c(),
     anomalyModelFeatureTemporal=c(), removeMissingFeatures=.5,
     anomalyModelFeatureCount=100,
-    runOrder=NULL, n_trees=100, max_depth="auto", numberOfCores=1, 
-    use_log_file = TRUE, append = FALSE, 
+    runOrder=NULL, n_trees=100, max_depth="auto", numberOfCores=1,
+    use_log_file = TRUE, append = FALSE,
     verbose = TRUE, log_file_path = NULL,
     ...) {
     MSstatsConvert::MSstatsLogsSettings(use_log_file, append, verbose, 
@@ -86,42 +112,46 @@ DIANNtoMSstatsFormat = function(
     input = MSstatsConvert::MSstatsImport(list(input = input),
                                           "MSstats", "DIANN")
 
-    input = MSstatsConvert::MSstatsClean(input, MBR = MBR, 
+    input = MSstatsConvert::MSstatsClean(input, MBR = MBR,
                                          quantificationColumn = quantificationColumn,
                                          global_qvalue_cutoff = global_qvalue_cutoff,
-                                         qvalue_cutoff = qvalue_cutoff, 
+                                         qvalue_cutoff = qvalue_cutoff,
                                          pg_qvalue_cutoff = pg_qvalue_cutoff,
-                                         calculateAnomalyScores = calculateAnomalyScores, 
-                                         anomalyModelFeatures = anomalyModelFeatures)
+                                         calculateAnomalyScores = calculateAnomalyScores,
+                                         anomalyModelFeatures = anomalyModelFeatures,
+                                         labeledAminoAcids = labeledAminoAcids)
     annotation = MSstatsConvert::MSstatsMakeAnnotation(input, annotation)
-    
+
     decoy_filter = list(col_name = "ProteinName",
                         pattern = c("DECOY", "Decoys"),
-                        filter = T, 
+                        filter = T,
                         drop_column = FALSE)
     oxidation_filter = list(col_name = "PeptideSequence",
                             pattern = "\\(UniMod\\:35\\)",
                             filter = removeOxidationMpeptides,
                             drop_column = FALSE)
-    
+
     feature_columns = c("PeptideSequence", "PrecursorCharge",
                         "FragmentIon", "ProductCharge")
-    # browser()
+    preprocess_feature_columns = if ("IsotopeLabelType" %in% colnames(input))
+        c(feature_columns, "IsotopeLabelType") else feature_columns
+    fill_isotope_label_type = if ("IsotopeLabelType" %in% colnames(input))
+        list() else list("IsotopeLabelType" = "Light")
+
     input = MSstatsConvert::MSstatsPreprocess(
-        input, 
-        annotation, 
-        feature_columns,
+        input,
+        annotation,
+        preprocess_feature_columns,
         remove_shared_peptides = useUniquePeptide,
         remove_single_feature_proteins = removeProtein_with1Feature,
         exact_filtering = NULL,
-        pattern_filtering = list(decoy = decoy_filter, 
+        pattern_filtering = list(decoy = decoy_filter,
                                  oxidation = oxidation_filter),
         aggregate_isotopic = FALSE,
         feature_cleaning = list(
             remove_features_with_few_measurements = removeFewMeasurements,
             summarize_multiple_psms = max),
-        columns_to_fill = list(Fraction = 1,
-                               IsotopeLabelType = "Light"),
+        columns_to_fill = c(list(Fraction = 1), fill_isotope_label_type),
         anomaly_metrics = anomalyModelFeatures)
     input[, Intensity := ifelse(Intensity == 0, NA, Intensity)]
     # browser()

diff --git a/R/utils_clean_features.R b/R/utils_clean_features.R
@@ -309,6 +309,60 @@
 }
 
 
+#' Classify IsotopeLabelType from peptide sequence patterns.
+#'
+#' Shared core logic for protein turnover workflows in both Spectronaut and
+#' DIANN converters.  Each peptide is classified as heavy (\code{"H"}), light
+#' (\code{"L"}), or unlabeled (\code{NA}) by matching regex patterns against
+#' the \code{PeptideSequence} column.
+#'
+#' Two modes are supported:
+#' \describe{
+#'   \item{Spectronaut mode}{Pass \code{labeled_aa_regex}.  The sequence is
+#'     first stripped of all bracket modifications; light is inferred when the
+#'     bare labeled amino acid is present but the heavy bracket form is absent.}
+#'   \item{DIANN mode}{Pass \code{light_regex}.  Both heavy and light patterns
+#'     are matched directly against the modified sequence; absence of either
+#'     yields \code{NA}.}
+#' }
+#'
+#' @param dt \code{data.table} with a \code{PeptideSequence} column.
+#' @param heavy_regex Perl-compatible regex matching heavy-labeled sequences.
+#' @param light_regex Perl-compatible regex explicitly matching light-labeled
+#'   sequences.  Required for DIANN mode; must be non-\code{NULL}.
+#' @param labeled_aa_regex Perl-compatible regex for bare labeled amino acids
+#'   applied to the bracket-stripped sequence to detect light peptides.
+#'   Required for Spectronaut mode; must be non-\code{NULL}.
+#'   Exactly one of \code{light_regex} and \code{labeled_aa_regex} must be
+#'   supplied.
+#' @return \code{dt} with \code{IsotopeLabelType} column added or updated.
+#' @keywords internal
+#' @noRd
+.classifyIsotopeLabelType = function(dt, heavy_regex,
+                                      light_regex = NULL,
+                                      labeled_aa_regex = NULL) {
+    IsotopeLabelType = PeptideSequence = StrippedSequence = NULL
+
+    if (!is.null(labeled_aa_regex)) {
+        dt[, StrippedSequence := gsub("\\[.*?\\]", "", PeptideSequence)]
+        dt[, IsotopeLabelType := data.table::fcase(
+            grepl(heavy_regex, PeptideSequence, perl = TRUE), "H",
+            grepl(labeled_aa_regex, StrippedSequence, perl = TRUE), "L",
+            default = NA_character_
+        )]
+        dt[, StrippedSequence := NULL]
+    } else if (!is.null(light_regex)) {
+        dt[, IsotopeLabelType := data.table::fcase(
+            grepl(heavy_regex, PeptideSequence, perl = TRUE), "H",
+            grepl(light_regex, PeptideSequence, perl = TRUE), "L",
+            default = NA_character_
+        )]
+    }
+
+    dt
+}
+
+
 #' Fix invalid intensities: infinite to NA, between 0 and 1 to 0
 #' @param input data.table
 #' @return data.table

diff --git a/inst/tinytest/raw_data/DIANN/PXD055942.parquet b/inst/tinytest/raw_data/DIANN/PXD055942.parquet