Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ Description: MSstats package provide tools for preprocessing, summarization and
processing larger than memory data sets.
License: Artistic-2.0
Encoding: UTF-8
RoxygenNote: 7.3.2
RoxygenNote: 7.3.3
Imports:
arrow,
DBI,
Expand All @@ -23,7 +23,9 @@ Imports:
readr,
sparklyr,
utils
Suggests:
Suggests:
testthat,
mockery,
knitr,
rmarkdown
VignetteBuilder: knitr
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

export(MSstatsAddAnnotationBig)
export(MSstatsPreprocessBig)
export(bigDIANNtoMSstatsFormat)
export(bigFragPipetoMSstatsFormat)
export(bigSpectronauttoMSstatsFormat)
importFrom(MSstats,dataProcess)
Expand Down
290 changes: 290 additions & 0 deletions R/clean_DIANN.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,290 @@
#' Read and clean a large DIANN file in chunks
#'
#' @param input_file Path to the input DIANN file
#' @param output_path Path to the output CSV file
#' @param MBR Boolean, whether MBR was used
#' @param quantificationColumn Name of the column containing intensity values
#' @param global_qvalue_cutoff Global Q-value cutoff
#' @param qvalue_cutoff Q-value cutoff
#' @param pg_qvalue_cutoff Protein group Q-value cutoff
#' @return NULL. Writes to file.
#' @keywords internal
reduceBigDIANN <- function(input_file, output_path, MBR = TRUE,
quantificationColumn = "Fragment.Quant.Corrected",
global_qvalue_cutoff = 0.01,
qvalue_cutoff = 0.01,
pg_qvalue_cutoff = 0.01) {
if (grepl("csv", input_file)) {
delim = ","
} else if (grepl("tsv|xls", input_file)) {
delim = "\t"
} else {
delim <- ";"
}
Comment thread
tonywu1999 marked this conversation as resolved.

diann_chunk <- function(x, pos) cleanDIANNChunk(x,
output_path,
MBR,
quantificationColumn,
pos,
global_qvalue_cutoff,
qvalue_cutoff,
pg_qvalue_cutoff)
readr::read_delim_chunked(input_file,
Comment thread
tonywu1999 marked this conversation as resolved.
readr::DataFrameCallback$new(diann_chunk),
delim = delim,
chunk_size = 1e6)
}

#' Clean a single chunk of DIANN data
#'
#' @param input Data frame chunk
#' @param output_path Path to output file
#' @param MBR Boolean, whether MBR was used
#' @param quantificationColumn Name of intensity column
#' @param pos Chunk position (1 for first chunk, >1 for subsequent)
#' @param global_qvalue_cutoff Global Q-value cutoff
#' @param qvalue_cutoff Q-value cutoff
#' @param pg_qvalue_cutoff Protein group Q-value cutoff
#' @return NULL
#' @keywords internal
cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos,
global_qvalue_cutoff = 0.01,
qvalue_cutoff = 0.01,
pg_qvalue_cutoff = 0.01) {
# 1. Handle "auto" quantification column
processed <- .handleAutoQuantification(input, quantificationColumn)
input <- processed$input
quantificationColumn <- processed$quantificationColumn

# 2. Select required columns
input <- .selectDIANNColumns(input, MBR, quantificationColumn)
input <- .cleanDIANNAddMissingColumns(input)

# 3. Expand concatenated rows
input <- .expandDIANNRows(input, quantificationColumn)

# 4. Process fragment info (extract intensity, charge, ion)
input <- .processDIANNFragmentInfo(input, quantificationColumn)

# 5. Filter invalid fragments
input <- .filterDIANNFragments(input, quantificationColumn)

# 6. Standardize column names
input <- .standardizeDIANNColumns(input, quantificationColumn)

# 7. Filter by Q-values
input <- .filterDIANNByQValues(input, MBR, global_qvalue_cutoff, qvalue_cutoff, pg_qvalue_cutoff)

# 8. Finalize columns (select final set, add IsotopeLabelType)
input <- .finalizeDIANNColumns(input)

# 9. Write to output
.writeDIANNChunk(input, output_path, pos)

NULL
}

#' Handle automatic detection of quantification columns
#'
#' @param input Data frame
#' @param quantificationColumn Name of column or "auto"
#' @return List with input data frame and updated quantification column name
#' @keywords internal
.handleAutoQuantification <- function(input, quantificationColumn) {
if (quantificationColumn == "auto") {
fragment_columns <- grep("^Fr[0-9]+Quantity$", colnames(input), value = TRUE)
if (length(fragment_columns) == 0) {
stop("No fragment quantification columns found. Please check your input.")
}
input <- tidyr::unite(input, "FragmentQuantCorrected", all_of(fragment_columns), sep = ";")
quantificationColumn <- "FragmentQuantCorrected"
}
list(input = input, quantificationColumn = quantificationColumn)
}

#' Select required columns from DIANN output
#'
#' @param input Data frame
#' @param MBR Boolean
#' @param quantificationColumn Name of intensity column
#' @return Data frame with selected columns
#' @keywords internal
.selectDIANNColumns <- function(input, MBR, quantificationColumn) {
base_cols <- c('Protein.Names', 'Stripped.Sequence', 'Modified.Sequence',
'Precursor.Charge', quantificationColumn, 'Q.Value',
'Precursor.Mz', 'Fragment.Info', 'Run')

mbr_cols <- if (MBR) {
c('Lib.Q.Value', 'Lib.PG.Q.Value')
} else {
c('Global.Q.Value', 'Global.PG.Q.Value')
}
Comment thread
tonywu1999 marked this conversation as resolved.

req_cols <- intersect(c(base_cols, mbr_cols), colnames(input))
dplyr::select(input, all_of(req_cols))
}

#' Add missing required columns
#'
#' @param input Data frame
#' @return Data frame with missing columns added
#' @keywords internal
.cleanDIANNAddMissingColumns <- function(input) {
if (!"Precursor.Mz" %in% colnames(input)) {
input <- dplyr::mutate(input, Precursor.Mz = NA)
}
if (!"Fragment.Info" %in% colnames(input)) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

input <- dplyr::mutate(input, Fragment.Info = NA)
}
input
}

#' Expand rows with multiple fragments
#'
#' @param input Data frame
#' @param quantificationColumn Name of intensity column
#' @return Data frame with expanded rows
#' @keywords internal
.expandDIANNRows <- function(input, quantificationColumn) {
split_cols <- intersect(c(quantificationColumn, "Fragment.Info"), colnames(input))
Comment thread
tonywu1999 marked this conversation as resolved.
if (length(split_cols) > 0) {
tidyr::separate_rows(input, all_of(split_cols), sep = ";")
} else {
input
}
}

#' Process fragment information strings
#'
#' @param input Data frame
#' @param quantificationColumn Name of intensity column
#' @return Data frame with FragmentIon and ProductCharge columns
#' @keywords internal
.processDIANNFragmentInfo <- function(input, quantificationColumn) {
# Convert Intensity to Numeric from Char strings
input[[quantificationColumn]] <- as.numeric(input[[quantificationColumn]])

# Generate fragment info if missing
if (all(is.na(input$Fragment.Info))) {
input <- dplyr::group_by(input, Protein.Names, Modified.Sequence, Precursor.Charge, Run)
input <- dplyr::mutate(input, Fragment.Info = paste0("Frag", dplyr::row_number()))
input <- dplyr::ungroup(input)
}

dplyr::mutate(
input,
FragmentIon = sub('\\^\\.\\*', '', .data$Fragment.Info),

# Extract product charge
ProductCharge = dplyr::if_else(
grepl("/", .data$Fragment.Info),
# Extract charge (number right after "/" in string), default to 1 if parsing fails
as.integer(stringr::str_extract(.data$Fragment.Info, "(?<=/)[0-9]+")),
1L,
missing = 1L
)
Comment on lines +179 to +186
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

find . -name "clean_DIANN.R" -type f

Repository: Vitek-Lab/MSstatsBig

Length of output: 81


🏁 Script executed:

sed -n '64,70p' ./R/clean_DIANN.R

Repository: Vitek-Lab/MSstatsBig

Length of output: 348


🏁 Script executed:

python3 << 'EOF'
# Let's verify the behavior of stringr::str_extract and as.integer in R
# by examining what happens when the regex pattern doesn't match

# The behavior should be:
# stringr::str_extract("something/", "(?<=/)[0-9]+") -> NA (no digits after /)
# as.integer(NA) -> NA (not 1L)

# This is a R behavior question, let me create a test to understand it better

print("Testing R string extraction and type conversion behavior:")
print("Case 1: Fragment.Info with / but no digits after it")
print("  stringr::str_extract returns: NA (pattern doesn't match)")
print("  as.integer(NA) returns: NA (not 1L)")
print("")
print("Case 2: Fragment.Info with / and digits after it")
print("  stringr::str_extract returns: the digits")
print("  as.integer(digits) returns: integer value")
print("")
print("Case 3: Fragment.Info without /")
print("  Current code returns: 1L (via the else branch of if_else)")
print("")
print("So the bug is in Case 1 - when / is present but no digits follow")
print("The function would return NA instead of 1L")

EOF

Repository: Vitek-Lab/MSstatsBig

Length of output: 150


🏁 Script executed:

grep -r "ProductCharge" ./R --include="*.R" -B 2 -A 2

Repository: Vitek-Lab/MSstatsBig

Length of output: 4903


🏁 Script executed:

sed -n '50,80p' ./R/clean_DIANN.R

Repository: Vitek-Lab/MSstatsBig

Length of output: 1088


Ensure ProductCharge defaults to 1 when parsing fails.

The current code returns NA when a / is present in Fragment.Info but no digits follow (e.g., "y1/"). The regex pattern "(?<=/)[0-9]+" fails to match, causing stringr::str_extract() to return NA, and subsequently as.integer(NA) returns NA instead of the intended default of 1L. Use dplyr::coalesce() to handle this edge case:

🔧 Suggested change
-    ProductCharge = dplyr::if_else(
-      grepl("/", .data$Fragment.Info),
-      # Extract charge (number right after "/" in string), default to 1 if parsing fails
-      as.integer(stringr::str_extract(.data$Fragment.Info, "(?<=/)[0-9]+")),
-      1L
-    )
+    ProductCharge = dplyr::if_else(
+      grepl("/", .data$Fragment.Info),
+      dplyr::coalesce(
+        as.integer(stringr::str_extract(.data$Fragment.Info, "(?<=/)[0-9]+")),
+        1L
+      ),
+      1L
+    )
🤖 Prompt for AI Agents
In `@R/clean_DIANN.R` around lines 64 - 70, ProductCharge currently becomes NA
when Fragment.Info contains "/" but no digits follow because
as.integer(stringr::str_extract(...)) yields NA; modify the ProductCharge
computation (the dplyr::if_else branch that uses stringr::str_extract on
.data$Fragment.Info) to wrap the extracted value with dplyr::coalesce (or
equivalent) so that if str_extract returns NA it falls back to "1" before
coercion, ensuring the final value is 1L when parsing fails.

)
}

#' Filter invalid fragments
#'
#' @param input Data frame
#' @param quantificationColumn Name of intensity column
#' @return Filtered data frame
#' @keywords internal
.filterDIANNFragments <- function(input, quantificationColumn) {
dplyr::filter(
input,
(!grepl("NH3|H2O", .data$FragmentIon) | is.na(.data$FragmentIon)) & !is.na(.data[[quantificationColumn]])
)
}

#' Standardize column names to MSstats format
#'
#' @param input Data frame
#' @param quantificationColumn Name of intensity column
#' @return Data frame with renamed columns
#' @keywords internal
.standardizeDIANNColumns <- function(input, quantificationColumn) {
input <- dplyr::rename_with(input, .fn = function(x) gsub("\\.", "", x))

# Standardize column names
clean_quant_col <- gsub("\\.", "", quantificationColumn)
old_names <- c('ProteinNames', 'StrippedSequence', 'ModifiedSequence',
'PrecursorCharge', clean_quant_col, 'QValue',
'PrecursorMz', 'FragmentIon', 'Run', 'ProductCharge')
new_names <- c('ProteinName', 'PeptideSequence', 'PeptideModifiedSequence',
'PrecursorCharge', 'Intensity', 'DetectionQValue',
'PrecursorMz', 'FragmentIon', 'Run', 'ProductCharge')

current_names <- colnames(input)
names_to_rename <- intersect(current_names, old_names)

# Create a named vector for renaming in the format c(new_name = old_name)
new_names_subset <- new_names[match(names_to_rename, old_names)]
rename_map <- setNames(names_to_rename, new_names_subset)
rename_map <- rename_map[!is.na(names(rename_map))]

dplyr::rename(input, any_of(rename_map))
}

#' Filter data by Q-values
#'
#' @param input Data frame
#' @param MBR Boolean
#' @param global_qvalue_cutoff Numeric
#' @param qvalue_cutoff Numeric
#' @param pg_qvalue_cutoff Numeric
#' @return Filtered data frame
#' @keywords internal
.filterDIANNByQValues <- function(input, MBR, global_qvalue_cutoff, qvalue_cutoff, pg_qvalue_cutoff) {
input <- dplyr::filter(input, DetectionQValue < global_qvalue_cutoff)

if (MBR) {
dplyr::filter(input, LibPGQValue < pg_qvalue_cutoff & LibQValue < qvalue_cutoff)
} else {
dplyr::filter(input, GlobalPGQValue < pg_qvalue_cutoff & GlobalQValue < qvalue_cutoff)
}
}
Comment on lines +241 to +249
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Missing Q-value columns will cause runtime errors.

When MBR=TRUE, the function expects LibQValue and LibPGQValue columns (lines 245). When MBR=FALSE, it expects GlobalQValue and GlobalPGQValue (line 247). If these columns are absent from the input (e.g., older DIANN versions or different output settings), the function will fail with an unclear error.

🛡️ Suggested defensive check
 .filterDIANNByQValues <- function(input, MBR, global_qvalue_cutoff, qvalue_cutoff, pg_qvalue_cutoff) {
   input <- dplyr::filter(input, DetectionQValue < global_qvalue_cutoff)
   
   if (MBR) {
+    if (!all(c("LibPGQValue", "LibQValue") %in% colnames(input))) {
+      warning("MBR Q-value columns not found, skipping MBR filtering")
+      return(input)
+    }
     dplyr::filter(input, LibPGQValue < pg_qvalue_cutoff & LibQValue < qvalue_cutoff)
   } else {
+    if (!all(c("GlobalPGQValue", "GlobalQValue") %in% colnames(input))) {
+      warning("Global Q-value columns not found, skipping global filtering")
+      return(input)
+    }
     dplyr::filter(input, GlobalPGQValue < pg_qvalue_cutoff & GlobalQValue < qvalue_cutoff)
   }
 }
🤖 Prompt for AI Agents
In `@R/clean_DIANN.R` around lines 241 - 249, The .filterDIANNByQValues function
fails if expected q-value columns are missing; update it to defensive-check for
required columns before filtering: inside .filterDIANNByQValues, detect whether
MBR is TRUE and then verify presence of "LibQValue" and "LibPGQValue", or if MBR
is FALSE verify "GlobalQValue" and "GlobalPGQValue"; if missing, either create
those columns filled with NA (so downstream filtering behaves predictably) or
return a clear stop()/warning indicating the missing column names and skip the
MBR-specific filter, then apply the DetectionQValue filter as currently done.
Use the function name .filterDIANNByQValues and the parameter MBR and the column
names LibQValue, LibPGQValue, GlobalQValue, GlobalPGQValue in your checks to
locate where to add this logic.


#' Finalize columns for output
#'
#' @param input Data frame
#' @return Data frame with final columns
#' @keywords internal
.finalizeDIANNColumns <- function(input) {
# Final column selection for MSstats format
msstats_cols <- c("ProteinName", "PeptideSequence", "PeptideModifiedSequence", "PrecursorCharge",
"FragmentIon", "ProductCharge", "Run", "Intensity")


# Add annotation columns if they exist
if ("Condition" %in% colnames(input)) msstats_cols <- c(msstats_cols, "Condition")
if ("BioReplicate" %in% colnames(input)) msstats_cols <- c(msstats_cols, "BioReplicate")
Comment thread
tonywu1999 marked this conversation as resolved.

# Add IsotopeLabelType, assuming Light for DIANN
input$IsotopeLabelType <- "L"
msstats_cols <- c(msstats_cols, "IsotopeLabelType")

final_cols <- intersect(msstats_cols, colnames(input))
dplyr::select(input, all_of(final_cols))
}

#' Write chunk to file
#'
#' @param input Data frame
#' @param output_path Path to output file
#' @param pos Chunk position
#' @return NULL
#' @keywords internal
.writeDIANNChunk <- function(input, output_path, pos) {
# Write to file
if (!is.null(pos)) {
if (pos == 1) {
readr::write_csv(input, file = output_path, append = FALSE)
} else {
readr::write_csv(input, file = output_path, append = TRUE)
}
}
}
45 changes: 44 additions & 1 deletion R/converters.R
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,50 @@ bigSpectronauttoMSstatsFormat <- function(input_file, output_file_name,
}


#' Convert out-of-memory DIANN files to MSstats format.
#'
#' @inheritParams MSstatsPreprocessBig
#' @param MBR True if analysis was done with match between runs.
#' @param quantificationColumn Use 'Fragment.Quant.Corrected'(default) column for quantified intensities for DIANN 1.8.x.
#' Use 'FragmentQuantRaw' for quantified intensities for DIANN 1.9.x.
#'
#' @export
#'
#' @return either arrow object or sparklyr table that can be optionally collected
#' into memory by using dplyr::collect function.
#'
bigDIANNtoMSstatsFormat <- function(input_file,
output_file_name,
backend,
MBR = TRUE,
quantificationColumn = "Fragment.Quant.Corrected",
max_feature_count = 100,
filter_unique_peptides = FALSE,
aggregate_psms = FALSE,
filter_few_obs = FALSE,
remove_annotation = FALSE,
calculateAnomalyScores=FALSE,
anomalyModelFeatures=c(),
connection = NULL) {

# Reduce and clean the DIANN report file in chunks
reduceBigDIANN(input_file,
paste0("reduce_output_", output_file_name),
MBR,
quantificationColumn)

# Preprocess the cleaned data (feature selection, etc.)
msstats_data <- MSstatsPreprocessBig(
paste0("reduce_output_", output_file_name),
output_file_name, backend, max_feature_count,
Comment thread
tonywu1999 marked this conversation as resolved.
filter_unique_peptides, aggregate_psms, filter_few_obs,
remove_annotation, calculateAnomalyScores,
anomalyModelFeatures, connection)

return(msstats_data)
}


#' Merge annotation to output of MSstatsPreprocessBig
#'
#' @param input output of MSstatsPreprocessBig
Expand Down Expand Up @@ -185,4 +229,3 @@ bigSpectronauttoMSstatsFormat <- function(input_file, output_file_name,
MSstatsAddAnnotationBig <- function(input, annotation) {
dplyr::inner_join(input, annotation, by = "Run")
}

Loading