-
Notifications
You must be signed in to change notification settings - Fork 1
Implementing and Testing DIANN converter for MSstatsBIG. #9
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
1dfd8f0
005ced9
d882091
6728b7c
a041a72
43e68b2
d3772c9
bfbcf7c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,290 @@ | ||
| #' Read and clean a large DIANN file in chunks | ||
| #' | ||
| #' @param input_file Path to the input DIANN file | ||
| #' @param output_path Path to the output CSV file | ||
| #' @param MBR Boolean, whether MBR was used | ||
| #' @param quantificationColumn Name of the column containing intensity values | ||
| #' @param global_qvalue_cutoff Global Q-value cutoff | ||
| #' @param qvalue_cutoff Q-value cutoff | ||
| #' @param pg_qvalue_cutoff Protein group Q-value cutoff | ||
| #' @return NULL. Writes to file. | ||
| #' @keywords internal | ||
| reduceBigDIANN <- function(input_file, output_path, MBR = TRUE, | ||
| quantificationColumn = "Fragment.Quant.Corrected", | ||
| global_qvalue_cutoff = 0.01, | ||
| qvalue_cutoff = 0.01, | ||
| pg_qvalue_cutoff = 0.01) { | ||
| if (grepl("csv", input_file)) { | ||
| delim = "," | ||
| } else if (grepl("tsv|xls", input_file)) { | ||
| delim = "\t" | ||
| } else { | ||
| delim <- ";" | ||
| } | ||
|
|
||
| diann_chunk <- function(x, pos) cleanDIANNChunk(x, | ||
| output_path, | ||
| MBR, | ||
| quantificationColumn, | ||
| pos, | ||
| global_qvalue_cutoff, | ||
| qvalue_cutoff, | ||
| pg_qvalue_cutoff) | ||
| readr::read_delim_chunked(input_file, | ||
|
tonywu1999 marked this conversation as resolved.
|
||
| readr::DataFrameCallback$new(diann_chunk), | ||
| delim = delim, | ||
| chunk_size = 1e6) | ||
| } | ||
|
|
||
| #' Clean a single chunk of DIANN data | ||
| #' | ||
| #' @param input Data frame chunk | ||
| #' @param output_path Path to output file | ||
| #' @param MBR Boolean, whether MBR was used | ||
| #' @param quantificationColumn Name of intensity column | ||
| #' @param pos Chunk position (1 for first chunk, >1 for subsequent) | ||
| #' @param global_qvalue_cutoff Global Q-value cutoff | ||
| #' @param qvalue_cutoff Q-value cutoff | ||
| #' @param pg_qvalue_cutoff Protein group Q-value cutoff | ||
| #' @return NULL | ||
| #' @keywords internal | ||
| cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos, | ||
| global_qvalue_cutoff = 0.01, | ||
| qvalue_cutoff = 0.01, | ||
| pg_qvalue_cutoff = 0.01) { | ||
| # 1. Handle "auto" quantification column | ||
| processed <- .handleAutoQuantification(input, quantificationColumn) | ||
| input <- processed$input | ||
| quantificationColumn <- processed$quantificationColumn | ||
|
|
||
| # 2. Select required columns | ||
| input <- .selectDIANNColumns(input, MBR, quantificationColumn) | ||
| input <- .cleanDIANNAddMissingColumns(input) | ||
|
|
||
| # 3. Expand concatenated rows | ||
| input <- .expandDIANNRows(input, quantificationColumn) | ||
|
|
||
| # 4. Process fragment info (extract intensity, charge, ion) | ||
| input <- .processDIANNFragmentInfo(input, quantificationColumn) | ||
|
|
||
| # 5. Filter invalid fragments | ||
| input <- .filterDIANNFragments(input, quantificationColumn) | ||
|
|
||
| # 6. Standardize column names | ||
| input <- .standardizeDIANNColumns(input, quantificationColumn) | ||
|
|
||
| # 7. Filter by Q-values | ||
| input <- .filterDIANNByQValues(input, MBR, global_qvalue_cutoff, qvalue_cutoff, pg_qvalue_cutoff) | ||
|
|
||
| # 8. Finalize columns (select final set, add IsotopeLabelType) | ||
| input <- .finalizeDIANNColumns(input) | ||
|
|
||
| # 9. Write to output | ||
| .writeDIANNChunk(input, output_path, pos) | ||
|
|
||
| NULL | ||
| } | ||
|
|
||
| #' Handle automatic detection of quantification columns | ||
| #' | ||
| #' @param input Data frame | ||
| #' @param quantificationColumn Name of column or "auto" | ||
| #' @return List with input data frame and updated quantification column name | ||
| #' @keywords internal | ||
| .handleAutoQuantification <- function(input, quantificationColumn) { | ||
| if (quantificationColumn == "auto") { | ||
| fragment_columns <- grep("^Fr[0-9]+Quantity$", colnames(input), value = TRUE) | ||
| if (length(fragment_columns) == 0) { | ||
| stop("No fragment quantification columns found. Please check your input.") | ||
| } | ||
| input <- tidyr::unite(input, "FragmentQuantCorrected", all_of(fragment_columns), sep = ";") | ||
| quantificationColumn <- "FragmentQuantCorrected" | ||
| } | ||
| list(input = input, quantificationColumn = quantificationColumn) | ||
| } | ||
|
|
||
| #' Select required columns from DIANN output | ||
| #' | ||
| #' @param input Data frame | ||
| #' @param MBR Boolean | ||
| #' @param quantificationColumn Name of intensity column | ||
| #' @return Data frame with selected columns | ||
| #' @keywords internal | ||
| .selectDIANNColumns <- function(input, MBR, quantificationColumn) { | ||
| base_cols <- c('Protein.Names', 'Stripped.Sequence', 'Modified.Sequence', | ||
| 'Precursor.Charge', quantificationColumn, 'Q.Value', | ||
| 'Precursor.Mz', 'Fragment.Info', 'Run') | ||
|
|
||
| mbr_cols <- if (MBR) { | ||
| c('Lib.Q.Value', 'Lib.PG.Q.Value') | ||
| } else { | ||
| c('Global.Q.Value', 'Global.PG.Q.Value') | ||
| } | ||
|
tonywu1999 marked this conversation as resolved.
|
||
|
|
||
| req_cols <- intersect(c(base_cols, mbr_cols), colnames(input)) | ||
| dplyr::select(input, all_of(req_cols)) | ||
| } | ||
|
|
||
| #' Add missing required columns | ||
| #' | ||
| #' @param input Data frame | ||
| #' @return Data frame with missing columns added | ||
| #' @keywords internal | ||
| .cleanDIANNAddMissingColumns <- function(input) { | ||
| if (!"Precursor.Mz" %in% colnames(input)) { | ||
| input <- dplyr::mutate(input, Precursor.Mz = NA) | ||
| } | ||
| if (!"Fragment.Info" %in% colnames(input)) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. https://github.com/Vitek-Lab/MSstatsConvert/blob/devel/R/clean_DIANN.R#L115-L118 : Instead, it should be like Frag1, Frag2, etc. |
||
| input <- dplyr::mutate(input, Fragment.Info = NA) | ||
| } | ||
| input | ||
| } | ||
|
|
||
| #' Expand rows with multiple fragments | ||
| #' | ||
| #' @param input Data frame | ||
| #' @param quantificationColumn Name of intensity column | ||
| #' @return Data frame with expanded rows | ||
| #' @keywords internal | ||
| .expandDIANNRows <- function(input, quantificationColumn) { | ||
| split_cols <- intersect(c(quantificationColumn, "Fragment.Info"), colnames(input)) | ||
|
tonywu1999 marked this conversation as resolved.
|
||
| if (length(split_cols) > 0) { | ||
| tidyr::separate_rows(input, all_of(split_cols), sep = ";") | ||
| } else { | ||
| input | ||
| } | ||
| } | ||
|
|
||
| #' Process fragment information strings | ||
| #' | ||
| #' @param input Data frame | ||
| #' @param quantificationColumn Name of intensity column | ||
| #' @return Data frame with FragmentIon and ProductCharge columns | ||
| #' @keywords internal | ||
| .processDIANNFragmentInfo <- function(input, quantificationColumn) { | ||
| # Convert Intensity to Numeric from Char strings | ||
| input[[quantificationColumn]] <- as.numeric(input[[quantificationColumn]]) | ||
|
|
||
| # Generate fragment info if missing | ||
| if (all(is.na(input$Fragment.Info))) { | ||
| input <- dplyr::group_by(input, Protein.Names, Modified.Sequence, Precursor.Charge, Run) | ||
| input <- dplyr::mutate(input, Fragment.Info = paste0("Frag", dplyr::row_number())) | ||
| input <- dplyr::ungroup(input) | ||
| } | ||
|
|
||
| dplyr::mutate( | ||
| input, | ||
| FragmentIon = sub('\\^\\.\\*', '', .data$Fragment.Info), | ||
|
|
||
| # Extract product charge | ||
| ProductCharge = dplyr::if_else( | ||
| grepl("/", .data$Fragment.Info), | ||
| # Extract charge (number right after "/" in string), default to 1 if parsing fails | ||
| as.integer(stringr::str_extract(.data$Fragment.Info, "(?<=/)[0-9]+")), | ||
| 1L, | ||
| missing = 1L | ||
| ) | ||
|
Comment on lines
+179
to
+186
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain🏁 Script executed: find . -name "clean_DIANN.R" -type fRepository: Vitek-Lab/MSstatsBig Length of output: 81 🏁 Script executed: sed -n '64,70p' ./R/clean_DIANN.RRepository: Vitek-Lab/MSstatsBig Length of output: 348 🏁 Script executed: python3 << 'EOF'
# Let's verify the behavior of stringr::str_extract and as.integer in R
# by examining what happens when the regex pattern doesn't match
# The behavior should be:
# stringr::str_extract("something/", "(?<=/)[0-9]+") -> NA (no digits after /)
# as.integer(NA) -> NA (not 1L)
# This is a R behavior question, let me create a test to understand it better
print("Testing R string extraction and type conversion behavior:")
print("Case 1: Fragment.Info with / but no digits after it")
print(" stringr::str_extract returns: NA (pattern doesn't match)")
print(" as.integer(NA) returns: NA (not 1L)")
print("")
print("Case 2: Fragment.Info with / and digits after it")
print(" stringr::str_extract returns: the digits")
print(" as.integer(digits) returns: integer value")
print("")
print("Case 3: Fragment.Info without /")
print(" Current code returns: 1L (via the else branch of if_else)")
print("")
print("So the bug is in Case 1 - when / is present but no digits follow")
print("The function would return NA instead of 1L")
EOFRepository: Vitek-Lab/MSstatsBig Length of output: 150 🏁 Script executed: grep -r "ProductCharge" ./R --include="*.R" -B 2 -A 2Repository: Vitek-Lab/MSstatsBig Length of output: 4903 🏁 Script executed: sed -n '50,80p' ./R/clean_DIANN.RRepository: Vitek-Lab/MSstatsBig Length of output: 1088 Ensure ProductCharge defaults to 1 when parsing fails. The current code returns 🔧 Suggested change- ProductCharge = dplyr::if_else(
- grepl("/", .data$Fragment.Info),
- # Extract charge (number right after "/" in string), default to 1 if parsing fails
- as.integer(stringr::str_extract(.data$Fragment.Info, "(?<=/)[0-9]+")),
- 1L
- )
+ ProductCharge = dplyr::if_else(
+ grepl("/", .data$Fragment.Info),
+ dplyr::coalesce(
+ as.integer(stringr::str_extract(.data$Fragment.Info, "(?<=/)[0-9]+")),
+ 1L
+ ),
+ 1L
+ )🤖 Prompt for AI Agents |
||
| ) | ||
| } | ||
|
|
||
| #' Filter invalid fragments | ||
| #' | ||
| #' @param input Data frame | ||
| #' @param quantificationColumn Name of intensity column | ||
| #' @return Filtered data frame | ||
| #' @keywords internal | ||
| .filterDIANNFragments <- function(input, quantificationColumn) { | ||
| dplyr::filter( | ||
| input, | ||
| (!grepl("NH3|H2O", .data$FragmentIon) | is.na(.data$FragmentIon)) & !is.na(.data[[quantificationColumn]]) | ||
| ) | ||
| } | ||
|
|
||
| #' Standardize column names to MSstats format | ||
| #' | ||
| #' @param input Data frame | ||
| #' @param quantificationColumn Name of intensity column | ||
| #' @return Data frame with renamed columns | ||
| #' @keywords internal | ||
| .standardizeDIANNColumns <- function(input, quantificationColumn) { | ||
| input <- dplyr::rename_with(input, .fn = function(x) gsub("\\.", "", x)) | ||
|
|
||
| # Standardize column names | ||
| clean_quant_col <- gsub("\\.", "", quantificationColumn) | ||
| old_names <- c('ProteinNames', 'StrippedSequence', 'ModifiedSequence', | ||
| 'PrecursorCharge', clean_quant_col, 'QValue', | ||
| 'PrecursorMz', 'FragmentIon', 'Run', 'ProductCharge') | ||
| new_names <- c('ProteinName', 'PeptideSequence', 'PeptideModifiedSequence', | ||
| 'PrecursorCharge', 'Intensity', 'DetectionQValue', | ||
| 'PrecursorMz', 'FragmentIon', 'Run', 'ProductCharge') | ||
|
|
||
| current_names <- colnames(input) | ||
| names_to_rename <- intersect(current_names, old_names) | ||
|
|
||
| # Create a named vector for renaming in the format c(new_name = old_name) | ||
| new_names_subset <- new_names[match(names_to_rename, old_names)] | ||
| rename_map <- setNames(names_to_rename, new_names_subset) | ||
| rename_map <- rename_map[!is.na(names(rename_map))] | ||
|
|
||
| dplyr::rename(input, any_of(rename_map)) | ||
| } | ||
|
|
||
| #' Filter data by Q-values | ||
| #' | ||
| #' @param input Data frame | ||
| #' @param MBR Boolean | ||
| #' @param global_qvalue_cutoff Numeric | ||
| #' @param qvalue_cutoff Numeric | ||
| #' @param pg_qvalue_cutoff Numeric | ||
| #' @return Filtered data frame | ||
| #' @keywords internal | ||
| .filterDIANNByQValues <- function(input, MBR, global_qvalue_cutoff, qvalue_cutoff, pg_qvalue_cutoff) { | ||
| input <- dplyr::filter(input, DetectionQValue < global_qvalue_cutoff) | ||
|
|
||
| if (MBR) { | ||
| dplyr::filter(input, LibPGQValue < pg_qvalue_cutoff & LibQValue < qvalue_cutoff) | ||
| } else { | ||
| dplyr::filter(input, GlobalPGQValue < pg_qvalue_cutoff & GlobalQValue < qvalue_cutoff) | ||
| } | ||
| } | ||
|
Comment on lines
+241
to
+249
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Missing Q-value columns will cause runtime errors. When 🛡️ Suggested defensive check .filterDIANNByQValues <- function(input, MBR, global_qvalue_cutoff, qvalue_cutoff, pg_qvalue_cutoff) {
input <- dplyr::filter(input, DetectionQValue < global_qvalue_cutoff)
if (MBR) {
+ if (!all(c("LibPGQValue", "LibQValue") %in% colnames(input))) {
+ warning("MBR Q-value columns not found, skipping MBR filtering")
+ return(input)
+ }
dplyr::filter(input, LibPGQValue < pg_qvalue_cutoff & LibQValue < qvalue_cutoff)
} else {
+ if (!all(c("GlobalPGQValue", "GlobalQValue") %in% colnames(input))) {
+ warning("Global Q-value columns not found, skipping global filtering")
+ return(input)
+ }
dplyr::filter(input, GlobalPGQValue < pg_qvalue_cutoff & GlobalQValue < qvalue_cutoff)
}
}🤖 Prompt for AI Agents |
||
|
|
||
| #' Finalize columns for output | ||
| #' | ||
| #' @param input Data frame | ||
| #' @return Data frame with final columns | ||
| #' @keywords internal | ||
| .finalizeDIANNColumns <- function(input) { | ||
| # Final column selection for MSstats format | ||
| msstats_cols <- c("ProteinName", "PeptideSequence", "PeptideModifiedSequence", "PrecursorCharge", | ||
| "FragmentIon", "ProductCharge", "Run", "Intensity") | ||
|
|
||
|
|
||
| # Add annotation columns if they exist | ||
| if ("Condition" %in% colnames(input)) msstats_cols <- c(msstats_cols, "Condition") | ||
| if ("BioReplicate" %in% colnames(input)) msstats_cols <- c(msstats_cols, "BioReplicate") | ||
|
tonywu1999 marked this conversation as resolved.
|
||
|
|
||
| # Add IsotopeLabelType, assuming Light for DIANN | ||
| input$IsotopeLabelType <- "L" | ||
| msstats_cols <- c(msstats_cols, "IsotopeLabelType") | ||
|
|
||
| final_cols <- intersect(msstats_cols, colnames(input)) | ||
| dplyr::select(input, all_of(final_cols)) | ||
| } | ||
|
|
||
| #' Write chunk to file | ||
| #' | ||
| #' @param input Data frame | ||
| #' @param output_path Path to output file | ||
| #' @param pos Chunk position | ||
| #' @return NULL | ||
| #' @keywords internal | ||
| .writeDIANNChunk <- function(input, output_path, pos) { | ||
| # Write to file | ||
| if (!is.null(pos)) { | ||
| if (pos == 1) { | ||
| readr::write_csv(input, file = output_path, append = FALSE) | ||
| } else { | ||
| readr::write_csv(input, file = output_path, append = TRUE) | ||
| } | ||
| } | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.