From 1dfd8f095ee947e1008173f304ddba90a0759bd9 Mon Sep 17 00:00:00 2001 From: Rudhik1904 Date: Tue, 30 Dec 2025 14:49:21 -0500 Subject: [PATCH 1/8] Implementing and Testing DIANN converter for MSstatsBIG. --- DESCRIPTION | 6 +- NAMESPACE | 1 + R/clean_DIANN.R | 113 ++++++++++++++++++++++ R/converters.R | 45 ++++++++- man/bigDIANNtoMSstatsFormat.Rd | 65 +++++++++++++ tests/testthat/test-converters.R | 95 ++++++++++++++++++ tests/testthat/test-diann_converter.R | 109 +++++++++++++++++++++ tests/testthat/topN_preprocess_output.csv | 5 + tests/testthat/topN_spectro_output.csv | 3 + tests/testthat/topN_test_diann_output.csv | 5 + 10 files changed, 444 insertions(+), 3 deletions(-) create mode 100644 R/clean_DIANN.R create mode 100644 man/bigDIANNtoMSstatsFormat.Rd create mode 100644 tests/testthat/test-converters.R create mode 100644 tests/testthat/test-diann_converter.R create mode 100644 tests/testthat/topN_preprocess_output.csv create mode 100644 tests/testthat/topN_spectro_output.csv create mode 100644 tests/testthat/topN_test_diann_output.csv diff --git a/DESCRIPTION b/DESCRIPTION index ee2ee1c..e695441 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -13,7 +13,7 @@ Description: MSstats package provide tools for preprocessing, summarization and processing larger than memory data sets. 
License: Artistic-2.0 Encoding: UTF-8 -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 Imports: arrow, DBI, @@ -22,7 +22,9 @@ Imports: MSstatsConvert, readr, sparklyr, - utils + utils, + testthat, + mockery Suggests: knitr, rmarkdown diff --git a/NAMESPACE b/NAMESPACE index 6aaf83d..3c5047a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,6 +2,7 @@ export(MSstatsAddAnnotationBig) export(MSstatsPreprocessBig) +export(bigDIANNtoMSstatsFormat) export(bigFragPipetoMSstatsFormat) export(bigSpectronauttoMSstatsFormat) importFrom(MSstats,dataProcess) diff --git a/R/clean_DIANN.R b/R/clean_DIANN.R new file mode 100644 index 0000000..74c6736 --- /dev/null +++ b/R/clean_DIANN.R @@ -0,0 +1,113 @@ +#' @keywords internal +reduceBigDIANN <- function(input_file, output_path, MBR = TRUE, + quantificationColumn = "FragmentQuantCorrected") { + if (grepl("csv", input_file)) { + delim = "," + } else if (grepl("tsv|xls", input_file)) { + delim = "\t" + } else { + delim <- ";" + } + + diann_chunk <- function(x, pos) cleanDIANNChunk(x, + output_path, + MBR, + quantificationColumn, + pos) + readr::read_delim_chunked(input_file, + readr::DataFrameCallback$new(diann_chunk), + delim = delim, + chunk_size = 1e6) +} + +#' @keywords internal +cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos) { + + # 1. Select required columns + base_cols <- c('Protein.Names', 'Stripped.Sequence', 'Modified.Sequence', + 'Precursor.Charge', quantificationColumn, 'Q.Value', + 'Precursor.Mz', 'Fragment.Info', 'Run') + + mbr_cols <- if (MBR) { + c('Lib.Q.Value', 'Lib.PG.Q.Value') + } else { + c('Global.Q.Value', 'Global.PG.Q.Value') + } + + req_cols <- intersect(c(base_cols, mbr_cols), colnames(input)) + input <- dplyr::select(input, all_of(req_cols)) + + # 2. Split concatenated values (un-nest) + split_cols <- intersect(c(quantificationColumn, "Fragment.Info"), colnames(input)) + if (length(split_cols) > 0) { + input <- tidyr::separate_rows(input, all_of(split_cols), sep = ";") + } + + # 3. 
Process fragment information + input[[quantificationColumn]] <- as.numeric(input[[quantificationColumn]]) + + input <- dplyr::mutate( + input, + FragmentIon = sub('\\^\\.\\*', '', .data$Fragment.Info), + ProductCharge = dplyr::if_else( + grepl("/", .data$Fragment.Info), + # Extract charge, default to 1 if parsing fails + as.integer(stringr::str_extract(.data$Fragment.Info, "(?<=/)[0-9]+")), + 1L + ) + ) + + # 4. Clean and filter data + input <- dplyr::filter( + input, + !grepl("NH3|H2O", .data$FragmentIon) & !is.na(.data[[quantificationColumn]]) + ) + + # 5. Rename columns to MSstats standard + input <- dplyr::rename_with(input, .fn = function(x) gsub("\\.", "", x)) + + # Standardize column names + old_names <- c('ProteinNames', 'StrippedSequence', 'ModifiedSequence', + 'PrecursorCharge', gsub("\\.", "", quantificationColumn), 'QValue', + 'PrecursorMz', 'FragmentIon', 'Run', 'ProductCharge') + new_names <- c('ProteinName', 'PeptideSequence', 'PeptideModifiedSequence', + 'PrecursorCharge', 'Intensity', 'DetectionQValue', + 'PrecursorMz', 'FragmentIon', 'Run', 'ProductCharge') + + current_names <- colnames(input) + names_to_rename <- intersect(current_names, old_names) + + # Create a named vector for renaming in the format c(new_name = old_name) + new_names_subset <- new_names[match(names_to_rename, old_names)] + rename_map <- setNames(names_to_rename, new_names_subset) + rename_map <- rename_map[!is.na(names(rename_map))] + + input <- dplyr::rename(input, any_of(rename_map)) + + # Final column selection for MSstats format + msstats_cols <- c("ProteinName", "PeptideSequence", "PeptideModifiedSequence", "PrecursorCharge", + "FragmentIon", "ProductCharge", "Run", "Intensity") + + #TODO: confirm with Tony -- are these three needed? 
+ + # Add annotation columns if they exist + if ("Condition" %in% colnames(input)) msstats_cols <- c(msstats_cols, "Condition") + if ("BioReplicate" %in% colnames(input)) msstats_cols <- c(msstats_cols, "BioReplicate") + + # Add IsotopeLabelType, assuming Light for DIANN + input$IsotopeLabelType <- "L" + msstats_cols <- c(msstats_cols, "IsotopeLabelType") + + final_cols <- intersect(msstats_cols, colnames(input)) + input <- dplyr::select(input, all_of(final_cols)) + + # Write to file + if (!is.null(pos)) { + if (pos == 1) { + readr::write_csv(input, file = output_path, append = FALSE) + } else { + readr::write_csv(input, file = output_path, append = TRUE) + } + } + NULL +} \ No newline at end of file diff --git a/R/converters.R b/R/converters.R index 90bfca1..f66fa50 100644 --- a/R/converters.R +++ b/R/converters.R @@ -155,6 +155,50 @@ bigSpectronauttoMSstatsFormat <- function(input_file, output_file_name, } +#' Convert out-of-memory DIANN files to MSstats format. +#' +#' @inheritParams MSstatsPreprocessBig +#' @param MBR True if analysis was done with match between runs. +#' @param quantificationColumn Use 'FragmentQuantCorrected'(default) column for quantified intensities for DIANN 1.8.x. +#' Use 'FragmentQuantRaw' for quantified intensities for DIANN 1.9.x. +#' +#' @export +#' +#' @return either arrow object or sparklyr table that can be optionally collected +#' into memory by using dplyr::collect function. 
+#' +bigDIANNtoMSstatsFormat <- function(input_file, + output_file_name, + backend, + MBR = TRUE, + quantificationColumn = "Fragment.Quant.Corrected", + max_feature_count = 100, + filter_unique_peptides = FALSE, + aggregate_psms = FALSE, + filter_few_obs = FALSE, + remove_annotation = FALSE, + calculateAnomalyScores=FALSE, + anomalyModelFeatures=c(), + connection = NULL) { + + # Reduce and clean the DIANN report file in chunks + reduceBigDIANN(input_file, + paste0("reduce_output_", output_file_name), + MBR, + quantificationColumn) + + # Preprocess the cleaned data (feature selection, etc.) + msstats_data <- MSstatsPreprocessBig( + paste0("reduce_output_", output_file_name), + output_file_name, backend, max_feature_count, + filter_unique_peptides, aggregate_psms, filter_few_obs, + remove_annotation, calculateAnomalyScores, + anomalyModelFeatures, connection) + + return(msstats_data) +} + + #' Merge annotation to output of MSstatsPreprocessBig #' #' @param input output of MSstatsPreprocessBig @@ -185,4 +229,3 @@ bigSpectronauttoMSstatsFormat <- function(input_file, output_file_name, MSstatsAddAnnotationBig <- function(input, annotation) { dplyr::inner_join(input, annotation, by = "Run") } - diff --git a/man/bigDIANNtoMSstatsFormat.Rd b/man/bigDIANNtoMSstatsFormat.Rd new file mode 100644 index 0000000..e9d6360 --- /dev/null +++ b/man/bigDIANNtoMSstatsFormat.Rd @@ -0,0 +1,65 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/converters.R +\name{bigDIANNtoMSstatsFormat} +\alias{bigDIANNtoMSstatsFormat} +\title{Convert out-of-memory DIANN files to MSstats format.} +\usage{ +bigDIANNtoMSstatsFormat( + input_file, + output_file_name, + backend, + MBR = TRUE, + quantificationColumn = "Fragment.Quant.Corrected", + max_feature_count = 100, + filter_unique_peptides = FALSE, + aggregate_psms = FALSE, + filter_few_obs = FALSE, + remove_annotation = FALSE, + calculateAnomalyScores = FALSE, + anomalyModelFeatures = c(), + connection = NULL +) +} 
+\arguments{ +\item{input_file}{name of the input text file in 10-column MSstats format.} + +\item{output_file_name}{name of an output file which will be saved after pre-processing} + +\item{backend}{"arrow" or "sparklyr". Option "sparklyr" requires a spark installation +and connection to spark instance provided in the `connection` parameter.} + +\item{MBR}{True if analysis was done with match between runs.} + +\item{quantificationColumn}{Use 'FragmentQuantCorrected'(default) column for quantified intensities for DIANN 1.8.x. +Use 'FragmentQuantRaw' for quantified intensities for DIANN 1.9.x.} + +\item{max_feature_count}{maximum number of features per protein. Features will +be selected based on highest average intensity.} + +\item{filter_unique_peptides}{If TRUE, shared peptides will be removed. +Please refer to the `Details` section for additional information.} + +\item{aggregate_psms}{If TRUE, multiple measurements per PSM in a Run will +be aggregated (by taking maximum value). Please refer to the `Details` section for additional information.} + +\item{filter_few_obs}{If TRUE, feature with less than 3 observations across runs will be removed. +Please refer to the `Details` section for additional information.} + +\item{remove_annotation}{If TRUE, columns BioReplicate and Condition will be removed +to reduce output file size. These will need to be added manually later before +using dataProcess function. Only applicable to sparklyr backend.} + +\item{calculateAnomalyScores}{If TRUE, will carry anomaly model features through pipeline} + +\item{anomalyModelFeatures}{Character vector of column names to be carried through the pipeline} + +\item{connection}{Connection to a spark instance created with the +`spark_connect` function from `sparklyr` package.} +} +\value{ +either arrow object or sparklyr table that can be optionally collected +into memory by using dplyr::collect function. +} +\description{ +Convert out-of-memory DIANN files to MSstats format. 
+} diff --git a/tests/testthat/test-converters.R b/tests/testthat/test-converters.R new file mode 100644 index 0000000..8fee265 --- /dev/null +++ b/tests/testthat/test-converters.R @@ -0,0 +1,95 @@ +library(testthat) +library(mockery) + +context("General converter functions") + +test_that("MSstatsAddAnnotationBig adds annotation correctly", { + input_data <- data.frame( + Run = c("Run1", "Run2", "Run3"), + Intensity = c(100, 200, 300) + ) + + annotation_data <- data.frame( + Run = c("Run1", "Run2", "Run3"), + Condition = c("A", "A", "B"), + BioReplicate = c(1, 2, 1) + ) + + expected_output <- data.frame( + Run = c("Run1", "Run2", "Run3"), + Intensity = c(100, 200, 300), + Condition = c("A", "A", "B"), + BioReplicate = c(1, 2, 1) + ) + + result <- MSstatsAddAnnotationBig(input_data, annotation_data) + + expect_equal(result, expected_output) +}) + +test_that("MSstatsPreprocessBig performs feature selection correctly", { + input_file <- tempfile(fileext = ".csv") + output_file <- "preprocess_output.csv" + + # P1 has 3 features (frag1, frag2, frag3). frag3 has the highest avg intensity. + # P2 has 2 features (fragA, fragB). fragB has the highest avg intensity. 
+ msstats_data <- rbind( + data.frame(ProteinName = "P1", PeptideSequence = "PEPTIDE", PrecursorCharge = 2, FragmentIon = rep(c("frag1", "frag2", "frag3"), each = 2), ProductCharge = 1, IsotopeLabelType = "L", Condition = "A", BioReplicate = rep(1:2, 3), Run = rep(c("run1", "run2"), 3), Intensity = c(1000, 1100, 500, 550, 2000, 2100)), + data.frame(ProteinName = "P2", PeptideSequence = "PEPTIDE2", PrecursorCharge = 3, FragmentIon = rep(c("fragA", "fragB"), each = 2), ProductCharge = 1, IsotopeLabelType = "L", Condition = "B", BioReplicate = rep(1:2, 2), Run = rep(c("run1", "run2"), 2), Intensity = c(100, 150, 800, 850)) + ) + readr::write_csv(msstats_data, input_file) + + processed <- MSstatsPreprocessBig(input_file, output_file, backend = "arrow", + max_feature_count = 1) + result <- dplyr::collect(processed) + + # For P1, frag3 should be selected. For P2, fragB should be selected. + expect_equal(nrow(result), 4) + + p1_result <- result[result$ProteinName == "P1", ] + expect_equal(nrow(p1_result), 2) + expect_true(all(p1_result$FragmentIon == "frag3")) + + p2_result <- result[result$ProteinName == "P2", ] + expect_equal(nrow(p2_result), 2) + expect_true(all(p2_result$FragmentIon == "fragB")) + + # Cleanup + file.remove(input_file) + if (file.exists(output_file)) file.remove(output_file) +}) + +test_that("bigSpectronauttoMSstatsFormat works correctly", { + # Mock reduceBigSpectronaut as its source is not provided + mock_reduce <- mock(NULL) + + stub(bigSpectronauttoMSstatsFormat, "reduceBigSpectronaut", function(input_file, output_path, ...) 
{ + msstats_data <- data.frame( + ProteinName = "P1", PeptideSequence = "PEPTIDE", PrecursorCharge = 2, + FragmentIon = rep(c("frag1", "frag2"), each = 2), ProductCharge = 1, + IsotopeLabelType = "L", Condition = "A", BioReplicate = rep(1:2, 2), + Run = rep(c("run1", "run2"), 2), Intensity = c(1000, 1100, 2000, 2100) # frag2 is higher + ) + readr::write_csv(msstats_data, output_path) + }) + + input_file <- "dummy_spectro_input.csv" + output_file <- "spectro_output.csv" + + processed <- bigSpectronauttoMSstatsFormat( + input_file = input_file, + output_file_name = output_file, + backend = "arrow", + max_feature_count = 1 + ) + result <- dplyr::collect(processed) + + # The mock reduce function creates a file with 2 features for P1. + # max_feature_count = 1 should select frag2. + expect_equal(nrow(result), 2) + expect_true(all(result$FragmentIon == "frag2")) + + # Cleanup + if (file.exists(output_file)) file.remove(output_file) + if (file.exists(paste0("reduce_output_", output_file))) file.remove(paste0("reduce_output_", output_file)) +}) \ No newline at end of file diff --git a/tests/testthat/test-diann_converter.R b/tests/testthat/test-diann_converter.R new file mode 100644 index 0000000..358ff49 --- /dev/null +++ b/tests/testthat/test-diann_converter.R @@ -0,0 +1,109 @@ +library(testthat) + +context("DIANN converter functions") + +# Test for the internal cleanDIANNChunk function +test_that("cleanDIANNChunk processes data correctly", { + output_file <- tempfile(fileext = ".csv") + + diann_chunk_data <- data.frame( + Run = "run1", + Protein.Names = "ProteinA", + Stripped.Sequence = "PEPTIDE", + Modified.Sequence = "PEPTIDE(mod)", + Precursor.Charge = 2, + Fragment.Quant.Corrected = "100;200", + Q.Value = 0.005, + Precursor.Mz = 400.5, + Fragment.Info = "y7^1/1;b3-H2O^1/1", # One valid, one to be filtered + Lib.Q.Value = 0.01, + Lib.PG.Q.Value = 0.001, + stringsAsFactors = FALSE + ) + + # The function is not exported, so we use ::: + 
MSstatsBig:::cleanDIANNChunk(diann_chunk_data, output_file, MBR = TRUE, + quantificationColumn = "Fragment.Quant.Corrected", pos = 1) + + result <- read.csv(output_file) + + expect_equal(nrow(result), 1) + expect_equal(result$ProteinName, "ProteinA") + expect_equal(result$PeptideSequence, "PEPTIDE") + expect_equal(result$Intensity, 100) + expect_equal(result$FragmentIon, "y7^1/1") + expect_equal(result$ProductCharge, 1) + expect_equal(result$IsotopeLabelType, "L") + expect_true("PeptideModifiedSequence" %in% colnames(result)) + + file.remove(output_file) +}) + +# Test for the internal reduceBigDIANN function +test_that("reduceBigDIANN processes a file correctly", { + input_file <- tempfile(fileext = ".csv") + output_file <- tempfile(fileext = ".csv") + + diann_data <- data.frame( + Run = c("run1", "run1"), + Protein.Names = c("ProteinA", "ProteinB"), + Stripped.Sequence = c("PEPTIDE_A", "PEPTIDE_B"), + Modified.Sequence = c("PEPTIDE_A(mod)", "PEPTIDE_B"), + Precursor.Charge = c(2, 3), + Fragment.Quant.Corrected = c("100;200", "300"), + Q.Value = c(0.005, 0.006), + Precursor.Mz = c(400.5, 500.5), + Fragment.Info = c("y7^1/1;b3-H2O^1/1", "y5^1/2"), + Lib.Q.Value = c(0.01, 0.02), + Lib.PG.Q.Value = c(0.001, 0.002), + stringsAsFactors = FALSE + ) + write.csv(diann_data, input_file, row.names = FALSE) + + MSstatsBig:::reduceBigDIANN(input_file, output_file, MBR = TRUE, + quantificationColumn = "Fragment.Quant.Corrected") + + result <- read.csv(output_file) + expect_equal(nrow(result), 2) + expect_equal(result$Intensity, c(100, 300)) + expect_equal(result$ProteinName, c("ProteinA", "ProteinB")) + expect_equal(result$ProductCharge, c(1, 2)) + expect_equal(result$FragmentIon, c("y7^1/1", "y5^1/2")) + + file.remove(input_file) + file.remove(output_file) +}) + +# End-to-end test for bigDIANNtoMSstatsFormat +test_that("bigDIANNtoMSstatsFormat works with arrow backend", { + input_file <- tempfile(fileext = ".csv") + output_file <- "test_diann_output.csv" + + # 4 features for 
one protein. Feature selection should pick the top 2. + diann_data <- rbind( + data.frame(Run = c("r1", "r2"), Protein.Names = "P1", Stripped.Sequence = "PEPTIDE", Modified.Sequence = "PEPTIDE", Precursor.Charge = 2, Fragment.Quant.Corrected = c(1000, 1100), Q.Value = 0.01, Precursor.Mz = 500, Fragment.Info = "y1", Lib.Q.Value = 0.01, Lib.PG.Q.Value = 0.01), + data.frame(Run = c("r1", "r2"), Protein.Names = "P1", Stripped.Sequence = "PEPTIDE", Modified.Sequence = "PEPTIDE", Precursor.Charge = 2, Fragment.Quant.Corrected = c(500, 600), Q.Value = 0.01, Precursor.Mz = 500, Fragment.Info = "y2", Lib.Q.Value = 0.01, Lib.PG.Q.Value = 0.01), + data.frame(Run = c("r1", "r2"), Protein.Names = "P1", Stripped.Sequence = "PEPTIDE", Modified.Sequence = "PEPTIDE", Precursor.Charge = 2, Fragment.Quant.Corrected = c(100, 100), Q.Value = 0.01, Precursor.Mz = 500, Fragment.Info = "y3", Lib.Q.Value = 0.01, Lib.PG.Q.Value = 0.01), + data.frame(Run = c("r1", "r2"), Protein.Names = "P1", Stripped.Sequence = "PEPTIDE", Modified.Sequence = "PEPTIDE", Precursor.Charge = 2, Fragment.Quant.Corrected = c(2000, 2100), Q.Value = 0.01, Precursor.Mz = 500, Fragment.Info = "y4", Lib.Q.Value = 0.01, Lib.PG.Q.Value = 0.01) + ) + write.csv(diann_data, input_file, row.names = FALSE) + + converted <- bigDIANNtoMSstatsFormat( + input_file = input_file, + output_file_name = output_file, + backend = "arrow", + max_feature_count = 2 + ) + result <- dplyr::collect(converted) + + # Avg intensities: y1=1050, y2=550, y3=100, y4=2050. + # Top 2 features are y4 and y1. 
+ expect_equal(nrow(result), 4) # 2 features * 2 runs + expect_true(all(c("y1", "y4") %in% unique(result$FragmentIon))) + expect_false(any(c("y2", "y3") %in% unique(result$FragmentIon))) + + # Cleanup + file.remove(input_file) + if (file.exists(output_file)) file.remove(output_file) + if (file.exists(paste0("reduce_output_", output_file))) file.remove(paste0("reduce_output_", output_file)) +}) \ No newline at end of file diff --git a/tests/testthat/topN_preprocess_output.csv b/tests/testthat/topN_preprocess_output.csv new file mode 100644 index 0000000..d9afa98 --- /dev/null +++ b/tests/testthat/topN_preprocess_output.csv @@ -0,0 +1,5 @@ +"ProteinName","PeptideSequence","PrecursorCharge","FragmentIon","ProductCharge","IsotopeLabelType","Condition","BioReplicate","Run","Intensity" +"P1","PEPTIDE",2,"frag3",1,"L","A",1,"run1",2000 +"P1","PEPTIDE",2,"frag3",1,"L","A",2,"run2",2100 +"P2","PEPTIDE2",3,"fragB",1,"L","B",1,"run1",800 +"P2","PEPTIDE2",3,"fragB",1,"L","B",2,"run2",850 diff --git a/tests/testthat/topN_spectro_output.csv b/tests/testthat/topN_spectro_output.csv new file mode 100644 index 0000000..b103ba2 --- /dev/null +++ b/tests/testthat/topN_spectro_output.csv @@ -0,0 +1,3 @@ +"ProteinName","PeptideSequence","PrecursorCharge","FragmentIon","ProductCharge","IsotopeLabelType","Condition","BioReplicate","Run","Intensity" +"P1","PEPTIDE",2,"frag2",1,"L","A",1,"run1",2000 +"P1","PEPTIDE",2,"frag2",1,"L","A",2,"run2",2100 diff --git a/tests/testthat/topN_test_diann_output.csv b/tests/testthat/topN_test_diann_output.csv new file mode 100644 index 0000000..95d41b3 --- /dev/null +++ b/tests/testthat/topN_test_diann_output.csv @@ -0,0 +1,5 @@ +"ProteinName","PeptideSequence","PeptideModifiedSequence","PrecursorCharge","FragmentIon","ProductCharge","Run","Intensity","IsotopeLabelType" +"P1","PEPTIDE","PEPTIDE",2,"y1",1,"r1",1000,"L" +"P1","PEPTIDE","PEPTIDE",2,"y1",1,"r2",1100,"L" +"P1","PEPTIDE","PEPTIDE",2,"y4",1,"r1",2000,"L" 
+"P1","PEPTIDE","PEPTIDE",2,"y4",1,"r2",2100,"L" From 005ced91c12b33fcda8e18311414dad937320465 Mon Sep 17 00:00:00 2001 From: Rudhik1904 Date: Mon, 19 Jan 2026 22:38:37 -0600 Subject: [PATCH 2/8] Responding to comments --- DESCRIPTION | 6 +++--- R/clean_DIANN.R | 29 +++++++++++++++++++++++---- R/converters.R | 2 +- tests/testthat/test-diann_converter.R | 12 +++++------ 4 files changed, 35 insertions(+), 14 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index e695441..d71d7da 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -22,10 +22,10 @@ Imports: MSstatsConvert, readr, sparklyr, - utils, + utils +Suggests: testthat, - mockery -Suggests: + mockery, knitr, rmarkdown VignetteBuilder: knitr diff --git a/R/clean_DIANN.R b/R/clean_DIANN.R index 74c6736..260fc6b 100644 --- a/R/clean_DIANN.R +++ b/R/clean_DIANN.R @@ -1,6 +1,9 @@ #' @keywords internal reduceBigDIANN <- function(input_file, output_path, MBR = TRUE, - quantificationColumn = "FragmentQuantCorrected") { + quantificationColumn = "FragmentQuantCorrected", + global_qvalue_cutoff = 0.01, + qvalue_cutoff = 0.01, + pg_qvalue_cutoff = 0.01) { if (grepl("csv", input_file)) { delim = "," } else if (grepl("tsv|xls", input_file)) { @@ -13,7 +16,10 @@ reduceBigDIANN <- function(input_file, output_path, MBR = TRUE, output_path, MBR, quantificationColumn, - pos) + pos, + global_qvalue_cutoff, + qvalue_cutoff, + pg_qvalue_cutoff) readr::read_delim_chunked(input_file, readr::DataFrameCallback$new(diann_chunk), delim = delim, @@ -21,7 +27,10 @@ reduceBigDIANN <- function(input_file, output_path, MBR = TRUE, } #' @keywords internal -cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos) { +cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos, + global_qvalue_cutoff = 0.01, + qvalue_cutoff = 0.01, + pg_qvalue_cutoff = 0.01) { # 1. 
Select required columns base_cols <- c('Protein.Names', 'Stripped.Sequence', 'Modified.Sequence', @@ -44,14 +53,18 @@ cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos) { } # 3. Process fragment information + + #Convert Intensity to Numeric from Char strings input[[quantificationColumn]] <- as.numeric(input[[quantificationColumn]]) input <- dplyr::mutate( input, FragmentIon = sub('\\^\\.\\*', '', .data$Fragment.Info), + + # Extract product charge ProductCharge = dplyr::if_else( grepl("/", .data$Fragment.Info), - # Extract charge, default to 1 if parsing fails + # Extract charge (number right after "/" in string), default to 1 if parsing fails as.integer(stringr::str_extract(.data$Fragment.Info, "(?<=/)[0-9]+")), 1L ) @@ -83,6 +96,14 @@ cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos) { rename_map <- rename_map[!is.na(names(rename_map))] input <- dplyr::rename(input, any_of(rename_map)) + # Filter by Q-values + input <- dplyr::filter(input, DetectionQValue < global_qvalue_cutoff) + + if (MBR) { + input <- dplyr::filter(input, LibPGQValue < pg_qvalue_cutoff & LibQValue < qvalue_cutoff) + } else { + input <- dplyr::filter(input, GlobalPGQValue < pg_qvalue_cutoff & GlobalQValue < qvalue_cutoff) + } # Final column selection for MSstats format msstats_cols <- c("ProteinName", "PeptideSequence", "PeptideModifiedSequence", "PrecursorCharge", diff --git a/R/converters.R b/R/converters.R index f66fa50..a96fead 100644 --- a/R/converters.R +++ b/R/converters.R @@ -159,7 +159,7 @@ bigSpectronauttoMSstatsFormat <- function(input_file, output_file_name, #' #' @inheritParams MSstatsPreprocessBig #' @param MBR True if analysis was done with match between runs. -#' @param quantificationColumn Use 'FragmentQuantCorrected'(default) column for quantified intensities for DIANN 1.8.x. +#' @param quantificationColumn Use 'Fragment.Quant.Corrected'(default) column for quantified intensities for DIANN 1.8.x. 
#' Use 'FragmentQuantRaw' for quantified intensities for DIANN 1.9.x. #' #' @export diff --git a/tests/testthat/test-diann_converter.R b/tests/testthat/test-diann_converter.R index 358ff49..ead17f7 100644 --- a/tests/testthat/test-diann_converter.R +++ b/tests/testthat/test-diann_converter.R @@ -16,7 +16,7 @@ test_that("cleanDIANNChunk processes data correctly", { Q.Value = 0.005, Precursor.Mz = 400.5, Fragment.Info = "y7^1/1;b3-H2O^1/1", # One valid, one to be filtered - Lib.Q.Value = 0.01, + Lib.Q.Value = 0.001, Lib.PG.Q.Value = 0.001, stringsAsFactors = FALSE ) @@ -54,7 +54,7 @@ test_that("reduceBigDIANN processes a file correctly", { Q.Value = c(0.005, 0.006), Precursor.Mz = c(400.5, 500.5), Fragment.Info = c("y7^1/1;b3-H2O^1/1", "y5^1/2"), - Lib.Q.Value = c(0.01, 0.02), + Lib.Q.Value = c(0.001, 0.002), Lib.PG.Q.Value = c(0.001, 0.002), stringsAsFactors = FALSE ) @@ -81,10 +81,10 @@ test_that("bigDIANNtoMSstatsFormat works with arrow backend", { # 4 features for one protein. Feature selection should pick the top 2. 
diann_data <- rbind( - data.frame(Run = c("r1", "r2"), Protein.Names = "P1", Stripped.Sequence = "PEPTIDE", Modified.Sequence = "PEPTIDE", Precursor.Charge = 2, Fragment.Quant.Corrected = c(1000, 1100), Q.Value = 0.01, Precursor.Mz = 500, Fragment.Info = "y1", Lib.Q.Value = 0.01, Lib.PG.Q.Value = 0.01), - data.frame(Run = c("r1", "r2"), Protein.Names = "P1", Stripped.Sequence = "PEPTIDE", Modified.Sequence = "PEPTIDE", Precursor.Charge = 2, Fragment.Quant.Corrected = c(500, 600), Q.Value = 0.01, Precursor.Mz = 500, Fragment.Info = "y2", Lib.Q.Value = 0.01, Lib.PG.Q.Value = 0.01), - data.frame(Run = c("r1", "r2"), Protein.Names = "P1", Stripped.Sequence = "PEPTIDE", Modified.Sequence = "PEPTIDE", Precursor.Charge = 2, Fragment.Quant.Corrected = c(100, 100), Q.Value = 0.01, Precursor.Mz = 500, Fragment.Info = "y3", Lib.Q.Value = 0.01, Lib.PG.Q.Value = 0.01), - data.frame(Run = c("r1", "r2"), Protein.Names = "P1", Stripped.Sequence = "PEPTIDE", Modified.Sequence = "PEPTIDE", Precursor.Charge = 2, Fragment.Quant.Corrected = c(2000, 2100), Q.Value = 0.01, Precursor.Mz = 500, Fragment.Info = "y4", Lib.Q.Value = 0.01, Lib.PG.Q.Value = 0.01) + data.frame(Run = c("r1", "r2"), Protein.Names = "P1", Stripped.Sequence = "PEPTIDE", Modified.Sequence = "PEPTIDE", Precursor.Charge = 2, Fragment.Quant.Corrected = c(1000, 1100), Q.Value = 0.001, Precursor.Mz = 500, Fragment.Info = "y1", Lib.Q.Value = 0.001, Lib.PG.Q.Value = 0.001), + data.frame(Run = c("r1", "r2"), Protein.Names = "P1", Stripped.Sequence = "PEPTIDE", Modified.Sequence = "PEPTIDE", Precursor.Charge = 2, Fragment.Quant.Corrected = c(500, 600), Q.Value = 0.001, Precursor.Mz = 500, Fragment.Info = "y2", Lib.Q.Value = 0.001, Lib.PG.Q.Value = 0.001), + data.frame(Run = c("r1", "r2"), Protein.Names = "P1", Stripped.Sequence = "PEPTIDE", Modified.Sequence = "PEPTIDE", Precursor.Charge = 2, Fragment.Quant.Corrected = c(100, 100), Q.Value = 0.001, Precursor.Mz = 500, Fragment.Info = "y3", Lib.Q.Value = 0.001, Lib.PG.Q.Value 
= 0.001), + data.frame(Run = c("r1", "r2"), Protein.Names = "P1", Stripped.Sequence = "PEPTIDE", Modified.Sequence = "PEPTIDE", Precursor.Charge = 2, Fragment.Quant.Corrected = c(2000, 2100), Q.Value = 0.001, Precursor.Mz = 500, Fragment.Info = "y4", Lib.Q.Value = 0.001, Lib.PG.Q.Value = 0.001) ) write.csv(diann_data, input_file, row.names = FALSE) From d8820919f4be8e3b2286c50faabc6137f0d7c11d Mon Sep 17 00:00:00 2001 From: Rudhik1904 Date: Tue, 20 Jan 2026 00:01:51 -0600 Subject: [PATCH 3/8] Adding handling for DIANN 2.0 --- R/clean_DIANN.R | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/R/clean_DIANN.R b/R/clean_DIANN.R index 260fc6b..e8f8457 100644 --- a/R/clean_DIANN.R +++ b/R/clean_DIANN.R @@ -31,6 +31,15 @@ cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos, global_qvalue_cutoff = 0.01, qvalue_cutoff = 0.01, pg_qvalue_cutoff = 0.01) { + # 0. Handle parquet files if needed + if (quantificationColumn == "auto") { + fragment_columns <- grep("^Fr[0-9]+Quantity$", colnames(input), value = TRUE) + if (length(fragment_columns) == 0) { + stop("No fragment quantification columns found. Please check your input.") + } + input <- tidyr::unite(input, "FragmentQuantCorrected", all_of(fragment_columns), sep = ";") + quantificationColumn <- "FragmentQuantCorrected" + } # 1. 
Select required columns base_cols <- c('Protein.Names', 'Stripped.Sequence', 'Modified.Sequence', From 6728b7c384572d1b37435ec804628fd68d28b2a7 Mon Sep 17 00:00:00 2001 From: Rudhik1904 Date: Tue, 20 Jan 2026 00:02:22 -0600 Subject: [PATCH 4/8] Adding tests for DIANN 2.0 --- tests/testthat/test-diann_converter.R | 38 +++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/tests/testthat/test-diann_converter.R b/tests/testthat/test-diann_converter.R index ead17f7..f4f0c98 100644 --- a/tests/testthat/test-diann_converter.R +++ b/tests/testthat/test-diann_converter.R @@ -39,6 +39,44 @@ test_that("cleanDIANNChunk processes data correctly", { file.remove(output_file) }) +test_that("cleanDIANNChunk handles 'auto' quantification column correctly", { + output_file <- tempfile(fileext = ".csv") + + # Data with wide format fragment quantification + diann_chunk_wide <- data.frame( + Run = "run1", + Protein.Names = "ProteinA", + Stripped.Sequence = "PEPTIDE", + Modified.Sequence = "PEPTIDE", + Precursor.Charge = 2, + Fr1Quantity = 100, + Fr2Quantity = 200, + Q.Value = 0.005, + Precursor.Mz = 400.5, + Fragment.Info = "y1^1/1;y2^1/1", + Lib.Q.Value = 0.001, + Lib.PG.Q.Value = 0.001, + stringsAsFactors = FALSE + ) + + MSstatsBig:::cleanDIANNChunk(diann_chunk_wide, output_file, MBR = TRUE, + quantificationColumn = "auto", pos = 1) + + result <- read.csv(output_file) + + expect_equal(nrow(result), 2) + expect_equal(sort(result$Intensity), c(100, 200)) + expect_equal(sort(result$FragmentIon), c("y1^1/1", "y2^1/1")) + + file.remove(output_file) + + # Test error when columns are missing + diann_chunk_missing <- diann_chunk_wide[, !grepl("Quantity", names(diann_chunk_wide))] + expect_error(MSstatsBig:::cleanDIANNChunk(diann_chunk_missing, output_file, MBR = TRUE, + quantificationColumn = "auto", pos = 1), + "No fragment quantification columns found") +}) + # Test for the internal reduceBigDIANN function test_that("reduceBigDIANN processes a file correctly", { 
input_file <- tempfile(fileext = ".csv") From a041a72edca8e3d99ab9f88e978cb7fbb0e81ee1 Mon Sep 17 00:00:00 2001 From: Rudhik1904 Date: Tue, 20 Jan 2026 00:07:46 -0600 Subject: [PATCH 5/8] Making the code more readable, and easier to follow --- R/clean_DIANN.R | 112 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 83 insertions(+), 29 deletions(-) diff --git a/R/clean_DIANN.R b/R/clean_DIANN.R index e8f8457..d03ae2a 100644 --- a/R/clean_DIANN.R +++ b/R/clean_DIANN.R @@ -31,7 +31,40 @@ cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos, global_qvalue_cutoff = 0.01, qvalue_cutoff = 0.01, pg_qvalue_cutoff = 0.01) { - # 0. Handle parquet files if needed + # 1. Handle "auto" quantification column + processed <- .handleAutoQuantification(input, quantificationColumn) + input <- processed$input + quantificationColumn <- processed$quantificationColumn + + # 2. Select required columns + input <- .selectDIANNColumns(input, MBR, quantificationColumn) + + # 3. Expand concatenated rows + input <- .expandDIANNRows(input, quantificationColumn) + + # 4. Process fragment info (extract intensity, charge, ion) + input <- .processDIANNFragmentInfo(input, quantificationColumn) + + # 5. Filter invalid fragments + input <- .filterDIANNFragments(input, quantificationColumn) + + # 6. Standardize column names + input <- .standardizeDIANNColumns(input, quantificationColumn) + + # 7. Filter by Q-values + input <- .filterDIANNByQValues(input, MBR, global_qvalue_cutoff, qvalue_cutoff, pg_qvalue_cutoff) + + # 8. Finalize columns (select final set, add IsotopeLabelType) + input <- .finalizeDIANNColumns(input) + + # 9. 
Write to output + .writeDIANNChunk(input, output_path, pos) + + NULL +} + +#' @keywords internal +.handleAutoQuantification <- function(input, quantificationColumn) { if (quantificationColumn == "auto") { fragment_columns <- grep("^Fr[0-9]+Quantity$", colnames(input), value = TRUE) if (length(fragment_columns) == 0) { @@ -40,8 +73,11 @@ cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos, input <- tidyr::unite(input, "FragmentQuantCorrected", all_of(fragment_columns), sep = ";") quantificationColumn <- "FragmentQuantCorrected" } - - # 1. Select required columns + list(input = input, quantificationColumn = quantificationColumn) +} + +#' @keywords internal +.selectDIANNColumns <- function(input, MBR, quantificationColumn) { base_cols <- c('Protein.Names', 'Stripped.Sequence', 'Modified.Sequence', 'Precursor.Charge', quantificationColumn, 'Q.Value', 'Precursor.Mz', 'Fragment.Info', 'Run') @@ -53,23 +89,28 @@ cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos, } req_cols <- intersect(c(base_cols, mbr_cols), colnames(input)) - input <- dplyr::select(input, all_of(req_cols)) - - # 2. Split concatenated values (un-nest) + dplyr::select(input, all_of(req_cols)) +} + +#' @keywords internal +.expandDIANNRows <- function(input, quantificationColumn) { split_cols <- intersect(c(quantificationColumn, "Fragment.Info"), colnames(input)) if (length(split_cols) > 0) { - input <- tidyr::separate_rows(input, all_of(split_cols), sep = ";") + tidyr::separate_rows(input, all_of(split_cols), sep = ";") + } else { + input } - - # 3. 
Process fragment information +} - #Convert Intensity to Numeric from Char strings +#' @keywords internal +.processDIANNFragmentInfo <- function(input, quantificationColumn) { + # Convert Intensity to Numeric from Char strings input[[quantificationColumn]] <- as.numeric(input[[quantificationColumn]]) - input <- dplyr::mutate( + dplyr::mutate( input, FragmentIon = sub('\\^\\.\\*', '', .data$Fragment.Info), - + # Extract product charge ProductCharge = dplyr::if_else( grepl("/", .data$Fragment.Info), @@ -78,19 +119,24 @@ cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos, 1L ) ) - - # 4. Clean and filter data - input <- dplyr::filter( +} + +#' @keywords internal +.filterDIANNFragments <- function(input, quantificationColumn) { + dplyr::filter( input, !grepl("NH3|H2O", .data$FragmentIon) & !is.na(.data[[quantificationColumn]]) ) - - # 5. Rename columns to MSstats standard +} + +#' @keywords internal +.standardizeDIANNColumns <- function(input, quantificationColumn) { input <- dplyr::rename_with(input, .fn = function(x) gsub("\\.", "", x)) # Standardize column names + clean_quant_col <- gsub("\\.", "", quantificationColumn) old_names <- c('ProteinNames', 'StrippedSequence', 'ModifiedSequence', - 'PrecursorCharge', gsub("\\.", "", quantificationColumn), 'QValue', + 'PrecursorCharge', clean_quant_col, 'QValue', 'PrecursorMz', 'FragmentIon', 'Run', 'ProductCharge') new_names <- c('ProteinName', 'PeptideSequence', 'PeptideModifiedSequence', 'PrecursorCharge', 'Intensity', 'DetectionQValue', @@ -98,39 +144,48 @@ cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos, current_names <- colnames(input) names_to_rename <- intersect(current_names, old_names) - + # Create a named vector for renaming in the format c(new_name = old_name) new_names_subset <- new_names[match(names_to_rename, old_names)] rename_map <- setNames(names_to_rename, new_names_subset) rename_map <- rename_map[!is.na(names(rename_map))] + + dplyr::rename(input, 
any_of(rename_map)) +} - input <- dplyr::rename(input, any_of(rename_map)) - # Filter by Q-values +#' @keywords internal +.filterDIANNByQValues <- function(input, MBR, global_qvalue_cutoff, qvalue_cutoff, pg_qvalue_cutoff) { input <- dplyr::filter(input, DetectionQValue < global_qvalue_cutoff) if (MBR) { - input <- dplyr::filter(input, LibPGQValue < pg_qvalue_cutoff & LibQValue < qvalue_cutoff) + dplyr::filter(input, LibPGQValue < pg_qvalue_cutoff & LibQValue < qvalue_cutoff) } else { - input <- dplyr::filter(input, GlobalPGQValue < pg_qvalue_cutoff & GlobalQValue < qvalue_cutoff) + dplyr::filter(input, GlobalPGQValue < pg_qvalue_cutoff & GlobalQValue < qvalue_cutoff) } - +} + +#' @keywords internal +.finalizeDIANNColumns <- function(input) { # Final column selection for MSstats format msstats_cols <- c("ProteinName", "PeptideSequence", "PeptideModifiedSequence", "PrecursorCharge", "FragmentIon", "ProductCharge", "Run", "Intensity") - #TODO: confirm with Tony -- are these three needed? + # TODO: confirm with Tony -- are these three needed? 
# Add annotation columns if they exist if ("Condition" %in% colnames(input)) msstats_cols <- c(msstats_cols, "Condition") if ("BioReplicate" %in% colnames(input)) msstats_cols <- c(msstats_cols, "BioReplicate") - + # Add IsotopeLabelType, assuming Light for DIANN input$IsotopeLabelType <- "L" msstats_cols <- c(msstats_cols, "IsotopeLabelType") final_cols <- intersect(msstats_cols, colnames(input)) - input <- dplyr::select(input, all_of(final_cols)) - + dplyr::select(input, all_of(final_cols)) +} + +#' @keywords internal +.writeDIANNChunk <- function(input, output_path, pos) { # Write to file if (!is.null(pos)) { if (pos == 1) { @@ -139,5 +194,4 @@ cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos, readr::write_csv(input, file = output_path, append = TRUE) } } - NULL } \ No newline at end of file From 43e68b2ea130bd2fa2153be6313c08c904de5ca9 Mon Sep 17 00:00:00 2001 From: Rudhik1904 Date: Tue, 20 Jan 2026 01:13:22 -0600 Subject: [PATCH 6/8] Adding missing columns code and a test to make sure that when Fragment.Info is NA, ProductCharge is set to 1 --- R/clean_DIANN.R | 92 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 89 insertions(+), 3 deletions(-) diff --git a/R/clean_DIANN.R b/R/clean_DIANN.R index d03ae2a..5d85389 100644 --- a/R/clean_DIANN.R +++ b/R/clean_DIANN.R @@ -1,3 +1,13 @@ +#' Read and clean a large DIANN file in chunks +#' +#' @param input_file Path to the input DIANN file +#' @param output_path Path to the output CSV file +#' @param MBR Boolean, whether MBR was used +#' @param quantificationColumn Name of the column containing intensity values +#' @param global_qvalue_cutoff Global Q-value cutoff +#' @param qvalue_cutoff Q-value cutoff +#' @param pg_qvalue_cutoff Protein group Q-value cutoff +#' @return NULL. Writes to file. 
#' @keywords internal reduceBigDIANN <- function(input_file, output_path, MBR = TRUE, quantificationColumn = "FragmentQuantCorrected", @@ -26,6 +36,17 @@ reduceBigDIANN <- function(input_file, output_path, MBR = TRUE, chunk_size = 1e6) } +#' Clean a single chunk of DIANN data +#' +#' @param input Data frame chunk +#' @param output_path Path to output file +#' @param MBR Boolean, whether MBR was used +#' @param quantificationColumn Name of intensity column +#' @param pos Chunk position (1 for first chunk, >1 for subsequent) +#' @param global_qvalue_cutoff Global Q-value cutoff +#' @param qvalue_cutoff Q-value cutoff +#' @param pg_qvalue_cutoff Protein group Q-value cutoff +#' @return NULL #' @keywords internal cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos, global_qvalue_cutoff = 0.01, @@ -38,6 +59,7 @@ cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos, # 2. Select required columns input <- .selectDIANNColumns(input, MBR, quantificationColumn) + input <- .cleanDIANNAddMissingColumns(input) # 3. 
Expand concatenated rows input <- .expandDIANNRows(input, quantificationColumn) @@ -63,6 +85,11 @@ cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos, NULL } +#' Handle automatic detection of quantification columns +#' +#' @param input Data frame +#' @param quantificationColumn Name of column or "auto" +#' @return List with input data frame and updated quantification column name #' @keywords internal .handleAutoQuantification <- function(input, quantificationColumn) { if (quantificationColumn == "auto") { @@ -76,6 +103,12 @@ cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos, list(input = input, quantificationColumn = quantificationColumn) } +#' Select required columns from DIANN output +#' +#' @param input Data frame +#' @param MBR Boolean +#' @param quantificationColumn Name of intensity column +#' @return Data frame with selected columns #' @keywords internal .selectDIANNColumns <- function(input, MBR, quantificationColumn) { base_cols <- c('Protein.Names', 'Stripped.Sequence', 'Modified.Sequence', @@ -92,6 +125,26 @@ cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos, dplyr::select(input, all_of(req_cols)) } +#' Add missing required columns +#' +#' @param input Data frame +#' @return Data frame with missing columns added +#' @keywords internal +.cleanDIANNAddMissingColumns <- function(input) { + if (!"Precursor.Mz" %in% colnames(input)) { + input <- dplyr::mutate(input, Precursor.Mz = NA) + } + if (!"Fragment.Info" %in% colnames(input)) { + input <- dplyr::mutate(input, Fragment.Info = NA) + } + input +} + +#' Expand rows with multiple fragments +#' +#' @param input Data frame +#' @param quantificationColumn Name of intensity column +#' @return Data frame with expanded rows #' @keywords internal .expandDIANNRows <- function(input, quantificationColumn) { split_cols <- intersect(c(quantificationColumn, "Fragment.Info"), colnames(input)) @@ -102,6 +155,11 @@ cleanDIANNChunk = 
function(input, output_path, MBR, quantificationColumn, pos, } } +#' Process fragment information strings +#' +#' @param input Data frame +#' @param quantificationColumn Name of intensity column +#' @return Data frame with FragmentIon and ProductCharge columns #' @keywords internal .processDIANNFragmentInfo <- function(input, quantificationColumn) { # Convert Intensity to Numeric from Char strings @@ -116,19 +174,30 @@ cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos, grepl("/", .data$Fragment.Info), # Extract charge (number right after "/" in string), default to 1 if parsing fails as.integer(stringr::str_extract(.data$Fragment.Info, "(?<=/)[0-9]+")), - 1L + 1L, + missing = 1L ) ) } +#' Filter invalid fragments +#' +#' @param input Data frame +#' @param quantificationColumn Name of intensity column +#' @return Filtered data frame #' @keywords internal .filterDIANNFragments <- function(input, quantificationColumn) { dplyr::filter( input, - !grepl("NH3|H2O", .data$FragmentIon) & !is.na(.data[[quantificationColumn]]) + (!grepl("NH3|H2O", .data$FragmentIon) | is.na(.data$FragmentIon)) & !is.na(.data[[quantificationColumn]]) ) } +#' Standardize column names to MSstats format +#' +#' @param input Data frame +#' @param quantificationColumn Name of intensity column +#' @return Data frame with renamed columns #' @keywords internal .standardizeDIANNColumns <- function(input, quantificationColumn) { input <- dplyr::rename_with(input, .fn = function(x) gsub("\\.", "", x)) @@ -153,6 +222,14 @@ cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos, dplyr::rename(input, any_of(rename_map)) } +#' Filter data by Q-values +#' +#' @param input Data frame +#' @param MBR Boolean +#' @param global_qvalue_cutoff Numeric +#' @param qvalue_cutoff Numeric +#' @param pg_qvalue_cutoff Numeric +#' @return Filtered data frame #' @keywords internal .filterDIANNByQValues <- function(input, MBR, global_qvalue_cutoff, qvalue_cutoff, 
pg_qvalue_cutoff) { input <- dplyr::filter(input, DetectionQValue < global_qvalue_cutoff) @@ -164,13 +241,16 @@ cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos, } } +#' Finalize columns for output +#' +#' @param input Data frame +#' @return Data frame with final columns #' @keywords internal .finalizeDIANNColumns <- function(input) { # Final column selection for MSstats format msstats_cols <- c("ProteinName", "PeptideSequence", "PeptideModifiedSequence", "PrecursorCharge", "FragmentIon", "ProductCharge", "Run", "Intensity") - # TODO: confirm with Tony -- are these three needed? # Add annotation columns if they exist if ("Condition" %in% colnames(input)) msstats_cols <- c(msstats_cols, "Condition") @@ -184,6 +264,12 @@ cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos, dplyr::select(input, all_of(final_cols)) } +#' Write chunk to file +#' +#' @param input Data frame +#' @param output_path Path to output file +#' @param pos Chunk position +#' @return NULL #' @keywords internal .writeDIANNChunk <- function(input, output_path, pos) { # Write to file From d3772c9592943aaaf45e5355ea9429c710ea8304 Mon Sep 17 00:00:00 2001 From: Rudhik1904 Date: Tue, 20 Jan 2026 01:14:18 -0600 Subject: [PATCH 7/8] Test to make sure product charge is set to one when fragment.ion column is not present --- tests/testthat/test-diann_converter.R | 31 +++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/tests/testthat/test-diann_converter.R b/tests/testthat/test-diann_converter.R index f4f0c98..c434cb5 100644 --- a/tests/testthat/test-diann_converter.R +++ b/tests/testthat/test-diann_converter.R @@ -77,6 +77,37 @@ test_that("cleanDIANNChunk handles 'auto' quantification column correctly", { "No fragment quantification columns found") }) +test_that("cleanDIANNChunk handles missing Fragment.Info by defaulting ProductCharge to 1", { + output_file <- tempfile(fileext = ".csv") + + # Data with missing Fragment.Info 
(simulating it not being present) + diann_chunk_missing <- data.frame( + Run = "run1", + Protein.Names = "ProteinA", + Stripped.Sequence = "PEPTIDE", + Modified.Sequence = "PEPTIDE", + Precursor.Charge = 2, + Fragment.Quant.Corrected = 100, + Q.Value = 0.005, + Precursor.Mz = 400.5, + # Fragment.Info is missing + Lib.Q.Value = 0.001, + Lib.PG.Q.Value = 0.001, + stringsAsFactors = FALSE + ) + + MSstatsBig:::cleanDIANNChunk(diann_chunk_missing, output_file, MBR = TRUE, + quantificationColumn = "Fragment.Quant.Corrected", pos = 1) + + result <- read.csv(output_file) + + expect_equal(nrow(result), 1) + expect_equal(result$ProductCharge, 1) + expect_true(is.na(result$FragmentIon)) + + file.remove(output_file) +}) + # Test for the internal reduceBigDIANN function test_that("reduceBigDIANN processes a file correctly", { input_file <- tempfile(fileext = ".csv") From bfbcf7ce99c72da9500d109ec1a4e3b8755fef60 Mon Sep 17 00:00:00 2001 From: Rudhik1904 Date: Fri, 30 Jan 2026 11:46:33 -0600 Subject: [PATCH 8/8] Addressing comments --- R/clean_DIANN.R | 9 ++++++++- tests/testthat/test-diann_converter.R | 4 ++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/R/clean_DIANN.R b/R/clean_DIANN.R index 5d85389..6bd867d 100644 --- a/R/clean_DIANN.R +++ b/R/clean_DIANN.R @@ -10,7 +10,7 @@ #' @return NULL. Writes to file. 
#' @keywords internal reduceBigDIANN <- function(input_file, output_path, MBR = TRUE, - quantificationColumn = "FragmentQuantCorrected", + quantificationColumn = "Fragment.Quant.Corrected", global_qvalue_cutoff = 0.01, qvalue_cutoff = 0.01, pg_qvalue_cutoff = 0.01) { @@ -165,6 +165,13 @@ cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos, # Convert Intensity to Numeric from Char strings input[[quantificationColumn]] <- as.numeric(input[[quantificationColumn]]) + # Generate fragment info if missing + if (all(is.na(input$Fragment.Info))) { + input <- dplyr::group_by(input, Protein.Names, Modified.Sequence, Precursor.Charge, Run) + input <- dplyr::mutate(input, Fragment.Info = paste0("Frag", dplyr::row_number())) + input <- dplyr::ungroup(input) + } + dplyr::mutate( input, FragmentIon = sub('\\^\\.\\*', '', .data$Fragment.Info), diff --git a/tests/testthat/test-diann_converter.R b/tests/testthat/test-diann_converter.R index c434cb5..81b82cf 100644 --- a/tests/testthat/test-diann_converter.R +++ b/tests/testthat/test-diann_converter.R @@ -103,7 +103,7 @@ test_that("cleanDIANNChunk handles missing Fragment.Info by defaulting ProductCh expect_equal(nrow(result), 1) expect_equal(result$ProductCharge, 1) - expect_true(is.na(result$FragmentIon)) + expect_equal(result$FragmentIon, "Frag1") file.remove(output_file) }) @@ -146,7 +146,7 @@ test_that("reduceBigDIANN processes a file correctly", { # End-to-end test for bigDIANNtoMSstatsFormat test_that("bigDIANNtoMSstatsFormat works with arrow backend", { input_file <- tempfile(fileext = ".csv") - output_file <- "test_diann_output.csv" + output_file <- basename(tempfile(fileext = ".csv")) # 4 features for one protein. Feature selection should pick the top 2. diann_data <- rbind(