diff --git a/DESCRIPTION b/DESCRIPTION index ee2ee1c..d71d7da 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -13,7 +13,7 @@ Description: MSstats package provide tools for preprocessing, summarization and processing larger than memory data sets. License: Artistic-2.0 Encoding: UTF-8 -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 Imports: arrow, DBI, @@ -23,7 +23,9 @@ Imports: readr, sparklyr, utils -Suggests: +Suggests: + testthat, + mockery, knitr, rmarkdown VignetteBuilder: knitr diff --git a/NAMESPACE b/NAMESPACE index 6aaf83d..3c5047a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,6 +2,7 @@ export(MSstatsAddAnnotationBig) export(MSstatsPreprocessBig) +export(bigDIANNtoMSstatsFormat) export(bigFragPipetoMSstatsFormat) export(bigSpectronauttoMSstatsFormat) importFrom(MSstats,dataProcess) diff --git a/R/clean_DIANN.R b/R/clean_DIANN.R new file mode 100644 index 0000000..6bd867d --- /dev/null +++ b/R/clean_DIANN.R @@ -0,0 +1,290 @@ +#' Read and clean a large DIANN file in chunks +#' +#' @param input_file Path to the input DIANN file +#' @param output_path Path to the output CSV file +#' @param MBR Boolean, whether MBR was used +#' @param quantificationColumn Name of the column containing intensity values +#' @param global_qvalue_cutoff Global Q-value cutoff +#' @param qvalue_cutoff Q-value cutoff +#' @param pg_qvalue_cutoff Protein group Q-value cutoff +#' @return NULL. Writes to file. 
+#' @keywords internal +reduceBigDIANN <- function(input_file, output_path, MBR = TRUE, + quantificationColumn = "Fragment.Quant.Corrected", + global_qvalue_cutoff = 0.01, + qvalue_cutoff = 0.01, + pg_qvalue_cutoff = 0.01) { + if (grepl("csv", input_file)) { + delim = "," + } else if (grepl("tsv|xls", input_file)) { + delim = "\t" + } else { + delim <- ";" + } + + diann_chunk <- function(x, pos) cleanDIANNChunk(x, + output_path, + MBR, + quantificationColumn, + pos, + global_qvalue_cutoff, + qvalue_cutoff, + pg_qvalue_cutoff) + readr::read_delim_chunked(input_file, + readr::DataFrameCallback$new(diann_chunk), + delim = delim, + chunk_size = 1e6) +} + +#' Clean a single chunk of DIANN data +#' +#' @param input Data frame chunk +#' @param output_path Path to output file +#' @param MBR Boolean, whether MBR was used +#' @param quantificationColumn Name of intensity column +#' @param pos Chunk position (1 for first chunk, >1 for subsequent) +#' @param global_qvalue_cutoff Global Q-value cutoff +#' @param qvalue_cutoff Q-value cutoff +#' @param pg_qvalue_cutoff Protein group Q-value cutoff +#' @return NULL +#' @keywords internal +cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos, + global_qvalue_cutoff = 0.01, + qvalue_cutoff = 0.01, + pg_qvalue_cutoff = 0.01) { + # 1. Handle "auto" quantification column + processed <- .handleAutoQuantification(input, quantificationColumn) + input <- processed$input + quantificationColumn <- processed$quantificationColumn + + # 2. Select required columns + input <- .selectDIANNColumns(input, MBR, quantificationColumn) + input <- .cleanDIANNAddMissingColumns(input) + + # 3. Expand concatenated rows + input <- .expandDIANNRows(input, quantificationColumn) + + # 4. Process fragment info (extract intensity, charge, ion) + input <- .processDIANNFragmentInfo(input, quantificationColumn) + + # 5. Filter invalid fragments + input <- .filterDIANNFragments(input, quantificationColumn) + + # 6. 
Standardize column names + input <- .standardizeDIANNColumns(input, quantificationColumn) + + # 7. Filter by Q-values + input <- .filterDIANNByQValues(input, MBR, global_qvalue_cutoff, qvalue_cutoff, pg_qvalue_cutoff) + + # 8. Finalize columns (select final set, add IsotopeLabelType) + input <- .finalizeDIANNColumns(input) + + # 9. Write to output + .writeDIANNChunk(input, output_path, pos) + + NULL +} + +#' Handle automatic detection of quantification columns +#' +#' @param input Data frame +#' @param quantificationColumn Name of column or "auto" +#' @return List with input data frame and updated quantification column name +#' @keywords internal +.handleAutoQuantification <- function(input, quantificationColumn) { + if (quantificationColumn == "auto") { + fragment_columns <- grep("^Fr[0-9]+Quantity$", colnames(input), value = TRUE) + if (length(fragment_columns) == 0) { + stop("No fragment quantification columns found. Please check your input.") + } + input <- tidyr::unite(input, "FragmentQuantCorrected", all_of(fragment_columns), sep = ";") + quantificationColumn <- "FragmentQuantCorrected" + } + list(input = input, quantificationColumn = quantificationColumn) +} + +#' Select required columns from DIANN output +#' +#' @param input Data frame +#' @param MBR Boolean +#' @param quantificationColumn Name of intensity column +#' @return Data frame with selected columns +#' @keywords internal +.selectDIANNColumns <- function(input, MBR, quantificationColumn) { + base_cols <- c('Protein.Names', 'Stripped.Sequence', 'Modified.Sequence', + 'Precursor.Charge', quantificationColumn, 'Q.Value', + 'Precursor.Mz', 'Fragment.Info', 'Run') + + mbr_cols <- if (MBR) { + c('Lib.Q.Value', 'Lib.PG.Q.Value') + } else { + c('Global.Q.Value', 'Global.PG.Q.Value') + } + + req_cols <- intersect(c(base_cols, mbr_cols), colnames(input)) + dplyr::select(input, all_of(req_cols)) +} + +#' Add missing required columns +#' +#' @param input Data frame +#' @return Data frame with missing columns 
added +#' @keywords internal +.cleanDIANNAddMissingColumns <- function(input) { + if (!"Precursor.Mz" %in% colnames(input)) { + input <- dplyr::mutate(input, Precursor.Mz = NA) + } + if (!"Fragment.Info" %in% colnames(input)) { + input <- dplyr::mutate(input, Fragment.Info = NA) + } + input +} + +#' Expand rows with multiple fragments +#' +#' @param input Data frame +#' @param quantificationColumn Name of intensity column +#' @return Data frame with expanded rows +#' @keywords internal +.expandDIANNRows <- function(input, quantificationColumn) { + split_cols <- intersect(c(quantificationColumn, "Fragment.Info"), colnames(input)) + if (length(split_cols) > 0) { + tidyr::separate_rows(input, all_of(split_cols), sep = ";") + } else { + input + } +} + +#' Process fragment information strings +#' +#' @param input Data frame +#' @param quantificationColumn Name of intensity column +#' @return Data frame with FragmentIon and ProductCharge columns +#' @keywords internal +.processDIANNFragmentInfo <- function(input, quantificationColumn) { + # Convert Intensity to Numeric from Char strings + input[[quantificationColumn]] <- as.numeric(input[[quantificationColumn]]) + + # Generate fragment info if missing + if (all(is.na(input$Fragment.Info))) { + input <- dplyr::group_by(input, Protein.Names, Modified.Sequence, Precursor.Charge, Run) + input <- dplyr::mutate(input, Fragment.Info = paste0("Frag", dplyr::row_number())) + input <- dplyr::ungroup(input) + } + + dplyr::mutate( + input, + FragmentIon = sub('\\^\\.\\*', '', .data$Fragment.Info), + + # Extract product charge + ProductCharge = dplyr::if_else( + grepl("/", .data$Fragment.Info), + # Extract charge (number right after "/" in string), default to 1 if parsing fails + as.integer(stringr::str_extract(.data$Fragment.Info, "(?<=/)[0-9]+")), + 1L, + missing = 1L + ) + ) +} + +#' Filter invalid fragments +#' +#' @param input Data frame +#' @param quantificationColumn Name of intensity column +#' @return Filtered data frame +#' 
@keywords internal +.filterDIANNFragments <- function(input, quantificationColumn) { + dplyr::filter( + input, + (!grepl("NH3|H2O", .data$FragmentIon) | is.na(.data$FragmentIon)) & !is.na(.data[[quantificationColumn]]) + ) +} + +#' Standardize column names to MSstats format +#' +#' @param input Data frame +#' @param quantificationColumn Name of intensity column +#' @return Data frame with renamed columns +#' @keywords internal +.standardizeDIANNColumns <- function(input, quantificationColumn) { + input <- dplyr::rename_with(input, .fn = function(x) gsub("\\.", "", x)) + + # Standardize column names + clean_quant_col <- gsub("\\.", "", quantificationColumn) + old_names <- c('ProteinNames', 'StrippedSequence', 'ModifiedSequence', + 'PrecursorCharge', clean_quant_col, 'QValue', + 'PrecursorMz', 'FragmentIon', 'Run', 'ProductCharge') + new_names <- c('ProteinName', 'PeptideSequence', 'PeptideModifiedSequence', + 'PrecursorCharge', 'Intensity', 'DetectionQValue', + 'PrecursorMz', 'FragmentIon', 'Run', 'ProductCharge') + + current_names <- colnames(input) + names_to_rename <- intersect(current_names, old_names) + + # Create a named vector for renaming in the format c(new_name = old_name) + new_names_subset <- new_names[match(names_to_rename, old_names)] + rename_map <- setNames(names_to_rename, new_names_subset) + rename_map <- rename_map[!is.na(names(rename_map))] + + dplyr::rename(input, any_of(rename_map)) +} + +#' Filter data by Q-values +#' +#' @param input Data frame +#' @param MBR Boolean +#' @param global_qvalue_cutoff Numeric +#' @param qvalue_cutoff Numeric +#' @param pg_qvalue_cutoff Numeric +#' @return Filtered data frame +#' @keywords internal +.filterDIANNByQValues <- function(input, MBR, global_qvalue_cutoff, qvalue_cutoff, pg_qvalue_cutoff) { + input <- dplyr::filter(input, DetectionQValue < global_qvalue_cutoff) + + if (MBR) { + dplyr::filter(input, LibPGQValue < pg_qvalue_cutoff & LibQValue < qvalue_cutoff) + } else { + dplyr::filter(input, 
GlobalPGQValue < pg_qvalue_cutoff & GlobalQValue < qvalue_cutoff) + } +} + +#' Finalize columns for output +#' +#' @param input Data frame +#' @return Data frame with final columns +#' @keywords internal +.finalizeDIANNColumns <- function(input) { + # Final column selection for MSstats format + msstats_cols <- c("ProteinName", "PeptideSequence", "PeptideModifiedSequence", "PrecursorCharge", + "FragmentIon", "ProductCharge", "Run", "Intensity") + + + # Add annotation columns if they exist + if ("Condition" %in% colnames(input)) msstats_cols <- c(msstats_cols, "Condition") + if ("BioReplicate" %in% colnames(input)) msstats_cols <- c(msstats_cols, "BioReplicate") + + # Add IsotopeLabelType, assuming Light for DIANN + input$IsotopeLabelType <- "L" + msstats_cols <- c(msstats_cols, "IsotopeLabelType") + + final_cols <- intersect(msstats_cols, colnames(input)) + dplyr::select(input, all_of(final_cols)) +} + +#' Write chunk to file +#' +#' @param input Data frame +#' @param output_path Path to output file +#' @param pos Chunk position +#' @return NULL +#' @keywords internal +.writeDIANNChunk <- function(input, output_path, pos) { + # Write to file + if (!is.null(pos)) { + if (pos == 1) { + readr::write_csv(input, file = output_path, append = FALSE) + } else { + readr::write_csv(input, file = output_path, append = TRUE) + } + } +} \ No newline at end of file diff --git a/R/converters.R b/R/converters.R index 90bfca1..a96fead 100644 --- a/R/converters.R +++ b/R/converters.R @@ -155,6 +155,50 @@ bigSpectronauttoMSstatsFormat <- function(input_file, output_file_name, } +#' Convert out-of-memory DIANN files to MSstats format. +#' +#' @inheritParams MSstatsPreprocessBig +#' @param MBR True if analysis was done with match between runs. +#' @param quantificationColumn Use 'Fragment.Quant.Corrected'(default) column for quantified intensities for DIANN 1.8.x. +#' Use 'FragmentQuantRaw' for quantified intensities for DIANN 1.9.x. 
+#' +#' @export +#' +#' @return either arrow object or sparklyr table that can be optionally collected +#' into memory by using dplyr::collect function. +#' +bigDIANNtoMSstatsFormat <- function(input_file, + output_file_name, + backend, + MBR = TRUE, + quantificationColumn = "Fragment.Quant.Corrected", + max_feature_count = 100, + filter_unique_peptides = FALSE, + aggregate_psms = FALSE, + filter_few_obs = FALSE, + remove_annotation = FALSE, + calculateAnomalyScores=FALSE, + anomalyModelFeatures=c(), + connection = NULL) { + + # Reduce and clean the DIANN report file in chunks + reduceBigDIANN(input_file, + paste0("reduce_output_", output_file_name), + MBR, + quantificationColumn) + + # Preprocess the cleaned data (feature selection, etc.) + msstats_data <- MSstatsPreprocessBig( + paste0("reduce_output_", output_file_name), + output_file_name, backend, max_feature_count, + filter_unique_peptides, aggregate_psms, filter_few_obs, + remove_annotation, calculateAnomalyScores, + anomalyModelFeatures, connection) + + return(msstats_data) +} + + #' Merge annotation to output of MSstatsPreprocessBig #' #' @param input output of MSstatsPreprocessBig @@ -185,4 +229,3 @@ bigSpectronauttoMSstatsFormat <- function(input_file, output_file_name, MSstatsAddAnnotationBig <- function(input, annotation) { dplyr::inner_join(input, annotation, by = "Run") } - diff --git a/man/bigDIANNtoMSstatsFormat.Rd b/man/bigDIANNtoMSstatsFormat.Rd new file mode 100644 index 0000000..e9d6360 --- /dev/null +++ b/man/bigDIANNtoMSstatsFormat.Rd @@ -0,0 +1,65 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/converters.R +\name{bigDIANNtoMSstatsFormat} +\alias{bigDIANNtoMSstatsFormat} +\title{Convert out-of-memory DIANN files to MSstats format.} +\usage{ +bigDIANNtoMSstatsFormat( + input_file, + output_file_name, + backend, + MBR = TRUE, + quantificationColumn = "Fragment.Quant.Corrected", + max_feature_count = 100, + filter_unique_peptides = FALSE, + aggregate_psms = 
FALSE, + filter_few_obs = FALSE, + remove_annotation = FALSE, + calculateAnomalyScores = FALSE, + anomalyModelFeatures = c(), + connection = NULL +) +} +\arguments{ +\item{input_file}{name of the input text file in 10-column MSstats format.} + +\item{output_file_name}{name of an output file which will be saved after pre-processing} + +\item{backend}{"arrow" or "sparklyr". Option "sparklyr" requires a spark installation +and connection to spark instance provided in the `connection` parameter.} + +\item{MBR}{True if analysis was done with match between runs.} + +\item{quantificationColumn}{Use 'Fragment.Quant.Corrected'(default) column for quantified intensities for DIANN 1.8.x. +Use 'FragmentQuantRaw' for quantified intensities for DIANN 1.9.x.} + +\item{max_feature_count}{maximum number of features per protein. Features will +be selected based on highest average intensity.} + +\item{filter_unique_peptides}{If TRUE, shared peptides will be removed. +Please refer to the `Details` section for additional information.} + +\item{aggregate_psms}{If TRUE, multiple measurements per PSM in a Run will +be aggregated (by taking maximum value). Please refer to the `Details` section for additional information.} + +\item{filter_few_obs}{If TRUE, feature with less than 3 observations across runs will be removed. +Please refer to the `Details` section for additional information.} + +\item{remove_annotation}{If TRUE, columns BioReplicate and Condition will be removed +to reduce output file size. These will need to be added manually later before +using dataProcess function. 
Only applicable to sparklyr backend.} + +\item{calculateAnomalyScores}{If TRUE, will carry anomaly model features through pipeline} + +\item{anomalyModelFeatures}{Character vector of column names to be carried through the pipeline} + +\item{connection}{Connection to a spark instance created with the +`spark_connect` function from `sparklyr` package.} +} +\value{ +either arrow object or sparklyr table that can be optionally collected +into memory by using dplyr::collect function. +} +\description{ +Convert out-of-memory DIANN files to MSstats format. +} diff --git a/tests/testthat/test-converters.R b/tests/testthat/test-converters.R new file mode 100644 index 0000000..8fee265 --- /dev/null +++ b/tests/testthat/test-converters.R @@ -0,0 +1,95 @@ +library(testthat) +library(mockery) + +context("General converter functions") + +test_that("MSstatsAddAnnotationBig adds annotation correctly", { + input_data <- data.frame( + Run = c("Run1", "Run2", "Run3"), + Intensity = c(100, 200, 300) + ) + + annotation_data <- data.frame( + Run = c("Run1", "Run2", "Run3"), + Condition = c("A", "A", "B"), + BioReplicate = c(1, 2, 1) + ) + + expected_output <- data.frame( + Run = c("Run1", "Run2", "Run3"), + Intensity = c(100, 200, 300), + Condition = c("A", "A", "B"), + BioReplicate = c(1, 2, 1) + ) + + result <- MSstatsAddAnnotationBig(input_data, annotation_data) + + expect_equal(result, expected_output) +}) + +test_that("MSstatsPreprocessBig performs feature selection correctly", { + input_file <- tempfile(fileext = ".csv") + output_file <- "preprocess_output.csv" + + # P1 has 3 features (frag1, frag2, frag3). frag3 has the highest avg intensity. + # P2 has 2 features (fragA, fragB). fragB has the highest avg intensity. 
+ msstats_data <- rbind( + data.frame(ProteinName = "P1", PeptideSequence = "PEPTIDE", PrecursorCharge = 2, FragmentIon = rep(c("frag1", "frag2", "frag3"), each = 2), ProductCharge = 1, IsotopeLabelType = "L", Condition = "A", BioReplicate = rep(1:2, 3), Run = rep(c("run1", "run2"), 3), Intensity = c(1000, 1100, 500, 550, 2000, 2100)), + data.frame(ProteinName = "P2", PeptideSequence = "PEPTIDE2", PrecursorCharge = 3, FragmentIon = rep(c("fragA", "fragB"), each = 2), ProductCharge = 1, IsotopeLabelType = "L", Condition = "B", BioReplicate = rep(1:2, 2), Run = rep(c("run1", "run2"), 2), Intensity = c(100, 150, 800, 850)) + ) + readr::write_csv(msstats_data, input_file) + + processed <- MSstatsPreprocessBig(input_file, output_file, backend = "arrow", + max_feature_count = 1) + result <- dplyr::collect(processed) + + # For P1, frag3 should be selected. For P2, fragB should be selected. + expect_equal(nrow(result), 4) + + p1_result <- result[result$ProteinName == "P1", ] + expect_equal(nrow(p1_result), 2) + expect_true(all(p1_result$FragmentIon == "frag3")) + + p2_result <- result[result$ProteinName == "P2", ] + expect_equal(nrow(p2_result), 2) + expect_true(all(p2_result$FragmentIon == "fragB")) + + # Cleanup + file.remove(input_file) + if (file.exists(output_file)) file.remove(output_file) +}) + +test_that("bigSpectronauttoMSstatsFormat works correctly", { + # Mock reduceBigSpectronaut as its source is not provided + mock_reduce <- mock(NULL) + + stub(bigSpectronauttoMSstatsFormat, "reduceBigSpectronaut", function(input_file, output_path, ...) 
{ + msstats_data <- data.frame( + ProteinName = "P1", PeptideSequence = "PEPTIDE", PrecursorCharge = 2, + FragmentIon = rep(c("frag1", "frag2"), each = 2), ProductCharge = 1, + IsotopeLabelType = "L", Condition = "A", BioReplicate = rep(1:2, 2), + Run = rep(c("run1", "run2"), 2), Intensity = c(1000, 1100, 2000, 2100) # frag2 is higher + ) + readr::write_csv(msstats_data, output_path) + }) + + input_file <- "dummy_spectro_input.csv" + output_file <- "spectro_output.csv" + + processed <- bigSpectronauttoMSstatsFormat( + input_file = input_file, + output_file_name = output_file, + backend = "arrow", + max_feature_count = 1 + ) + result <- dplyr::collect(processed) + + # The mock reduce function creates a file with 2 features for P1. + # max_feature_count = 1 should select frag2. + expect_equal(nrow(result), 2) + expect_true(all(result$FragmentIon == "frag2")) + + # Cleanup + if (file.exists(output_file)) file.remove(output_file) + if (file.exists(paste0("reduce_output_", output_file))) file.remove(paste0("reduce_output_", output_file)) +}) \ No newline at end of file diff --git a/tests/testthat/test-diann_converter.R b/tests/testthat/test-diann_converter.R new file mode 100644 index 0000000..81b82cf --- /dev/null +++ b/tests/testthat/test-diann_converter.R @@ -0,0 +1,178 @@ +library(testthat) + +context("DIANN converter functions") + +# Test for the internal cleanDIANNChunk function +test_that("cleanDIANNChunk processes data correctly", { + output_file <- tempfile(fileext = ".csv") + + diann_chunk_data <- data.frame( + Run = "run1", + Protein.Names = "ProteinA", + Stripped.Sequence = "PEPTIDE", + Modified.Sequence = "PEPTIDE(mod)", + Precursor.Charge = 2, + Fragment.Quant.Corrected = "100;200", + Q.Value = 0.005, + Precursor.Mz = 400.5, + Fragment.Info = "y7^1/1;b3-H2O^1/1", # One valid, one to be filtered + Lib.Q.Value = 0.001, + Lib.PG.Q.Value = 0.001, + stringsAsFactors = FALSE + ) + + # The function is not exported, so we use ::: + 
MSstatsBig:::cleanDIANNChunk(diann_chunk_data, output_file, MBR = TRUE, + quantificationColumn = "Fragment.Quant.Corrected", pos = 1) + + result <- read.csv(output_file) + + expect_equal(nrow(result), 1) + expect_equal(result$ProteinName, "ProteinA") + expect_equal(result$PeptideSequence, "PEPTIDE") + expect_equal(result$Intensity, 100) + expect_equal(result$FragmentIon, "y7^1/1") + expect_equal(result$ProductCharge, 1) + expect_equal(result$IsotopeLabelType, "L") + expect_true("PeptideModifiedSequence" %in% colnames(result)) + + file.remove(output_file) +}) + +test_that("cleanDIANNChunk handles 'auto' quantification column correctly", { + output_file <- tempfile(fileext = ".csv") + + # Data with wide format fragment quantification + diann_chunk_wide <- data.frame( + Run = "run1", + Protein.Names = "ProteinA", + Stripped.Sequence = "PEPTIDE", + Modified.Sequence = "PEPTIDE", + Precursor.Charge = 2, + Fr1Quantity = 100, + Fr2Quantity = 200, + Q.Value = 0.005, + Precursor.Mz = 400.5, + Fragment.Info = "y1^1/1;y2^1/1", + Lib.Q.Value = 0.001, + Lib.PG.Q.Value = 0.001, + stringsAsFactors = FALSE + ) + + MSstatsBig:::cleanDIANNChunk(diann_chunk_wide, output_file, MBR = TRUE, + quantificationColumn = "auto", pos = 1) + + result <- read.csv(output_file) + + expect_equal(nrow(result), 2) + expect_equal(sort(result$Intensity), c(100, 200)) + expect_equal(sort(result$FragmentIon), c("y1^1/1", "y2^1/1")) + + file.remove(output_file) + + # Test error when columns are missing + diann_chunk_missing <- diann_chunk_wide[, !grepl("Quantity", names(diann_chunk_wide))] + expect_error(MSstatsBig:::cleanDIANNChunk(diann_chunk_missing, output_file, MBR = TRUE, + quantificationColumn = "auto", pos = 1), + "No fragment quantification columns found") +}) + +test_that("cleanDIANNChunk handles missing Fragment.Info by defaulting ProductCharge to 1", { + output_file <- tempfile(fileext = ".csv") + + # Data with missing Fragment.Info (simulating it not being present) + diann_chunk_missing <- 
data.frame( + Run = "run1", + Protein.Names = "ProteinA", + Stripped.Sequence = "PEPTIDE", + Modified.Sequence = "PEPTIDE", + Precursor.Charge = 2, + Fragment.Quant.Corrected = 100, + Q.Value = 0.005, + Precursor.Mz = 400.5, + # Fragment.Info is missing + Lib.Q.Value = 0.001, + Lib.PG.Q.Value = 0.001, + stringsAsFactors = FALSE + ) + + MSstatsBig:::cleanDIANNChunk(diann_chunk_missing, output_file, MBR = TRUE, + quantificationColumn = "Fragment.Quant.Corrected", pos = 1) + + result <- read.csv(output_file) + + expect_equal(nrow(result), 1) + expect_equal(result$ProductCharge, 1) + expect_equal(result$FragmentIon, "Frag1") + + file.remove(output_file) +}) + +# Test for the internal reduceBigDIANN function +test_that("reduceBigDIANN processes a file correctly", { + input_file <- tempfile(fileext = ".csv") + output_file <- tempfile(fileext = ".csv") + + diann_data <- data.frame( + Run = c("run1", "run1"), + Protein.Names = c("ProteinA", "ProteinB"), + Stripped.Sequence = c("PEPTIDE_A", "PEPTIDE_B"), + Modified.Sequence = c("PEPTIDE_A(mod)", "PEPTIDE_B"), + Precursor.Charge = c(2, 3), + Fragment.Quant.Corrected = c("100;200", "300"), + Q.Value = c(0.005, 0.006), + Precursor.Mz = c(400.5, 500.5), + Fragment.Info = c("y7^1/1;b3-H2O^1/1", "y5^1/2"), + Lib.Q.Value = c(0.001, 0.002), + Lib.PG.Q.Value = c(0.001, 0.002), + stringsAsFactors = FALSE + ) + write.csv(diann_data, input_file, row.names = FALSE) + + MSstatsBig:::reduceBigDIANN(input_file, output_file, MBR = TRUE, + quantificationColumn = "Fragment.Quant.Corrected") + + result <- read.csv(output_file) + expect_equal(nrow(result), 2) + expect_equal(result$Intensity, c(100, 300)) + expect_equal(result$ProteinName, c("ProteinA", "ProteinB")) + expect_equal(result$ProductCharge, c(1, 2)) + expect_equal(result$FragmentIon, c("y7^1/1", "y5^1/2")) + + file.remove(input_file) + file.remove(output_file) +}) + +# End-to-end test for bigDIANNtoMSstatsFormat +test_that("bigDIANNtoMSstatsFormat works with arrow backend", { + 
input_file <- tempfile(fileext = ".csv") + output_file <- basename(tempfile(fileext = ".csv")) + + # 4 features for one protein. Feature selection should pick the top 2. + diann_data <- rbind( + data.frame(Run = c("r1", "r2"), Protein.Names = "P1", Stripped.Sequence = "PEPTIDE", Modified.Sequence = "PEPTIDE", Precursor.Charge = 2, Fragment.Quant.Corrected = c(1000, 1100), Q.Value = 0.001, Precursor.Mz = 500, Fragment.Info = "y1", Lib.Q.Value = 0.001, Lib.PG.Q.Value = 0.001), + data.frame(Run = c("r1", "r2"), Protein.Names = "P1", Stripped.Sequence = "PEPTIDE", Modified.Sequence = "PEPTIDE", Precursor.Charge = 2, Fragment.Quant.Corrected = c(500, 600), Q.Value = 0.001, Precursor.Mz = 500, Fragment.Info = "y2", Lib.Q.Value = 0.001, Lib.PG.Q.Value = 0.001), + data.frame(Run = c("r1", "r2"), Protein.Names = "P1", Stripped.Sequence = "PEPTIDE", Modified.Sequence = "PEPTIDE", Precursor.Charge = 2, Fragment.Quant.Corrected = c(100, 100), Q.Value = 0.001, Precursor.Mz = 500, Fragment.Info = "y3", Lib.Q.Value = 0.001, Lib.PG.Q.Value = 0.001), + data.frame(Run = c("r1", "r2"), Protein.Names = "P1", Stripped.Sequence = "PEPTIDE", Modified.Sequence = "PEPTIDE", Precursor.Charge = 2, Fragment.Quant.Corrected = c(2000, 2100), Q.Value = 0.001, Precursor.Mz = 500, Fragment.Info = "y4", Lib.Q.Value = 0.001, Lib.PG.Q.Value = 0.001) + ) + write.csv(diann_data, input_file, row.names = FALSE) + + converted <- bigDIANNtoMSstatsFormat( + input_file = input_file, + output_file_name = output_file, + backend = "arrow", + max_feature_count = 2 + ) + result <- dplyr::collect(converted) + + # Avg intensities: y1=1050, y2=550, y3=100, y4=2050. + # Top 2 features are y4 and y1. 
+ expect_equal(nrow(result), 4) # 2 features * 2 runs + expect_true(all(c("y1", "y4") %in% unique(result$FragmentIon))) + expect_false(any(c("y2", "y3") %in% unique(result$FragmentIon))) + + # Cleanup + file.remove(input_file) + if (file.exists(output_file)) file.remove(output_file) + if (file.exists(paste0("reduce_output_", output_file))) file.remove(paste0("reduce_output_", output_file)) +}) \ No newline at end of file diff --git a/tests/testthat/topN_preprocess_output.csv b/tests/testthat/topN_preprocess_output.csv new file mode 100644 index 0000000..d9afa98 --- /dev/null +++ b/tests/testthat/topN_preprocess_output.csv @@ -0,0 +1,5 @@ +"ProteinName","PeptideSequence","PrecursorCharge","FragmentIon","ProductCharge","IsotopeLabelType","Condition","BioReplicate","Run","Intensity" +"P1","PEPTIDE",2,"frag3",1,"L","A",1,"run1",2000 +"P1","PEPTIDE",2,"frag3",1,"L","A",2,"run2",2100 +"P2","PEPTIDE2",3,"fragB",1,"L","B",1,"run1",800 +"P2","PEPTIDE2",3,"fragB",1,"L","B",2,"run2",850 diff --git a/tests/testthat/topN_spectro_output.csv b/tests/testthat/topN_spectro_output.csv new file mode 100644 index 0000000..b103ba2 --- /dev/null +++ b/tests/testthat/topN_spectro_output.csv @@ -0,0 +1,3 @@ +"ProteinName","PeptideSequence","PrecursorCharge","FragmentIon","ProductCharge","IsotopeLabelType","Condition","BioReplicate","Run","Intensity" +"P1","PEPTIDE",2,"frag2",1,"L","A",1,"run1",2000 +"P1","PEPTIDE",2,"frag2",1,"L","A",2,"run2",2100 diff --git a/tests/testthat/topN_test_diann_output.csv b/tests/testthat/topN_test_diann_output.csv new file mode 100644 index 0000000..95d41b3 --- /dev/null +++ b/tests/testthat/topN_test_diann_output.csv @@ -0,0 +1,5 @@ +"ProteinName","PeptideSequence","PeptideModifiedSequence","PrecursorCharge","FragmentIon","ProductCharge","Run","Intensity","IsotopeLabelType" +"P1","PEPTIDE","PEPTIDE",2,"y1",1,"r1",1000,"L" +"P1","PEPTIDE","PEPTIDE",2,"y1",1,"r2",1100,"L" +"P1","PEPTIDE","PEPTIDE",2,"y4",1,"r1",2000,"L" 
+"P1","PEPTIDE","PEPTIDE",2,"y4",1,"r2",2100,"L"