From 1dfd8f095ee947e1008173f304ddba90a0759bd9 Mon Sep 17 00:00:00 2001 From: Rudhik1904 Date: Tue, 30 Dec 2025 14:49:21 -0500 Subject: [PATCH 1/8] Implementing and Testing DIANN converter for MSstatsBIG. --- DESCRIPTION | 6 +- NAMESPACE | 1 + R/clean_DIANN.R | 113 ++++++++++++++++++++++ R/converters.R | 45 ++++++++- man/bigDIANNtoMSstatsFormat.Rd | 65 +++++++++++++ tests/testthat/test-converters.R | 95 ++++++++++++++++++ tests/testthat/test-diann_converter.R | 109 +++++++++++++++++++++ tests/testthat/topN_preprocess_output.csv | 5 + tests/testthat/topN_spectro_output.csv | 3 + tests/testthat/topN_test_diann_output.csv | 5 + 10 files changed, 444 insertions(+), 3 deletions(-) create mode 100644 R/clean_DIANN.R create mode 100644 man/bigDIANNtoMSstatsFormat.Rd create mode 100644 tests/testthat/test-converters.R create mode 100644 tests/testthat/test-diann_converter.R create mode 100644 tests/testthat/topN_preprocess_output.csv create mode 100644 tests/testthat/topN_spectro_output.csv create mode 100644 tests/testthat/topN_test_diann_output.csv diff --git a/DESCRIPTION b/DESCRIPTION index ee2ee1c..e695441 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -13,7 +13,7 @@ Description: MSstats package provide tools for preprocessing, summarization and processing larger than memory data sets. 
License: Artistic-2.0 Encoding: UTF-8 -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 Imports: arrow, DBI, @@ -22,7 +22,9 @@ Imports: MSstatsConvert, readr, sparklyr, - utils + utils, + testthat, + mockery Suggests: knitr, rmarkdown diff --git a/NAMESPACE b/NAMESPACE index 6aaf83d..3c5047a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,6 +2,7 @@ export(MSstatsAddAnnotationBig) export(MSstatsPreprocessBig) +export(bigDIANNtoMSstatsFormat) export(bigFragPipetoMSstatsFormat) export(bigSpectronauttoMSstatsFormat) importFrom(MSstats,dataProcess) diff --git a/R/clean_DIANN.R b/R/clean_DIANN.R new file mode 100644 index 0000000..74c6736 --- /dev/null +++ b/R/clean_DIANN.R @@ -0,0 +1,113 @@ +#' @keywords internal +reduceBigDIANN <- function(input_file, output_path, MBR = TRUE, + quantificationColumn = "FragmentQuantCorrected") { + if (grepl("csv", input_file)) { + delim = "," + } else if (grepl("tsv|xls", input_file)) { + delim = "\t" + } else { + delim <- ";" + } + + diann_chunk <- function(x, pos) cleanDIANNChunk(x, + output_path, + MBR, + quantificationColumn, + pos) + readr::read_delim_chunked(input_file, + readr::DataFrameCallback$new(diann_chunk), + delim = delim, + chunk_size = 1e6) +} + +#' @keywords internal +cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos) { + + # 1. Select required columns + base_cols <- c('Protein.Names', 'Stripped.Sequence', 'Modified.Sequence', + 'Precursor.Charge', quantificationColumn, 'Q.Value', + 'Precursor.Mz', 'Fragment.Info', 'Run') + + mbr_cols <- if (MBR) { + c('Lib.Q.Value', 'Lib.PG.Q.Value') + } else { + c('Global.Q.Value', 'Global.PG.Q.Value') + } + + req_cols <- intersect(c(base_cols, mbr_cols), colnames(input)) + input <- dplyr::select(input, all_of(req_cols)) + + # 2. Split concatenated values (un-nest) + split_cols <- intersect(c(quantificationColumn, "Fragment.Info"), colnames(input)) + if (length(split_cols) > 0) { + input <- tidyr::separate_rows(input, all_of(split_cols), sep = ";") + } + + # 3. 
Process fragment information + input[[quantificationColumn]] <- as.numeric(input[[quantificationColumn]]) + + input <- dplyr::mutate( + input, + FragmentIon = sub('\\^\\.\\*', '', .data$Fragment.Info), + ProductCharge = dplyr::if_else( + grepl("/", .data$Fragment.Info), + # Extract charge, default to 1 if parsing fails + as.integer(stringr::str_extract(.data$Fragment.Info, "(?<=/)[0-9]+")), + 1L + ) + ) + + # 4. Clean and filter data + input <- dplyr::filter( + input, + !grepl("NH3|H2O", .data$FragmentIon) & !is.na(.data[[quantificationColumn]]) + ) + + # 5. Rename columns to MSstats standard + input <- dplyr::rename_with(input, .fn = function(x) gsub("\\.", "", x)) + + # Standardize column names + old_names <- c('ProteinNames', 'StrippedSequence', 'ModifiedSequence', + 'PrecursorCharge', gsub("\\.", "", quantificationColumn), 'QValue', + 'PrecursorMz', 'FragmentIon', 'Run', 'ProductCharge') + new_names <- c('ProteinName', 'PeptideSequence', 'PeptideModifiedSequence', + 'PrecursorCharge', 'Intensity', 'DetectionQValue', + 'PrecursorMz', 'FragmentIon', 'Run', 'ProductCharge') + + current_names <- colnames(input) + names_to_rename <- intersect(current_names, old_names) + + # Create a named vector for renaming in the format c(new_name = old_name) + new_names_subset <- new_names[match(names_to_rename, old_names)] + rename_map <- setNames(names_to_rename, new_names_subset) + rename_map <- rename_map[!is.na(names(rename_map))] + + input <- dplyr::rename(input, any_of(rename_map)) + + # Final column selection for MSstats format + msstats_cols <- c("ProteinName", "PeptideSequence", "PeptideModifiedSequence", "PrecursorCharge", + "FragmentIon", "ProductCharge", "Run", "Intensity") + + #TODO: confirm with Tony -- are these three needed? 
+ + # Add annotation columns if they exist + if ("Condition" %in% colnames(input)) msstats_cols <- c(msstats_cols, "Condition") + if ("BioReplicate" %in% colnames(input)) msstats_cols <- c(msstats_cols, "BioReplicate") + + # Add IsotopeLabelType, assuming Light for DIANN + input$IsotopeLabelType <- "L" + msstats_cols <- c(msstats_cols, "IsotopeLabelType") + + final_cols <- intersect(msstats_cols, colnames(input)) + input <- dplyr::select(input, all_of(final_cols)) + + # Write to file + if (!is.null(pos)) { + if (pos == 1) { + readr::write_csv(input, file = output_path, append = FALSE) + } else { + readr::write_csv(input, file = output_path, append = TRUE) + } + } + NULL +} \ No newline at end of file diff --git a/R/converters.R b/R/converters.R index 90bfca1..f66fa50 100644 --- a/R/converters.R +++ b/R/converters.R @@ -155,6 +155,50 @@ bigSpectronauttoMSstatsFormat <- function(input_file, output_file_name, } +#' Convert out-of-memory DIANN files to MSstats format. +#' +#' @inheritParams MSstatsPreprocessBig +#' @param MBR True if analysis was done with match between runs. +#' @param quantificationColumn Use 'FragmentQuantCorrected'(default) column for quantified intensities for DIANN 1.8.x. +#' Use 'FragmentQuantRaw' for quantified intensities for DIANN 1.9.x. +#' +#' @export +#' +#' @return either arrow object or sparklyr table that can be optionally collected +#' into memory by using dplyr::collect function. 
+#' +bigDIANNtoMSstatsFormat <- function(input_file, + output_file_name, + backend, + MBR = TRUE, + quantificationColumn = "Fragment.Quant.Corrected", + max_feature_count = 100, + filter_unique_peptides = FALSE, + aggregate_psms = FALSE, + filter_few_obs = FALSE, + remove_annotation = FALSE, + calculateAnomalyScores=FALSE, + anomalyModelFeatures=c(), + connection = NULL) { + + # Reduce and clean the DIANN report file in chunks + reduceBigDIANN(input_file, + paste0("reduce_output_", output_file_name), + MBR, + quantificationColumn) + + # Preprocess the cleaned data (feature selection, etc.) + msstats_data <- MSstatsPreprocessBig( + paste0("reduce_output_", output_file_name), + output_file_name, backend, max_feature_count, + filter_unique_peptides, aggregate_psms, filter_few_obs, + remove_annotation, calculateAnomalyScores, + anomalyModelFeatures, connection) + + return(msstats_data) +} + + #' Merge annotation to output of MSstatsPreprocessBig #' #' @param input output of MSstatsPreprocessBig @@ -185,4 +229,3 @@ bigSpectronauttoMSstatsFormat <- function(input_file, output_file_name, MSstatsAddAnnotationBig <- function(input, annotation) { dplyr::inner_join(input, annotation, by = "Run") } - diff --git a/man/bigDIANNtoMSstatsFormat.Rd b/man/bigDIANNtoMSstatsFormat.Rd new file mode 100644 index 0000000..e9d6360 --- /dev/null +++ b/man/bigDIANNtoMSstatsFormat.Rd @@ -0,0 +1,65 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/converters.R +\name{bigDIANNtoMSstatsFormat} +\alias{bigDIANNtoMSstatsFormat} +\title{Convert out-of-memory DIANN files to MSstats format.} +\usage{ +bigDIANNtoMSstatsFormat( + input_file, + output_file_name, + backend, + MBR = TRUE, + quantificationColumn = "Fragment.Quant.Corrected", + max_feature_count = 100, + filter_unique_peptides = FALSE, + aggregate_psms = FALSE, + filter_few_obs = FALSE, + remove_annotation = FALSE, + calculateAnomalyScores = FALSE, + anomalyModelFeatures = c(), + connection = NULL +) +} 
+\arguments{ +\item{input_file}{name of the input text file in 10-column MSstats format.} + +\item{output_file_name}{name of an output file which will be saved after pre-processing} + +\item{backend}{"arrow" or "sparklyr". Option "sparklyr" requires a spark installation +and connection to spark instance provided in the `connection` parameter.} + +\item{MBR}{True if analysis was done with match between runs.} + +\item{quantificationColumn}{Use 'FragmentQuantCorrected'(default) column for quantified intensities for DIANN 1.8.x. +Use 'FragmentQuantRaw' for quantified intensities for DIANN 1.9.x.} + +\item{max_feature_count}{maximum number of features per protein. Features will +be selected based on highest average intensity.} + +\item{filter_unique_peptides}{If TRUE, shared peptides will be removed. +Please refer to the `Details` section for additional information.} + +\item{aggregate_psms}{If TRUE, multiple measurements per PSM in a Run will +be aggregated (by taking maximum value). Please refer to the `Details` section for additional information.} + +\item{filter_few_obs}{If TRUE, feature with less than 3 observations across runs will be removed. +Please refer to the `Details` section for additional information.} + +\item{remove_annotation}{If TRUE, columns BioReplicate and Condition will be removed +to reduce output file size. These will need to be added manually later before +using dataProcess function. Only applicable to sparklyr backend.} + +\item{calculateAnomalyScores}{If TRUE, will carry anomaly model features through pipeline} + +\item{anomalyModelFeatures}{Character vector of column names to be carried through the pipeline} + +\item{connection}{Connection to a spark instance created with the +`spark_connect` function from `sparklyr` package.} +} +\value{ +either arrow object or sparklyr table that can be optionally collected +into memory by using dplyr::collect function. +} +\description{ +Convert out-of-memory DIANN files to MSstats format. 
+} diff --git a/tests/testthat/test-converters.R b/tests/testthat/test-converters.R new file mode 100644 index 0000000..8fee265 --- /dev/null +++ b/tests/testthat/test-converters.R @@ -0,0 +1,95 @@ +library(testthat) +library(mockery) + +context("General converter functions") + +test_that("MSstatsAddAnnotationBig adds annotation correctly", { + input_data <- data.frame( + Run = c("Run1", "Run2", "Run3"), + Intensity = c(100, 200, 300) + ) + + annotation_data <- data.frame( + Run = c("Run1", "Run2", "Run3"), + Condition = c("A", "A", "B"), + BioReplicate = c(1, 2, 1) + ) + + expected_output <- data.frame( + Run = c("Run1", "Run2", "Run3"), + Intensity = c(100, 200, 300), + Condition = c("A", "A", "B"), + BioReplicate = c(1, 2, 1) + ) + + result <- MSstatsAddAnnotationBig(input_data, annotation_data) + + expect_equal(result, expected_output) +}) + +test_that("MSstatsPreprocessBig performs feature selection correctly", { + input_file <- tempfile(fileext = ".csv") + output_file <- "preprocess_output.csv" + + # P1 has 3 features (frag1, frag2, frag3). frag3 has the highest avg intensity. + # P2 has 2 features (fragA, fragB). fragB has the highest avg intensity. 
+ msstats_data <- rbind( + data.frame(ProteinName = "P1", PeptideSequence = "PEPTIDE", PrecursorCharge = 2, FragmentIon = rep(c("frag1", "frag2", "frag3"), each = 2), ProductCharge = 1, IsotopeLabelType = "L", Condition = "A", BioReplicate = rep(1:2, 3), Run = rep(c("run1", "run2"), 3), Intensity = c(1000, 1100, 500, 550, 2000, 2100)), + data.frame(ProteinName = "P2", PeptideSequence = "PEPTIDE2", PrecursorCharge = 3, FragmentIon = rep(c("fragA", "fragB"), each = 2), ProductCharge = 1, IsotopeLabelType = "L", Condition = "B", BioReplicate = rep(1:2, 2), Run = rep(c("run1", "run2"), 2), Intensity = c(100, 150, 800, 850)) + ) + readr::write_csv(msstats_data, input_file) + + processed <- MSstatsPreprocessBig(input_file, output_file, backend = "arrow", + max_feature_count = 1) + result <- dplyr::collect(processed) + + # For P1, frag3 should be selected. For P2, fragB should be selected. + expect_equal(nrow(result), 4) + + p1_result <- result[result$ProteinName == "P1", ] + expect_equal(nrow(p1_result), 2) + expect_true(all(p1_result$FragmentIon == "frag3")) + + p2_result <- result[result$ProteinName == "P2", ] + expect_equal(nrow(p2_result), 2) + expect_true(all(p2_result$FragmentIon == "fragB")) + + # Cleanup + file.remove(input_file) + if (file.exists(output_file)) file.remove(output_file) +}) + +test_that("bigSpectronauttoMSstatsFormat works correctly", { + # Mock reduceBigSpectronaut as its source is not provided + mock_reduce <- mock(NULL) + + stub(bigSpectronauttoMSstatsFormat, "reduceBigSpectronaut", function(input_file, output_path, ...) 
{ + msstats_data <- data.frame( + ProteinName = "P1", PeptideSequence = "PEPTIDE", PrecursorCharge = 2, + FragmentIon = rep(c("frag1", "frag2"), each = 2), ProductCharge = 1, + IsotopeLabelType = "L", Condition = "A", BioReplicate = rep(1:2, 2), + Run = rep(c("run1", "run2"), 2), Intensity = c(1000, 1100, 2000, 2100) # frag2 is higher + ) + readr::write_csv(msstats_data, output_path) + }) + + input_file <- "dummy_spectro_input.csv" + output_file <- "spectro_output.csv" + + processed <- bigSpectronauttoMSstatsFormat( + input_file = input_file, + output_file_name = output_file, + backend = "arrow", + max_feature_count = 1 + ) + result <- dplyr::collect(processed) + + # The mock reduce function creates a file with 2 features for P1. + # max_feature_count = 1 should select frag2. + expect_equal(nrow(result), 2) + expect_true(all(result$FragmentIon == "frag2")) + + # Cleanup + if (file.exists(output_file)) file.remove(output_file) + if (file.exists(paste0("reduce_output_", output_file))) file.remove(paste0("reduce_output_", output_file)) +}) \ No newline at end of file diff --git a/tests/testthat/test-diann_converter.R b/tests/testthat/test-diann_converter.R new file mode 100644 index 0000000..358ff49 --- /dev/null +++ b/tests/testthat/test-diann_converter.R @@ -0,0 +1,109 @@ +library(testthat) + +context("DIANN converter functions") + +# Test for the internal cleanDIANNChunk function +test_that("cleanDIANNChunk processes data correctly", { + output_file <- tempfile(fileext = ".csv") + + diann_chunk_data <- data.frame( + Run = "run1", + Protein.Names = "ProteinA", + Stripped.Sequence = "PEPTIDE", + Modified.Sequence = "PEPTIDE(mod)", + Precursor.Charge = 2, + Fragment.Quant.Corrected = "100;200", + Q.Value = 0.005, + Precursor.Mz = 400.5, + Fragment.Info = "y7^1/1;b3-H2O^1/1", # One valid, one to be filtered + Lib.Q.Value = 0.01, + Lib.PG.Q.Value = 0.001, + stringsAsFactors = FALSE + ) + + # The function is not exported, so we use ::: + 
MSstatsBig:::cleanDIANNChunk(diann_chunk_data, output_file, MBR = TRUE, + quantificationColumn = "Fragment.Quant.Corrected", pos = 1) + + result <- read.csv(output_file) + + expect_equal(nrow(result), 1) + expect_equal(result$ProteinName, "ProteinA") + expect_equal(result$PeptideSequence, "PEPTIDE") + expect_equal(result$Intensity, 100) + expect_equal(result$FragmentIon, "y7^1/1") + expect_equal(result$ProductCharge, 1) + expect_equal(result$IsotopeLabelType, "L") + expect_true("PeptideModifiedSequence" %in% colnames(result)) + + file.remove(output_file) +}) + +# Test for the internal reduceBigDIANN function +test_that("reduceBigDIANN processes a file correctly", { + input_file <- tempfile(fileext = ".csv") + output_file <- tempfile(fileext = ".csv") + + diann_data <- data.frame( + Run = c("run1", "run1"), + Protein.Names = c("ProteinA", "ProteinB"), + Stripped.Sequence = c("PEPTIDE_A", "PEPTIDE_B"), + Modified.Sequence = c("PEPTIDE_A(mod)", "PEPTIDE_B"), + Precursor.Charge = c(2, 3), + Fragment.Quant.Corrected = c("100;200", "300"), + Q.Value = c(0.005, 0.006), + Precursor.Mz = c(400.5, 500.5), + Fragment.Info = c("y7^1/1;b3-H2O^1/1", "y5^1/2"), + Lib.Q.Value = c(0.01, 0.02), + Lib.PG.Q.Value = c(0.001, 0.002), + stringsAsFactors = FALSE + ) + write.csv(diann_data, input_file, row.names = FALSE) + + MSstatsBig:::reduceBigDIANN(input_file, output_file, MBR = TRUE, + quantificationColumn = "Fragment.Quant.Corrected") + + result <- read.csv(output_file) + expect_equal(nrow(result), 2) + expect_equal(result$Intensity, c(100, 300)) + expect_equal(result$ProteinName, c("ProteinA", "ProteinB")) + expect_equal(result$ProductCharge, c(1, 2)) + expect_equal(result$FragmentIon, c("y7^1/1", "y5^1/2")) + + file.remove(input_file) + file.remove(output_file) +}) + +# End-to-end test for bigDIANNtoMSstatsFormat +test_that("bigDIANNtoMSstatsFormat works with arrow backend", { + input_file <- tempfile(fileext = ".csv") + output_file <- "test_diann_output.csv" + + # 4 features for 
one protein. Feature selection should pick the top 2. + diann_data <- rbind( + data.frame(Run = c("r1", "r2"), Protein.Names = "P1", Stripped.Sequence = "PEPTIDE", Modified.Sequence = "PEPTIDE", Precursor.Charge = 2, Fragment.Quant.Corrected = c(1000, 1100), Q.Value = 0.01, Precursor.Mz = 500, Fragment.Info = "y1", Lib.Q.Value = 0.01, Lib.PG.Q.Value = 0.01), + data.frame(Run = c("r1", "r2"), Protein.Names = "P1", Stripped.Sequence = "PEPTIDE", Modified.Sequence = "PEPTIDE", Precursor.Charge = 2, Fragment.Quant.Corrected = c(500, 600), Q.Value = 0.01, Precursor.Mz = 500, Fragment.Info = "y2", Lib.Q.Value = 0.01, Lib.PG.Q.Value = 0.01), + data.frame(Run = c("r1", "r2"), Protein.Names = "P1", Stripped.Sequence = "PEPTIDE", Modified.Sequence = "PEPTIDE", Precursor.Charge = 2, Fragment.Quant.Corrected = c(100, 100), Q.Value = 0.01, Precursor.Mz = 500, Fragment.Info = "y3", Lib.Q.Value = 0.01, Lib.PG.Q.Value = 0.01), + data.frame(Run = c("r1", "r2"), Protein.Names = "P1", Stripped.Sequence = "PEPTIDE", Modified.Sequence = "PEPTIDE", Precursor.Charge = 2, Fragment.Quant.Corrected = c(2000, 2100), Q.Value = 0.01, Precursor.Mz = 500, Fragment.Info = "y4", Lib.Q.Value = 0.01, Lib.PG.Q.Value = 0.01) + ) + write.csv(diann_data, input_file, row.names = FALSE) + + converted <- bigDIANNtoMSstatsFormat( + input_file = input_file, + output_file_name = output_file, + backend = "arrow", + max_feature_count = 2 + ) + result <- dplyr::collect(converted) + + # Avg intensities: y1=1050, y2=550, y3=100, y4=2050. + # Top 2 features are y4 and y1. 
+ expect_equal(nrow(result), 4) # 2 features * 2 runs + expect_true(all(c("y1", "y4") %in% unique(result$FragmentIon))) + expect_false(any(c("y2", "y3") %in% unique(result$FragmentIon))) + + # Cleanup + file.remove(input_file) + if (file.exists(output_file)) file.remove(output_file) + if (file.exists(paste0("reduce_output_", output_file))) file.remove(paste0("reduce_output_", output_file)) +}) \ No newline at end of file diff --git a/tests/testthat/topN_preprocess_output.csv b/tests/testthat/topN_preprocess_output.csv new file mode 100644 index 0000000..d9afa98 --- /dev/null +++ b/tests/testthat/topN_preprocess_output.csv @@ -0,0 +1,5 @@ +"ProteinName","PeptideSequence","PrecursorCharge","FragmentIon","ProductCharge","IsotopeLabelType","Condition","BioReplicate","Run","Intensity" +"P1","PEPTIDE",2,"frag3",1,"L","A",1,"run1",2000 +"P1","PEPTIDE",2,"frag3",1,"L","A",2,"run2",2100 +"P2","PEPTIDE2",3,"fragB",1,"L","B",1,"run1",800 +"P2","PEPTIDE2",3,"fragB",1,"L","B",2,"run2",850 diff --git a/tests/testthat/topN_spectro_output.csv b/tests/testthat/topN_spectro_output.csv new file mode 100644 index 0000000..b103ba2 --- /dev/null +++ b/tests/testthat/topN_spectro_output.csv @@ -0,0 +1,3 @@ +"ProteinName","PeptideSequence","PrecursorCharge","FragmentIon","ProductCharge","IsotopeLabelType","Condition","BioReplicate","Run","Intensity" +"P1","PEPTIDE",2,"frag2",1,"L","A",1,"run1",2000 +"P1","PEPTIDE",2,"frag2",1,"L","A",2,"run2",2100 diff --git a/tests/testthat/topN_test_diann_output.csv b/tests/testthat/topN_test_diann_output.csv new file mode 100644 index 0000000..95d41b3 --- /dev/null +++ b/tests/testthat/topN_test_diann_output.csv @@ -0,0 +1,5 @@ +"ProteinName","PeptideSequence","PeptideModifiedSequence","PrecursorCharge","FragmentIon","ProductCharge","Run","Intensity","IsotopeLabelType" +"P1","PEPTIDE","PEPTIDE",2,"y1",1,"r1",1000,"L" +"P1","PEPTIDE","PEPTIDE",2,"y1",1,"r2",1100,"L" +"P1","PEPTIDE","PEPTIDE",2,"y4",1,"r1",2000,"L" 
+"P1","PEPTIDE","PEPTIDE",2,"y4",1,"r2",2100,"L" From 005ced91c12b33fcda8e18311414dad937320465 Mon Sep 17 00:00:00 2001 From: Rudhik1904 Date: Mon, 19 Jan 2026 22:38:37 -0600 Subject: [PATCH 2/8] Responding to comments --- DESCRIPTION | 6 +++--- R/clean_DIANN.R | 29 +++++++++++++++++++++++---- R/converters.R | 2 +- tests/testthat/test-diann_converter.R | 12 +++++------ 4 files changed, 35 insertions(+), 14 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index e695441..d71d7da 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -22,10 +22,10 @@ Imports: MSstatsConvert, readr, sparklyr, - utils, + utils +Suggests: testthat, - mockery -Suggests: + mockery, knitr, rmarkdown VignetteBuilder: knitr diff --git a/R/clean_DIANN.R b/R/clean_DIANN.R index 74c6736..260fc6b 100644 --- a/R/clean_DIANN.R +++ b/R/clean_DIANN.R @@ -1,6 +1,9 @@ #' @keywords internal reduceBigDIANN <- function(input_file, output_path, MBR = TRUE, - quantificationColumn = "FragmentQuantCorrected") { + quantificationColumn = "FragmentQuantCorrected", + global_qvalue_cutoff = 0.01, + qvalue_cutoff = 0.01, + pg_qvalue_cutoff = 0.01) { if (grepl("csv", input_file)) { delim = "," } else if (grepl("tsv|xls", input_file)) { @@ -13,7 +16,10 @@ reduceBigDIANN <- function(input_file, output_path, MBR = TRUE, output_path, MBR, quantificationColumn, - pos) + pos, + global_qvalue_cutoff, + qvalue_cutoff, + pg_qvalue_cutoff) readr::read_delim_chunked(input_file, readr::DataFrameCallback$new(diann_chunk), delim = delim, @@ -21,7 +27,10 @@ reduceBigDIANN <- function(input_file, output_path, MBR = TRUE, } #' @keywords internal -cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos) { +cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos, + global_qvalue_cutoff = 0.01, + qvalue_cutoff = 0.01, + pg_qvalue_cutoff = 0.01) { # 1. 
Select required columns base_cols <- c('Protein.Names', 'Stripped.Sequence', 'Modified.Sequence', @@ -44,14 +53,18 @@ cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos) { } # 3. Process fragment information + + #Convert Intensity to Numeric from Char strings input[[quantificationColumn]] <- as.numeric(input[[quantificationColumn]]) input <- dplyr::mutate( input, FragmentIon = sub('\\^\\.\\*', '', .data$Fragment.Info), + + # Extract product charge ProductCharge = dplyr::if_else( grepl("/", .data$Fragment.Info), - # Extract charge, default to 1 if parsing fails + # Extract charge (number right after "/" in string), default to 1 if parsing fails as.integer(stringr::str_extract(.data$Fragment.Info, "(?<=/)[0-9]+")), 1L ) @@ -83,6 +96,14 @@ cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos) { rename_map <- rename_map[!is.na(names(rename_map))] input <- dplyr::rename(input, any_of(rename_map)) + # Filter by Q-values + input <- dplyr::filter(input, DetectionQValue < global_qvalue_cutoff) + + if (MBR) { + input <- dplyr::filter(input, LibPGQValue < pg_qvalue_cutoff & LibQValue < qvalue_cutoff) + } else { + input <- dplyr::filter(input, GlobalPGQValue < pg_qvalue_cutoff & GlobalQValue < qvalue_cutoff) + } # Final column selection for MSstats format msstats_cols <- c("ProteinName", "PeptideSequence", "PeptideModifiedSequence", "PrecursorCharge", diff --git a/R/converters.R b/R/converters.R index f66fa50..a96fead 100644 --- a/R/converters.R +++ b/R/converters.R @@ -159,7 +159,7 @@ bigSpectronauttoMSstatsFormat <- function(input_file, output_file_name, #' #' @inheritParams MSstatsPreprocessBig #' @param MBR True if analysis was done with match between runs. -#' @param quantificationColumn Use 'FragmentQuantCorrected'(default) column for quantified intensities for DIANN 1.8.x. +#' @param quantificationColumn Use 'Fragment.Quant.Corrected'(default) column for quantified intensities for DIANN 1.8.x. 
#' Use 'FragmentQuantRaw' for quantified intensities for DIANN 1.9.x. #' #' @export diff --git a/tests/testthat/test-diann_converter.R b/tests/testthat/test-diann_converter.R index 358ff49..ead17f7 100644 --- a/tests/testthat/test-diann_converter.R +++ b/tests/testthat/test-diann_converter.R @@ -16,7 +16,7 @@ test_that("cleanDIANNChunk processes data correctly", { Q.Value = 0.005, Precursor.Mz = 400.5, Fragment.Info = "y7^1/1;b3-H2O^1/1", # One valid, one to be filtered - Lib.Q.Value = 0.01, + Lib.Q.Value = 0.001, Lib.PG.Q.Value = 0.001, stringsAsFactors = FALSE ) @@ -54,7 +54,7 @@ test_that("reduceBigDIANN processes a file correctly", { Q.Value = c(0.005, 0.006), Precursor.Mz = c(400.5, 500.5), Fragment.Info = c("y7^1/1;b3-H2O^1/1", "y5^1/2"), - Lib.Q.Value = c(0.01, 0.02), + Lib.Q.Value = c(0.001, 0.002), Lib.PG.Q.Value = c(0.001, 0.002), stringsAsFactors = FALSE ) @@ -81,10 +81,10 @@ test_that("bigDIANNtoMSstatsFormat works with arrow backend", { # 4 features for one protein. Feature selection should pick the top 2. 
diann_data <- rbind( - data.frame(Run = c("r1", "r2"), Protein.Names = "P1", Stripped.Sequence = "PEPTIDE", Modified.Sequence = "PEPTIDE", Precursor.Charge = 2, Fragment.Quant.Corrected = c(1000, 1100), Q.Value = 0.01, Precursor.Mz = 500, Fragment.Info = "y1", Lib.Q.Value = 0.01, Lib.PG.Q.Value = 0.01), - data.frame(Run = c("r1", "r2"), Protein.Names = "P1", Stripped.Sequence = "PEPTIDE", Modified.Sequence = "PEPTIDE", Precursor.Charge = 2, Fragment.Quant.Corrected = c(500, 600), Q.Value = 0.01, Precursor.Mz = 500, Fragment.Info = "y2", Lib.Q.Value = 0.01, Lib.PG.Q.Value = 0.01), - data.frame(Run = c("r1", "r2"), Protein.Names = "P1", Stripped.Sequence = "PEPTIDE", Modified.Sequence = "PEPTIDE", Precursor.Charge = 2, Fragment.Quant.Corrected = c(100, 100), Q.Value = 0.01, Precursor.Mz = 500, Fragment.Info = "y3", Lib.Q.Value = 0.01, Lib.PG.Q.Value = 0.01), - data.frame(Run = c("r1", "r2"), Protein.Names = "P1", Stripped.Sequence = "PEPTIDE", Modified.Sequence = "PEPTIDE", Precursor.Charge = 2, Fragment.Quant.Corrected = c(2000, 2100), Q.Value = 0.01, Precursor.Mz = 500, Fragment.Info = "y4", Lib.Q.Value = 0.01, Lib.PG.Q.Value = 0.01) + data.frame(Run = c("r1", "r2"), Protein.Names = "P1", Stripped.Sequence = "PEPTIDE", Modified.Sequence = "PEPTIDE", Precursor.Charge = 2, Fragment.Quant.Corrected = c(1000, 1100), Q.Value = 0.001, Precursor.Mz = 500, Fragment.Info = "y1", Lib.Q.Value = 0.001, Lib.PG.Q.Value = 0.001), + data.frame(Run = c("r1", "r2"), Protein.Names = "P1", Stripped.Sequence = "PEPTIDE", Modified.Sequence = "PEPTIDE", Precursor.Charge = 2, Fragment.Quant.Corrected = c(500, 600), Q.Value = 0.001, Precursor.Mz = 500, Fragment.Info = "y2", Lib.Q.Value = 0.001, Lib.PG.Q.Value = 0.001), + data.frame(Run = c("r1", "r2"), Protein.Names = "P1", Stripped.Sequence = "PEPTIDE", Modified.Sequence = "PEPTIDE", Precursor.Charge = 2, Fragment.Quant.Corrected = c(100, 100), Q.Value = 0.001, Precursor.Mz = 500, Fragment.Info = "y3", Lib.Q.Value = 0.001, Lib.PG.Q.Value 
= 0.001), + data.frame(Run = c("r1", "r2"), Protein.Names = "P1", Stripped.Sequence = "PEPTIDE", Modified.Sequence = "PEPTIDE", Precursor.Charge = 2, Fragment.Quant.Corrected = c(2000, 2100), Q.Value = 0.001, Precursor.Mz = 500, Fragment.Info = "y4", Lib.Q.Value = 0.001, Lib.PG.Q.Value = 0.001) ) write.csv(diann_data, input_file, row.names = FALSE) From d8820919f4be8e3b2286c50faabc6137f0d7c11d Mon Sep 17 00:00:00 2001 From: Rudhik1904 Date: Tue, 20 Jan 2026 00:01:51 -0600 Subject: [PATCH 3/8] Adding handling for DIANN 2.0 --- R/clean_DIANN.R | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/R/clean_DIANN.R b/R/clean_DIANN.R index 260fc6b..e8f8457 100644 --- a/R/clean_DIANN.R +++ b/R/clean_DIANN.R @@ -31,6 +31,15 @@ cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos, global_qvalue_cutoff = 0.01, qvalue_cutoff = 0.01, pg_qvalue_cutoff = 0.01) { + # 0. Handle parquet files if needed + if (quantificationColumn == "auto") { + fragment_columns <- grep("^Fr[0-9]+Quantity$", colnames(input), value = TRUE) + if (length(fragment_columns) == 0) { + stop("No fragment quantification columns found. Please check your input.") + } + input <- tidyr::unite(input, "FragmentQuantCorrected", all_of(fragment_columns), sep = ";") + quantificationColumn <- "FragmentQuantCorrected" + } # 1. 
Select required columns base_cols <- c('Protein.Names', 'Stripped.Sequence', 'Modified.Sequence', From 6728b7c384572d1b37435ec804628fd68d28b2a7 Mon Sep 17 00:00:00 2001 From: Rudhik1904 Date: Tue, 20 Jan 2026 00:02:22 -0600 Subject: [PATCH 4/8] Adding tests for DIANN 2.0 --- tests/testthat/test-diann_converter.R | 38 +++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/tests/testthat/test-diann_converter.R b/tests/testthat/test-diann_converter.R index ead17f7..f4f0c98 100644 --- a/tests/testthat/test-diann_converter.R +++ b/tests/testthat/test-diann_converter.R @@ -39,6 +39,44 @@ test_that("cleanDIANNChunk processes data correctly", { file.remove(output_file) }) +test_that("cleanDIANNChunk handles 'auto' quantification column correctly", { + output_file <- tempfile(fileext = ".csv") + + # Data with wide format fragment quantification + diann_chunk_wide <- data.frame( + Run = "run1", + Protein.Names = "ProteinA", + Stripped.Sequence = "PEPTIDE", + Modified.Sequence = "PEPTIDE", + Precursor.Charge = 2, + Fr1Quantity = 100, + Fr2Quantity = 200, + Q.Value = 0.005, + Precursor.Mz = 400.5, + Fragment.Info = "y1^1/1;y2^1/1", + Lib.Q.Value = 0.001, + Lib.PG.Q.Value = 0.001, + stringsAsFactors = FALSE + ) + + MSstatsBig:::cleanDIANNChunk(diann_chunk_wide, output_file, MBR = TRUE, + quantificationColumn = "auto", pos = 1) + + result <- read.csv(output_file) + + expect_equal(nrow(result), 2) + expect_equal(sort(result$Intensity), c(100, 200)) + expect_equal(sort(result$FragmentIon), c("y1^1/1", "y2^1/1")) + + file.remove(output_file) + + # Test error when columns are missing + diann_chunk_missing <- diann_chunk_wide[, !grepl("Quantity", names(diann_chunk_wide))] + expect_error(MSstatsBig:::cleanDIANNChunk(diann_chunk_missing, output_file, MBR = TRUE, + quantificationColumn = "auto", pos = 1), + "No fragment quantification columns found") +}) + # Test for the internal reduceBigDIANN function test_that("reduceBigDIANN processes a file correctly", { 
input_file <- tempfile(fileext = ".csv") From a041a72edca8e3d99ab9f88e978cb7fbb0e81ee1 Mon Sep 17 00:00:00 2001 From: Rudhik1904 Date: Tue, 20 Jan 2026 00:07:46 -0600 Subject: [PATCH 5/8] Making the code more readable, and easier to follow --- R/clean_DIANN.R | 112 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 83 insertions(+), 29 deletions(-) diff --git a/R/clean_DIANN.R b/R/clean_DIANN.R index e8f8457..d03ae2a 100644 --- a/R/clean_DIANN.R +++ b/R/clean_DIANN.R @@ -31,7 +31,40 @@ cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos, global_qvalue_cutoff = 0.01, qvalue_cutoff = 0.01, pg_qvalue_cutoff = 0.01) { - # 0. Handle parquet files if needed + # 1. Handle "auto" quantification column + processed <- .handleAutoQuantification(input, quantificationColumn) + input <- processed$input + quantificationColumn <- processed$quantificationColumn + + # 2. Select required columns + input <- .selectDIANNColumns(input, MBR, quantificationColumn) + + # 3. Expand concatenated rows + input <- .expandDIANNRows(input, quantificationColumn) + + # 4. Process fragment info (extract intensity, charge, ion) + input <- .processDIANNFragmentInfo(input, quantificationColumn) + + # 5. Filter invalid fragments + input <- .filterDIANNFragments(input, quantificationColumn) + + # 6. Standardize column names + input <- .standardizeDIANNColumns(input, quantificationColumn) + + # 7. Filter by Q-values + input <- .filterDIANNByQValues(input, MBR, global_qvalue_cutoff, qvalue_cutoff, pg_qvalue_cutoff) + + # 8. Finalize columns (select final set, add IsotopeLabelType) + input <- .finalizeDIANNColumns(input) + + # 9. 
Write to output + .writeDIANNChunk(input, output_path, pos) + + NULL +} + +#' @keywords internal +.handleAutoQuantification <- function(input, quantificationColumn) { if (quantificationColumn == "auto") { fragment_columns <- grep("^Fr[0-9]+Quantity$", colnames(input), value = TRUE) if (length(fragment_columns) == 0) { @@ -40,8 +73,11 @@ cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos, input <- tidyr::unite(input, "FragmentQuantCorrected", all_of(fragment_columns), sep = ";") quantificationColumn <- "FragmentQuantCorrected" } - - # 1. Select required columns + list(input = input, quantificationColumn = quantificationColumn) +} + +#' @keywords internal +.selectDIANNColumns <- function(input, MBR, quantificationColumn) { base_cols <- c('Protein.Names', 'Stripped.Sequence', 'Modified.Sequence', 'Precursor.Charge', quantificationColumn, 'Q.Value', 'Precursor.Mz', 'Fragment.Info', 'Run') @@ -53,23 +89,28 @@ cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos, } req_cols <- intersect(c(base_cols, mbr_cols), colnames(input)) - input <- dplyr::select(input, all_of(req_cols)) - - # 2. Split concatenated values (un-nest) + dplyr::select(input, all_of(req_cols)) +} + +#' @keywords internal +.expandDIANNRows <- function(input, quantificationColumn) { split_cols <- intersect(c(quantificationColumn, "Fragment.Info"), colnames(input)) if (length(split_cols) > 0) { - input <- tidyr::separate_rows(input, all_of(split_cols), sep = ";") + tidyr::separate_rows(input, all_of(split_cols), sep = ";") + } else { + input } - - # 3. 
Process fragment information +} - #Convert Intensity to Numeric from Char strings +#' @keywords internal +.processDIANNFragmentInfo <- function(input, quantificationColumn) { + # Convert Intensity to Numeric from Char strings input[[quantificationColumn]] <- as.numeric(input[[quantificationColumn]]) - input <- dplyr::mutate( + dplyr::mutate( input, FragmentIon = sub('\\^\\.\\*', '', .data$Fragment.Info), - + # Extract product charge ProductCharge = dplyr::if_else( grepl("/", .data$Fragment.Info), @@ -78,19 +119,24 @@ cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos, 1L ) ) - - # 4. Clean and filter data - input <- dplyr::filter( +} + +#' @keywords internal +.filterDIANNFragments <- function(input, quantificationColumn) { + dplyr::filter( input, !grepl("NH3|H2O", .data$FragmentIon) & !is.na(.data[[quantificationColumn]]) ) - - # 5. Rename columns to MSstats standard +} + +#' @keywords internal +.standardizeDIANNColumns <- function(input, quantificationColumn) { input <- dplyr::rename_with(input, .fn = function(x) gsub("\\.", "", x)) # Standardize column names + clean_quant_col <- gsub("\\.", "", quantificationColumn) old_names <- c('ProteinNames', 'StrippedSequence', 'ModifiedSequence', - 'PrecursorCharge', gsub("\\.", "", quantificationColumn), 'QValue', + 'PrecursorCharge', clean_quant_col, 'QValue', 'PrecursorMz', 'FragmentIon', 'Run', 'ProductCharge') new_names <- c('ProteinName', 'PeptideSequence', 'PeptideModifiedSequence', 'PrecursorCharge', 'Intensity', 'DetectionQValue', @@ -98,39 +144,48 @@ cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos, current_names <- colnames(input) names_to_rename <- intersect(current_names, old_names) - + # Create a named vector for renaming in the format c(new_name = old_name) new_names_subset <- new_names[match(names_to_rename, old_names)] rename_map <- setNames(names_to_rename, new_names_subset) rename_map <- rename_map[!is.na(names(rename_map))] + + dplyr::rename(input, 
any_of(rename_map)) +} - input <- dplyr::rename(input, any_of(rename_map)) - # Filter by Q-values +#' @keywords internal +.filterDIANNByQValues <- function(input, MBR, global_qvalue_cutoff, qvalue_cutoff, pg_qvalue_cutoff) { input <- dplyr::filter(input, DetectionQValue < global_qvalue_cutoff) if (MBR) { - input <- dplyr::filter(input, LibPGQValue < pg_qvalue_cutoff & LibQValue < qvalue_cutoff) + dplyr::filter(input, LibPGQValue < pg_qvalue_cutoff & LibQValue < qvalue_cutoff) } else { - input <- dplyr::filter(input, GlobalPGQValue < pg_qvalue_cutoff & GlobalQValue < qvalue_cutoff) + dplyr::filter(input, GlobalPGQValue < pg_qvalue_cutoff & GlobalQValue < qvalue_cutoff) } - +} + +#' @keywords internal +.finalizeDIANNColumns <- function(input) { # Final column selection for MSstats format msstats_cols <- c("ProteinName", "PeptideSequence", "PeptideModifiedSequence", "PrecursorCharge", "FragmentIon", "ProductCharge", "Run", "Intensity") - #TODO: confirm with Tony -- are these three needed? + # TODO: confirm with Tony -- are these three needed? 
# Add annotation columns if they exist if ("Condition" %in% colnames(input)) msstats_cols <- c(msstats_cols, "Condition") if ("BioReplicate" %in% colnames(input)) msstats_cols <- c(msstats_cols, "BioReplicate") - + # Add IsotopeLabelType, assuming Light for DIANN input$IsotopeLabelType <- "L" msstats_cols <- c(msstats_cols, "IsotopeLabelType") final_cols <- intersect(msstats_cols, colnames(input)) - input <- dplyr::select(input, all_of(final_cols)) - + dplyr::select(input, all_of(final_cols)) +} + +#' @keywords internal +.writeDIANNChunk <- function(input, output_path, pos) { # Write to file if (!is.null(pos)) { if (pos == 1) { @@ -139,5 +194,4 @@ cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos, readr::write_csv(input, file = output_path, append = TRUE) } } - NULL } \ No newline at end of file From 43e68b2ea130bd2fa2153be6313c08c904de5ca9 Mon Sep 17 00:00:00 2001 From: Rudhik1904 Date: Tue, 20 Jan 2026 01:13:22 -0600 Subject: [PATCH 6/8] Adding missing columns code and a test to make sure that when Fragment.Info is NA, ProductCharge is set to 1 --- R/clean_DIANN.R | 92 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 89 insertions(+), 3 deletions(-) diff --git a/R/clean_DIANN.R b/R/clean_DIANN.R index d03ae2a..5d85389 100644 --- a/R/clean_DIANN.R +++ b/R/clean_DIANN.R @@ -1,3 +1,13 @@ +#' Read and clean a large DIANN file in chunks +#' +#' @param input_file Path to the input DIANN file +#' @param output_path Path to the output CSV file +#' @param MBR Boolean, whether MBR was used +#' @param quantificationColumn Name of the column containing intensity values +#' @param global_qvalue_cutoff Global Q-value cutoff +#' @param qvalue_cutoff Q-value cutoff +#' @param pg_qvalue_cutoff Protein group Q-value cutoff +#' @return NULL. Writes to file. 
#' @keywords internal reduceBigDIANN <- function(input_file, output_path, MBR = TRUE, quantificationColumn = "FragmentQuantCorrected", @@ -26,6 +36,17 @@ reduceBigDIANN <- function(input_file, output_path, MBR = TRUE, chunk_size = 1e6) } +#' Clean a single chunk of DIANN data +#' +#' @param input Data frame chunk +#' @param output_path Path to output file +#' @param MBR Boolean, whether MBR was used +#' @param quantificationColumn Name of intensity column +#' @param pos Chunk position (1 for first chunk, >1 for subsequent) +#' @param global_qvalue_cutoff Global Q-value cutoff +#' @param qvalue_cutoff Q-value cutoff +#' @param pg_qvalue_cutoff Protein group Q-value cutoff +#' @return NULL #' @keywords internal cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos, global_qvalue_cutoff = 0.01, @@ -38,6 +59,7 @@ cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos, # 2. Select required columns input <- .selectDIANNColumns(input, MBR, quantificationColumn) + input <- .cleanDIANNAddMissingColumns(input) # 3. 
Expand concatenated rows input <- .expandDIANNRows(input, quantificationColumn) @@ -63,6 +85,11 @@ cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos, NULL } +#' Handle automatic detection of quantification columns +#' +#' @param input Data frame +#' @param quantificationColumn Name of column or "auto" +#' @return List with input data frame and updated quantification column name #' @keywords internal .handleAutoQuantification <- function(input, quantificationColumn) { if (quantificationColumn == "auto") { @@ -76,6 +103,12 @@ cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos, list(input = input, quantificationColumn = quantificationColumn) } +#' Select required columns from DIANN output +#' +#' @param input Data frame +#' @param MBR Boolean +#' @param quantificationColumn Name of intensity column +#' @return Data frame with selected columns #' @keywords internal .selectDIANNColumns <- function(input, MBR, quantificationColumn) { base_cols <- c('Protein.Names', 'Stripped.Sequence', 'Modified.Sequence', @@ -92,6 +125,26 @@ cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos, dplyr::select(input, all_of(req_cols)) } +#' Add missing required columns +#' +#' @param input Data frame +#' @return Data frame with missing columns added +#' @keywords internal +.cleanDIANNAddMissingColumns <- function(input) { + if (!"Precursor.Mz" %in% colnames(input)) { + input <- dplyr::mutate(input, Precursor.Mz = NA) + } + if (!"Fragment.Info" %in% colnames(input)) { + input <- dplyr::mutate(input, Fragment.Info = NA) + } + input +} + +#' Expand rows with multiple fragments +#' +#' @param input Data frame +#' @param quantificationColumn Name of intensity column +#' @return Data frame with expanded rows #' @keywords internal .expandDIANNRows <- function(input, quantificationColumn) { split_cols <- intersect(c(quantificationColumn, "Fragment.Info"), colnames(input)) @@ -102,6 +155,11 @@ cleanDIANNChunk = 
function(input, output_path, MBR, quantificationColumn, pos, } } +#' Process fragment information strings +#' +#' @param input Data frame +#' @param quantificationColumn Name of intensity column +#' @return Data frame with FragmentIon and ProductCharge columns #' @keywords internal .processDIANNFragmentInfo <- function(input, quantificationColumn) { # Convert Intensity to Numeric from Char strings @@ -116,19 +174,30 @@ cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos, grepl("/", .data$Fragment.Info), # Extract charge (number right after "/" in string), default to 1 if parsing fails as.integer(stringr::str_extract(.data$Fragment.Info, "(?<=/)[0-9]+")), - 1L + 1L, + missing = 1L ) ) } +#' Filter invalid fragments +#' +#' @param input Data frame +#' @param quantificationColumn Name of intensity column +#' @return Filtered data frame #' @keywords internal .filterDIANNFragments <- function(input, quantificationColumn) { dplyr::filter( input, - !grepl("NH3|H2O", .data$FragmentIon) & !is.na(.data[[quantificationColumn]]) + (!grepl("NH3|H2O", .data$FragmentIon) | is.na(.data$FragmentIon)) & !is.na(.data[[quantificationColumn]]) ) } +#' Standardize column names to MSstats format +#' +#' @param input Data frame +#' @param quantificationColumn Name of intensity column +#' @return Data frame with renamed columns #' @keywords internal .standardizeDIANNColumns <- function(input, quantificationColumn) { input <- dplyr::rename_with(input, .fn = function(x) gsub("\\.", "", x)) @@ -153,6 +222,14 @@ cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos, dplyr::rename(input, any_of(rename_map)) } +#' Filter data by Q-values +#' +#' @param input Data frame +#' @param MBR Boolean +#' @param global_qvalue_cutoff Numeric +#' @param qvalue_cutoff Numeric +#' @param pg_qvalue_cutoff Numeric +#' @return Filtered data frame #' @keywords internal .filterDIANNByQValues <- function(input, MBR, global_qvalue_cutoff, qvalue_cutoff, 
pg_qvalue_cutoff) { input <- dplyr::filter(input, DetectionQValue < global_qvalue_cutoff) @@ -164,13 +241,16 @@ cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos, } } +#' Finalize columns for output +#' +#' @param input Data frame +#' @return Data frame with final columns #' @keywords internal .finalizeDIANNColumns <- function(input) { # Final column selection for MSstats format msstats_cols <- c("ProteinName", "PeptideSequence", "PeptideModifiedSequence", "PrecursorCharge", "FragmentIon", "ProductCharge", "Run", "Intensity") - # TODO: confirm with Tony -- are these three needed? # Add annotation columns if they exist if ("Condition" %in% colnames(input)) msstats_cols <- c(msstats_cols, "Condition") @@ -184,6 +264,12 @@ cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos, dplyr::select(input, all_of(final_cols)) } +#' Write chunk to file +#' +#' @param input Data frame +#' @param output_path Path to output file +#' @param pos Chunk position +#' @return NULL #' @keywords internal .writeDIANNChunk <- function(input, output_path, pos) { # Write to file From d3772c9592943aaaf45e5355ea9429c710ea8304 Mon Sep 17 00:00:00 2001 From: Rudhik1904 Date: Tue, 20 Jan 2026 01:14:18 -0600 Subject: [PATCH 7/8] Test to make sure product charge is set to one when fragment.ion column is not present --- tests/testthat/test-diann_converter.R | 31 +++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/tests/testthat/test-diann_converter.R b/tests/testthat/test-diann_converter.R index f4f0c98..c434cb5 100644 --- a/tests/testthat/test-diann_converter.R +++ b/tests/testthat/test-diann_converter.R @@ -77,6 +77,37 @@ test_that("cleanDIANNChunk handles 'auto' quantification column correctly", { "No fragment quantification columns found") }) +test_that("cleanDIANNChunk handles missing Fragment.Info by defaulting ProductCharge to 1", { + output_file <- tempfile(fileext = ".csv") + + # Data with missing Fragment.Info 
(simulating it not being present) + diann_chunk_missing <- data.frame( + Run = "run1", + Protein.Names = "ProteinA", + Stripped.Sequence = "PEPTIDE", + Modified.Sequence = "PEPTIDE", + Precursor.Charge = 2, + Fragment.Quant.Corrected = 100, + Q.Value = 0.005, + Precursor.Mz = 400.5, + # Fragment.Info is missing + Lib.Q.Value = 0.001, + Lib.PG.Q.Value = 0.001, + stringsAsFactors = FALSE + ) + + MSstatsBig:::cleanDIANNChunk(diann_chunk_missing, output_file, MBR = TRUE, + quantificationColumn = "Fragment.Quant.Corrected", pos = 1) + + result <- read.csv(output_file) + + expect_equal(nrow(result), 1) + expect_equal(result$ProductCharge, 1) + expect_true(is.na(result$FragmentIon)) + + file.remove(output_file) +}) + # Test for the internal reduceBigDIANN function test_that("reduceBigDIANN processes a file correctly", { input_file <- tempfile(fileext = ".csv") From bfbcf7ce99c72da9500d109ec1a4e3b8755fef60 Mon Sep 17 00:00:00 2001 From: Rudhik1904 Date: Fri, 30 Jan 2026 11:46:33 -0600 Subject: [PATCH 8/8] Addressing comments --- R/clean_DIANN.R | 9 ++++++++- tests/testthat/test-diann_converter.R | 4 ++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/R/clean_DIANN.R b/R/clean_DIANN.R index 5d85389..6bd867d 100644 --- a/R/clean_DIANN.R +++ b/R/clean_DIANN.R @@ -10,7 +10,7 @@ #' @return NULL. Writes to file. 
#' @keywords internal reduceBigDIANN <- function(input_file, output_path, MBR = TRUE, - quantificationColumn = "FragmentQuantCorrected", + quantificationColumn = "Fragment.Quant.Corrected", global_qvalue_cutoff = 0.01, qvalue_cutoff = 0.01, pg_qvalue_cutoff = 0.01) { @@ -165,6 +165,13 @@ cleanDIANNChunk = function(input, output_path, MBR, quantificationColumn, pos, # Convert Intensity to Numeric from Char strings input[[quantificationColumn]] <- as.numeric(input[[quantificationColumn]]) + # Generate fragment info if missing + if (all(is.na(input$Fragment.Info))) { + input <- dplyr::group_by(input, Protein.Names, Modified.Sequence, Precursor.Charge, Run) + input <- dplyr::mutate(input, Fragment.Info = paste0("Frag", dplyr::row_number())) + input <- dplyr::ungroup(input) + } + dplyr::mutate( input, FragmentIon = sub('\\^\\.\\*', '', .data$Fragment.Info), diff --git a/tests/testthat/test-diann_converter.R b/tests/testthat/test-diann_converter.R index c434cb5..81b82cf 100644 --- a/tests/testthat/test-diann_converter.R +++ b/tests/testthat/test-diann_converter.R @@ -103,7 +103,7 @@ test_that("cleanDIANNChunk handles missing Fragment.Info by defaulting ProductCh expect_equal(nrow(result), 1) expect_equal(result$ProductCharge, 1) - expect_true(is.na(result$FragmentIon)) + expect_equal(result$FragmentIon, "Frag1") file.remove(output_file) }) @@ -146,7 +146,7 @@ test_that("reduceBigDIANN processes a file correctly", { # End-to-end test for bigDIANNtoMSstatsFormat test_that("bigDIANNtoMSstatsFormat works with arrow backend", { input_file <- tempfile(fileext = ".csv") - output_file <- "test_diann_output.csv" + output_file <- basename(tempfile(fileext = ".csv")) # 4 features for one protein. Feature selection should pick the top 2. diann_data <- rbind(