From b99bb785100a72e62f092ab18d1e4afed9adf1e4 Mon Sep 17 00:00:00 2001 From: tonywu1999 Date: Thu, 9 Apr 2026 15:59:49 -0400 Subject: [PATCH 1/5] fix(fractions): Upgrade code to be more efficient for fraction selection --- R/utils_fractions.R | 40 +++++++++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/R/utils_fractions.R b/R/utils_fractions.R index 7bddb871..33011942 100644 --- a/R/utils_fractions.R +++ b/R/utils_fractions.R @@ -267,11 +267,41 @@ fraction_keep = Fraction = NULL if (data.table::uniqueN(input$Fraction) > 1) { - input[, fraction_keep := .getCorrectFraction(.SD), - by = "feature", - .SDcols = c("feature", "Fraction", "Run", "Intensity")] - input = input[Fraction == fraction_keep] - input = input[, !(colnames(input) == "fraction_keep"), with = FALSE] + # Step 1: count unique Runs per feature+Fraction + measurement_count = input[ + !is.na(Intensity) & Intensity > 0, + .(n_obs = uniqueN(Run)), + by = .(feature, Fraction) + ] + + # Step 2: keep only the Fraction(s) with max n_obs per feature + measurement_count[, is_max := n_obs == max(n_obs), by = "feature"] + max_fractions = measurement_count[(is_max)] + + # Step 3: if tie, resolve by mean intensity + tie_features = max_fractions[, .(n_ties = .N), by = "feature"][n_ties > 1, feature] + + if (length(tie_features) > 0) { + avg_abundance = input[ + feature %in% tie_features & !is.na(Intensity) & Intensity > 0, + .(mean_abundance = mean(Intensity)), + by = .(feature, Fraction) + ] + # Pick the Fraction with highest mean per tied feature + best_tied = avg_abundance[, .SD[which.max(mean_abundance)], by = "feature"] + + # For non-tied features, just take the max fraction + best_simple = max_fractions[!feature %in% tie_features, + .(feature, Fraction = Fraction[1]), + by = "feature"] + fraction_map = rbind(best_simple[, .(feature, Fraction)], + best_tied[, .(feature, Fraction)]) + } else { + fraction_map = max_fractions[, .(feature, Fraction = Fraction[1]), by = "feature"] + } + + # Step 4: single join back to original table + input[fraction_map, fraction_keep := i.Fraction, on = "feature"] } input } From a248ff533a8c9cd565b75b5593b97619a14c2fa9 Mon Sep 17 00:00:00 2001 From: tonywu1999 Date: Thu, 9 Apr 2026 16:20:25 -0400 Subject: [PATCH 2/5] keep fractions that match --- R/utils_fractions.R | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/R/utils_fractions.R b/R/utils_fractions.R index 33011942..0e27d5dd 100644 --- a/R/utils_fractions.R +++ b/R/utils_fractions.R @@ -287,21 +287,20 @@ .(mean_abundance = mean(Intensity)), by = .(feature, Fraction) ] - # Pick the Fraction with highest mean per tied feature best_tied = avg_abundance[, .SD[which.max(mean_abundance)], by = "feature"] - - # For non-tied features, just take the max fraction - best_simple = max_fractions[!feature %in% tie_features, - .(feature, Fraction = Fraction[1]), - by = "feature"] - fraction_map = rbind(best_simple[, .(feature, Fraction)], + best_simple = max_fractions[ + !feature %in% tie_features, + .(Fraction = Fraction[1]), + by = "feature" + ] + fraction_map = rbind(best_simple[, .(feature, Fraction)], best_tied[, .(feature, Fraction)]) } else { - fraction_map = max_fractions[, .(feature, Fraction = Fraction[1]), by = "feature"] + fraction_map = max_fractions[, .(Fraction = Fraction[1]), by = "feature"] } - # Step 4: single join back to original table - input[fraction_map, fraction_keep := i.Fraction, on = "feature"] + # Step 4: filter input to only rows matching the selected fraction per feature + input = input[fraction_map, on = .(feature, Fraction), nomatch = 0] } input } From 8685ba3b925a4981cc5e84fffa1026eaaa0d0c44 Mon Sep 17 00:00:00 2001 From: tonywu1999 Date: Thu, 9 Apr 2026 16:31:00 -0400 Subject: [PATCH 3/5] update code to be more readable --- R/utils_fractions.R | 70 +++++++++++++++++---------------------------- 1 file changed, 26 insertions(+), 44 deletions(-) diff --git a/R/utils_fractions.R b/R/utils_fractions.R index 0e27d5dd..fc8cf9f1 100644 --- a/R/utils_fractions.R +++ b/R/utils_fractions.R @@ -264,70 +264,52 @@ #' @return data.table #' @keywords internal .removeOverlappingFeatures = function(input) { - fraction_keep = Fraction = NULL + Fraction = NULL if (data.table::uniqueN(input$Fraction) > 1) { - # Step 1: count unique Runs per feature+Fraction measurement_count = input[ !is.na(Intensity) & Intensity > 0, .(n_obs = uniqueN(Run)), by = .(feature, Fraction) ] - - # Step 2: keep only the Fraction(s) with max n_obs per feature measurement_count[, is_max := n_obs == max(n_obs), by = "feature"] max_fractions = measurement_count[(is_max)] - # Step 3: if tie, resolve by mean intensity - tie_features = max_fractions[, .(n_ties = .N), by = "feature"][n_ties > 1, feature] - - if (length(tie_features) > 0) { - avg_abundance = input[ - feature %in% tie_features & !is.na(Intensity) & Intensity > 0, - .(mean_abundance = mean(Intensity)), - by = .(feature, Fraction) - ] - best_tied = avg_abundance[, .SD[which.max(mean_abundance)], by = "feature"] - best_simple = max_fractions[ - !feature %in% tie_features, - .(Fraction = Fraction[1]), - by = "feature" - ] - fraction_map = rbind(best_simple[, .(feature, Fraction)], - best_tied[, .(feature, Fraction)]) - } else { - fraction_map = max_fractions[, .(Fraction = Fraction[1]), by = "feature"] - } - - # Step 4: filter input to only rows matching the selected fraction per feature + fraction_map = .resolveFractionTies(input, max_fractions) input = input[fraction_map, on = .(feature, Fraction), nomatch = 0] } input } -#' Get a name of fraction with the largest number of measurements or the largest -#' average intensity +#' Resolve ties when multiple fractions share the maximum number of measurements +#' for a given feature. In the case of a tie, the fraction with the highest +#' mean intensity is selected. #' @param input output of `MSstatsPreprocess` -#' @return character - label of the fraction that has most measurements or -#' highest mean intensity for a given feature +#' @param max_fractions data.table of fractions that share the maximum number +#' of unique runs per feature, as produced by `.removeOverlappingFeatures` +#' @return data.table with columns `feature` and `Fraction`, containing one +#' selected fraction per feature #' @keywords internal -.getCorrectFraction = function(input) { - Intensity = Run = Fraction = NULL +.resolveFractionTies = function(input, max_fractions) { + tie_features = max_fractions[, .(n_ties = .N), by = "feature"][n_ties > 1, feature] - measurement_count = input[!is.na(Intensity) & Intensity > 0, - list(n_obs = data.table::uniqueN(Run)), - by = "Fraction"] - which_max_measurements = which(measurement_count$n_obs == max(measurement_count$n_obs)) - if (length(which_max_measurements) == 1L) { - return(unique(measurement_count$Fraction[which_max_measurements])) + if (length(tie_features) > 0) { + avg_abundance = input[ + feature %in% tie_features & !is.na(Intensity) & Intensity > 0, + .(mean_abundance = mean(Intensity)), + by = .(feature, Fraction) + ] + best_tied = avg_abundance[, .SD[which.max(mean_abundance)], by = "feature"] + best_simple = max_fractions[ + !feature %in% tie_features, + .(Fraction = Fraction[1]), + by = "feature" + ] + rbind(best_simple[, .(feature, Fraction)], + best_tied[, .(feature, Fraction)]) } else { - input = input[Fraction %in% measurement_count$Fraction[which_max_measurements]] - average_abundance = input[!is.na(Intensity) & Intensity > 0, - list(mean_abundance = mean(Intensity)), - by = "Fraction"] - which_max_abundance = which.max(average_abundance$mean_abundance) - unique(average_abundance$Fraction[which_max_abundance]) + max_fractions[, .(Fraction = Fraction[1]), by = "feature"] } } From 8aa5ec54b7bf62a51da40560962b882fc3987fe9 Mon Sep 17 00:00:00 2001 From: Tony Wu Date: Fri, 10 Apr 2026 09:08:30 -0400 Subject: [PATCH 4/5] fix unit tests --- inst/tinytest/test_fractions.R | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/inst/tinytest/test_fractions.R b/inst/tinytest/test_fractions.R index c2a89bb5..292d1213 100644 --- a/inst/tinytest/test_fractions.R +++ b/inst/tinytest/test_fractions.R @@ -67,12 +67,16 @@ fractionated = data.table::data.table( Run = 1:12, Intensity = c(NA, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2) ) -picked_A = MSstatsConvert:::.getCorrectFraction(fractionated[feature == "A"]) -picked_B = MSstatsConvert:::.getCorrectFraction(fractionated[feature == "B"]) ### More observations win -expect_equal(picked_A, 2) -### Higher average intensity wins -expect_equal(picked_B, 2) +expect_equal( + unique(MSstatsConvert:::.removeOverlappingFeatures(fractionated[feature == "A"])$Fraction), + 2 +) +### Higher average intensity wins on tie +expect_equal( + unique(MSstatsConvert:::.removeOverlappingFeatures(fractionated[feature == "B"])$Fraction), + 2 +) ### For full data expect_identical( MSstatsConvert:::.removeOverlappingFeatures(data.table::copy(fractionated)), From 9c3990d069dac2eefd82ff9a8838ffc432d59b1e Mon Sep 17 00:00:00 2001 From: Tony Wu Date: Fri, 10 Apr 2026 09:47:38 -0400 Subject: [PATCH 5/5] fix edge case --- R/utils_fractions.R | 9 +++++---- inst/tinytest/test_fractions.R | 13 +++++++++++++ 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/R/utils_fractions.R b/R/utils_fractions.R index fc8cf9f1..f4d48e78 100644 --- a/R/utils_fractions.R +++ b/R/utils_fractions.R @@ -295,11 +295,12 @@ tie_features = max_fractions[, .(n_ties = .N), by = "feature"][n_ties > 1, feature] if (length(tie_features) > 0) { + tied_fractions = max_fractions[feature %in% tie_features, .(feature, Fraction)] avg_abundance = input[ - feature %in% tie_features & !is.na(Intensity) & Intensity > 0, - .(mean_abundance = mean(Intensity)), - by = .(feature, Fraction) - ] + tied_fractions, on = .(feature, Fraction), nomatch = 0 + ][!is.na(Intensity) & Intensity > 0, + .(mean_abundance = mean(Intensity)), + by = .(feature, Fraction)] best_tied = avg_abundance[, .SD[which.max(mean_abundance)], by = "feature"] best_simple = max_fractions[ !feature %in% tie_features, diff --git a/inst/tinytest/test_fractions.R b/inst/tinytest/test_fractions.R index 292d1213..d07d8427 100644 --- a/inst/tinytest/test_fractions.R +++ b/inst/tinytest/test_fractions.R @@ -82,6 +82,19 @@ expect_identical( MSstatsConvert:::.removeOverlappingFeatures(data.table::copy(fractionated)), fractionated[Fraction == 2] ) +### Non-tied fraction with high mean intensity should not be selected over tied fractions +fractionated_third = data.table::data.table( + feature = rep("A", 9), + Fraction = c(rep(1, 3), rep(2, 3), rep(3, 3)), + Run = 1:9, + Intensity = c(1, 1, 1, # Fraction 1: 3 obs, mean = 1 (ties for max n_obs) + 2, 2, 2, # Fraction 2: 3 obs, mean = 2 (ties for max n_obs) + 10, NA, NA) # Fraction 3: 1 obs, mean = 10 (loses on n_obs but mean would win w/o fix) +) +expect_equal( + unique(MSstatsConvert:::.removeOverlappingFeatures(fractionated_third)$Fraction), + 2 +) fractionated_tmt = fractionated = data.table::data.table( feature = rep(c("A", "B"), each = 6), Fraction = rep(rep(c(1, 2), each = 3), times = 2),