diff --git a/R/utils_fractions.R b/R/utils_fractions.R index 7bddb871..f4d48e78 100644 --- a/R/utils_fractions.R +++ b/R/utils_fractions.R @@ -264,41 +264,53 @@ #' @return data.table #' @keywords internal .removeOverlappingFeatures = function(input) { - fraction_keep = Fraction = NULL + Fraction = NULL if (data.table::uniqueN(input$Fraction) > 1) { - input[, fraction_keep := .getCorrectFraction(.SD), - by = "feature", - .SDcols = c("feature", "Fraction", "Run", "Intensity")] - input = input[Fraction == fraction_keep] - input = input[, !(colnames(input) == "fraction_keep"), with = FALSE] + measurement_count = input[ + !is.na(Intensity) & Intensity > 0, + .(n_obs = uniqueN(Run)), + by = .(feature, Fraction) + ] + measurement_count[, is_max := n_obs == max(n_obs), by = "feature"] + max_fractions = measurement_count[(is_max)] + + fraction_map = .resolveFractionTies(input, max_fractions) + input = input[fraction_map, on = .(feature, Fraction), nomatch = 0] } input } -#' Get a name of fraction with the largest number of measurements or the largest -#' average intensity +#' Resolve ties when multiple fractions share the maximum number of measurements +#' for a given feature. In the case of a tie, the fraction with the highest +#' mean intensity is selected. #' @param input output of `MSstatsPreprocess` -#' @return character - label of the fraction that has most measurements or -#' highest mean intensity for a given feature +#' @param max_fractions data.table of fractions that share the maximum number +#' of unique runs per feature, as produced by `.removeOverlappingFeatures` +#' @return data.table with columns `feature` and `Fraction`, containing one +#' selected fraction per feature #' @keywords internal -.getCorrectFraction = function(input) { - Intensity = Run = Fraction = NULL +.resolveFractionTies = function(input, max_fractions) { + tie_features = max_fractions[, .(n_ties = .N), by = "feature"][n_ties > 1, feature] - measurement_count = input[!is.na(Intensity) & Intensity > 0, - list(n_obs = data.table::uniqueN(Run)), - by = "Fraction"] - which_max_measurements = which(measurement_count$n_obs == max(measurement_count$n_obs)) - if (length(which_max_measurements) == 1L) { - return(unique(measurement_count$Fraction[which_max_measurements])) + if (length(tie_features) > 0) { + tied_fractions = max_fractions[feature %in% tie_features, .(feature, Fraction)] + avg_abundance = input[ + tied_fractions, on = .(feature, Fraction), nomatch = 0 + ][!is.na(Intensity) & Intensity > 0, + .(mean_abundance = mean(Intensity)), + by = .(feature, Fraction)] + best_tied = avg_abundance[, .SD[which.max(mean_abundance)], by = "feature"] + best_simple = max_fractions[ + !feature %in% tie_features, + .(Fraction = Fraction[1]), + by = "feature" + ] + rbind(best_simple[, .(feature, Fraction)], + best_tied[, .(feature, Fraction)]) } else { - input = input[Fraction %in% measurement_count$Fraction[which_max_measurements]] - average_abundance = input[!is.na(Intensity) & Intensity > 0, - list(mean_abundance = mean(Intensity)), - by = "Fraction"] - which_max_abundance = which.max(average_abundance$mean_abundance) - unique(average_abundance$Fraction[which_max_abundance]) + max_fractions[, .(Fraction = Fraction[1]), by = "feature"] } } diff --git a/inst/tinytest/test_fractions.R b/inst/tinytest/test_fractions.R index c2a89bb5..d07d8427 100644 --- a/inst/tinytest/test_fractions.R +++ b/inst/tinytest/test_fractions.R @@ -67,17 +67,34 @@ fractionated = data.table::data.table( Run = 1:12, Intensity = c(NA, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2) ) -picked_A = MSstatsConvert:::.getCorrectFraction(fractionated[feature == "A"]) -picked_B = MSstatsConvert:::.getCorrectFraction(fractionated[feature == "B"]) ### More observations win -expect_equal(picked_A, 2) -### Higher average intensity wins -expect_equal(picked_B, 2) +expect_equal( + unique(MSstatsConvert:::.removeOverlappingFeatures(fractionated[feature == "A"])$Fraction), + 2 +) +### Higher average intensity wins on tie +expect_equal( + unique(MSstatsConvert:::.removeOverlappingFeatures(fractionated[feature == "B"])$Fraction), + 2 +) ### For full data expect_identical( MSstatsConvert:::.removeOverlappingFeatures(data.table::copy(fractionated)), fractionated[Fraction == 2] ) +### Non-tied fraction with high mean intensity should not be selected over tied fractions +fractionated_third = data.table::data.table( + feature = rep("A", 9), + Fraction = c(rep(1, 3), rep(2, 3), rep(3, 3)), + Run = 1:9, + Intensity = c(1, 1, 1, # Fraction 1: 3 obs, mean = 1 (ties for max n_obs) + 2, 2, 2, # Fraction 2: 3 obs, mean = 2 (ties for max n_obs) + 10, NA, NA) # Fraction 3: 1 obs, mean = 10 (loses on n_obs but mean would win w/o fix) +) +expect_equal( + unique(MSstatsConvert:::.removeOverlappingFeatures(fractionated_third)$Fraction), + 2 +) fractionated_tmt = fractionated = data.table::data.table( feature = rep(c("A", "B"), each = 6), Fraction = rep(rep(c(1, 2), each = 3), times = 2),