diff --git a/r/R/RcppExports.R b/r/R/RcppExports.R index 63c59570..c5fd9215 100644 --- a/r/R/RcppExports.R +++ b/r/R/RcppExports.R @@ -769,6 +769,10 @@ pseudobulk_matrix_cpp <- function(mat, cell_groups, method, transpose) { .Call(`_BPCells_pseudobulk_matrix_cpp`, mat, cell_groups, method, transpose) } +pseudobulk_matrix_sparse_cpp <- function(mat, cell_groups, method, transpose) { + .Call(`_BPCells_pseudobulk_matrix_sparse_cpp`, mat, cell_groups, method, transpose) +} + matrix_quantile_per_col_cpp <- function(mat, quantile, alpha, beta) { .Call(`_BPCells_matrix_quantile_per_col_cpp`, mat, quantile, alpha, beta) } diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 449e5636..4206ff6c 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -1,5 +1,5 @@ # Copyright 2023 BPCells contributors -# +# # Licensed under the Apache License, Version 2.0 or the MIT license # , at your @@ -10,8 +10,8 @@ #' #' Given a features x cells matrix, perform one-vs-all differential #' tests to find markers. -#' -#' Tips for using the values from this function: +#' +#' Tips for using the values from this function: #' - Use `dplyr::mutate()` to add columns for e.g. adjusted p-value and log fold change. #' - Use `dplyr::filter()` to get only differential genes above some given threshold #' - To get adjusted p-values, use R `p.adjust()`, recommended method is "BH" @@ -21,9 +21,9 @@ #' #' @param mat IterableMatrix object of dimensions features x cells #' @param groups Character/factor vector of cell groups/clusters. Length #cells -#' @param method Test method to use. Current options are: +#' @param method Test method to use. Current options are: #' - `wilcoxon`: Wilconxon rank-sum test a.k.a Mann-Whitney U test -#' @return tibble with the following columns: +#' @return tibble with the following columns: #' - **foreground**: Group ID used for the foreground #' - **background**: Group ID used for the background (or NA if comparing to rest of cells) #' - **feature**: ID of the feature @@ -67,7 +67,7 @@ marker_features <- function(mat, groups, method="wilcoxon") { foreground_means <- multiply_rows(group_sums, 1 / as.numeric(table(groups))) background_means <- add_cols(-group_sums, colSums(group_sums)) %>% multiply_rows(1 / (length(groups) - as.numeric(table(groups)))) - + feature_names <- if (is.null(rownames(mat))) seq_len(nrow(mat)) else rownames(mat) tibble::tibble( @@ -89,6 +89,7 @@ marker_features <- function(mat, groups, method="wilcoxon") { #' #' Current options are: `nonzeros`, `sum`, `mean`, `variance`. #' @param threads (integer) Number of threads to use. +#' @param sparse (logical) Whether or not to return sparse matrices. #' @return #' - If `method` is length `1`, returns a matrix of shape `(features x groups)`. #' - If `method` is greater than length `1`, returns a list of matrices with each matrix representing a pseudobulk matrix with a different aggregation method. @@ -124,7 +125,7 @@ marker_features <- function(mat, groups, method="wilcoxon") { #' pseudobulk_res_multi$mean #' @inheritParams marker_features #' @export -pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 0L) { +pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 0L, sparse = FALSE) { assert_is(mat, "IterableMatrix") assert_is(cell_groups, c("factor", "character", "numeric")) assert_true(length(cell_groups) == ncol(mat)) @@ -138,12 +139,21 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 0L) { } assert_is_wholenumber(threads) - it <- mat %>% - convert_matrix_type("double") %>% - parallel_split(threads, threads*4) %>% - iterate_matrix() - - res <- pseudobulk_matrix_cpp(it, cell_groups = as.integer(cell_groups) - 1, method = method, transpose = mat@transpose) + if (sparse) { + new.order <- order(cell_groups) + cell_groups <- cell_groups[new.order] + it <- mat[, new.order] %>% + convert_matrix_type("double") %>% + parallel_split(threads, threads*4) %>% + iterate_matrix() + res <- pseudobulk_matrix_sparse_cpp(it, cell_groups = as.integer(cell_groups) - 1, method = method, transpose = mat@transpose) + } else { + it <- mat %>% + convert_matrix_type("double") %>% + parallel_split(threads, threads*4) %>% + iterate_matrix() + res <- pseudobulk_matrix_cpp(it, cell_groups = as.integer(cell_groups) - 1, method = method, transpose = mat@transpose) + } # if res is a single matrix, return with colnames and rownames if (length(method) == 1) { colnames(res[[method]]) <- levels(cell_groups) @@ -161,4 +171,4 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 0L) { } } return(res) -} \ No newline at end of file +} diff --git a/r/man/pseudobulk_matrix.Rd b/r/man/pseudobulk_matrix.Rd index eff442c0..03ae3048 100644 --- a/r/man/pseudobulk_matrix.Rd +++ b/r/man/pseudobulk_matrix.Rd @@ -4,7 +4,13 @@ \alias{pseudobulk_matrix} \title{Aggregate counts matrices by cell group or feature.} \usage{ -pseudobulk_matrix(mat, cell_groups, method = "sum", threads = 0L) +pseudobulk_matrix( + mat, + cell_groups, + method = "sum", + threads = 0L, + sparse = FALSE +) } \arguments{ \item{mat}{IterableMatrix object of dimensions features x cells} @@ -16,6 +22,8 @@ pseudobulk_matrix(mat, cell_groups, method = "sum", threads = 0L) Current options are: \code{nonzeros}, \code{sum}, \code{mean}, \code{variance}.} \item{threads}{(integer) Number of threads to use.} + +\item{sparse}{(logical) Whether or not to return sparse matrices.} } \value{ \itemize{ diff --git a/r/src/Makevars.in b/r/src/Makevars.in index c816cc37..5bc09296 100644 --- a/r/src/Makevars.in +++ b/r/src/Makevars.in @@ -55,6 +55,7 @@ bpcells-cpp/matrixTransforms/Scale.o \ bpcells-cpp/matrixTransforms/Shift.o \ bpcells-cpp/matrixUtils/WilcoxonRankSum.o \ bpcells-cpp/matrixUtils/Pseudobulk.o \ +bpcells-cpp/matrixUtils/PseudobulkSparse.o \ bpcells-cpp/matrixUtils/Quantile.o \ bpcells-cpp/simd/bp128/current_target.o \ bpcells-cpp/simd/bp128/bp128-Nx128.o \ diff --git a/r/src/RcppExports.cpp b/r/src/RcppExports.cpp index 46170c90..8d0b31f3 100644 --- a/r/src/RcppExports.cpp +++ b/r/src/RcppExports.cpp @@ -2536,6 +2536,20 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// pseudobulk_matrix_sparse_cpp +List pseudobulk_matrix_sparse_cpp(SEXP mat, std::vector cell_groups, std::vector method, bool transpose); +RcppExport SEXP _BPCells_pseudobulk_matrix_sparse_cpp(SEXP matSEXP, SEXP cell_groupsSEXP, SEXP methodSEXP, SEXP transposeSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< SEXP >::type mat(matSEXP); + Rcpp::traits::input_parameter< std::vector >::type cell_groups(cell_groupsSEXP); + Rcpp::traits::input_parameter< std::vector >::type method(methodSEXP); + Rcpp::traits::input_parameter< bool >::type transpose(transposeSEXP); + rcpp_result_gen = Rcpp::wrap(pseudobulk_matrix_sparse_cpp(mat, cell_groups, method, transpose)); + return rcpp_result_gen; +END_RCPP +} // matrix_quantile_per_col_cpp Eigen::ArrayXXd matrix_quantile_per_col_cpp(SEXP mat, std::vector quantile, double alpha, double beta); RcppExport SEXP _BPCells_matrix_quantile_per_col_cpp(SEXP matSEXP, SEXP quantileSEXP, SEXP alphaSEXP, SEXP betaSEXP) { @@ -2756,6 +2770,7 @@ static const R_CallMethodDef CallEntries[] = { {"_BPCells_matrix_max_per_row_cpp", (DL_FUNC) &_BPCells_matrix_max_per_row_cpp, 1}, {"_BPCells_matrix_max_per_col_cpp", (DL_FUNC) &_BPCells_matrix_max_per_col_cpp, 1}, {"_BPCells_pseudobulk_matrix_cpp", (DL_FUNC) &_BPCells_pseudobulk_matrix_cpp, 4}, + {"_BPCells_pseudobulk_matrix_sparse_cpp", (DL_FUNC) &_BPCells_pseudobulk_matrix_sparse_cpp, 4}, {"_BPCells_matrix_quantile_per_col_cpp", (DL_FUNC) &_BPCells_matrix_quantile_per_col_cpp, 4}, {"_BPCells_matrix_identical_uint32_t_cpp", (DL_FUNC) &_BPCells_matrix_identical_uint32_t_cpp, 2}, {NULL, NULL, 0} diff --git a/r/src/bpcells-cpp/matrixUtils/PseudobulkSparse.cpp b/r/src/bpcells-cpp/matrixUtils/PseudobulkSparse.cpp new file mode 100644 index 00000000..e91b0239 --- /dev/null +++ b/r/src/bpcells-cpp/matrixUtils/PseudobulkSparse.cpp @@ -0,0 +1,162 @@ + +#include "PseudobulkSparse.h" +namespace BPCells { + +template +PseudobulkStatsSparse pseudobulk_matrix_sparse(std::unique_ptr> &&mat, + const std::vector& cell_groups, + PseudobulkStatsMethod method, + bool transpose, + std::atomic *user_interrupt) { + MatrixIterator it(std::move(mat)); + if (transpose && (it.rows() != cell_groups.size())) { + throw std::invalid_argument("pseudobulk_matrix_sparse(): Cell groups must match the number of columns in the matrix"); + } else if (!transpose && (it.cols() != cell_groups.size())) { + throw std::invalid_argument("pseudobulk_matrix_sparse(): Cell groups must match the number of columns in the matrix"); + } + // Count each group + std::vector group_count_vec; + uint32_t last_cell = 0; + for (auto& cell_group : cell_groups) { + if (cell_group < last_cell) { + throw std::invalid_argument("pseudobulk_matrix_sparse(): Cell groups must be ordered"); + } + last_cell = cell_group; + if (cell_group >= group_count_vec.size()) group_count_vec.resize(cell_group + 1); + group_count_vec[cell_group]++; + } + Eigen::Map> group_count(group_count_vec.data(), group_count_vec.size()); + // If transposed, also keep matrix transposed with groups as rows for cache-friendliness + uint32_t num_rows = transpose ? it.cols() : it.rows(); + uint32_t num_cols = group_count_vec.size(); + uint32_t tmp_size; + if (transpose) { + tmp_size = num_cols; + } else { + tmp_size = num_rows; + } + struct PseudobulkStatsSparse res; + struct PseudobulkStatsTriplet trip; + struct PseudobulkStatsTemp tmp; + if ((method & PseudobulkStatsMethod::NonZeros) == PseudobulkStatsMethod::NonZeros) { + res.non_zeros = Eigen::SparseMatrix(num_rows, num_cols); + trip.non_zeros.reserve(num_rows * num_cols); + tmp.non_zeros = std::vector(tmp_size, 0.0); + } + if ((method & PseudobulkStatsMethod::Sum) == PseudobulkStatsMethod::Sum) { + res.sum = Eigen::SparseMatrix(num_rows, num_cols); + trip.sum.reserve(num_rows * num_cols); + tmp.sum = std::vector(tmp_size, 0.0); + } + if ((method & PseudobulkStatsMethod::Mean) == PseudobulkStatsMethod::Mean) { + res.mean = Eigen::SparseMatrix(num_rows, num_cols); + trip.mean.reserve(num_rows * num_cols); + tmp.non_zeros = std::vector(tmp_size, 0.0); + tmp.mean = std::vector(tmp_size, 0.0); + } + if ((method & PseudobulkStatsMethod::Variance)== PseudobulkStatsMethod::Variance) { + res.var = Eigen::SparseMatrix(num_rows, num_cols); + trip.var.reserve(num_rows * num_cols); + tmp.non_zeros = std::vector(tmp_size, 0.0); + tmp.mean = std::vector(tmp_size, 0.0); + tmp.var = std::vector(tmp_size, 0.0); + } + uint32_t ncells = 0; + uint32_t g_idx = 0; + // Use a one pass strategy that can do non-zero, mean, and variance at the same time + while (it.nextCol()) { + const uint32_t col = it.currentCol(); + if (user_interrupt != NULL && *user_interrupt) return res; + while (it.load()) { + const uint32_t *row_data = it.rowData(); + const T *val_data = it.valData(); + const uint32_t count = it.capacity(); + // Do transpose logic to calculate row_idx, col_idx out of loop, to avoid repeating conditional checks + if (transpose) { + // Doing variance calculation also does mean and non-zero count, so specific order is required to make sure we don't double count + if ((method & PseudobulkStatsMethod::Variance) == PseudobulkStatsMethod::Variance) { + for (uint32_t i = 0; i < count; i++) update_mean_and_variance(val_data[i], tmp.non_zeros[cell_groups[row_data[i]]], tmp.mean[cell_groups[row_data[i]]], tmp.var[cell_groups[row_data[i]]]); + } else if ((method & PseudobulkStatsMethod::Mean) == PseudobulkStatsMethod::Mean) { + for (uint32_t i = 0; i < count; i++) update_mean(val_data[i], tmp.non_zeros[cell_groups[row_data[i]]], tmp.mean[cell_groups[row_data[i]]]); + } else if ((method & PseudobulkStatsMethod::NonZeros) == PseudobulkStatsMethod::NonZeros) { + for (uint32_t i = 0; i < count; i++) tmp.non_zeros[cell_groups[row_data[i]]] += 1; + } + if ((method & PseudobulkStatsMethod::Sum) == PseudobulkStatsMethod::Sum) { + for (uint32_t i = 0; i < count; i++) tmp.sum[cell_groups[row_data[i]]] += val_data[i]; + } + } else { + if ((method & PseudobulkStatsMethod::Variance) == PseudobulkStatsMethod::Variance) { + for (uint32_t i = 0; i < count; i++) update_mean_and_variance(val_data[i], tmp.non_zeros[row_data[i]], tmp.mean[row_data[i]], tmp.var[row_data[i]]); + } else if ((method & PseudobulkStatsMethod::Mean) == PseudobulkStatsMethod::Mean) { + for (uint32_t i = 0; i < count; i++) update_mean(val_data[i], tmp.non_zeros[row_data[i]], tmp.mean[row_data[i]]); + } else if ((method & PseudobulkStatsMethod::NonZeros) == PseudobulkStatsMethod::NonZeros) { + for (uint32_t i = 0; i < count; i++) tmp.non_zeros[row_data[i]] += 1; + } + if ((method & PseudobulkStatsMethod::Sum) == PseudobulkStatsMethod::Sum) { + for (uint32_t i = 0; i < count; i++) tmp.sum[row_data[i]] += val_data[i]; + } + } + } + if (transpose) { + for (uint32_t i = 0; i < tmp.var.size(); i++) { + if (tmp.mean[i] > 0) trip.var.emplace_back(col, i, (tmp.var[i] + (tmp.mean[i] * tmp.mean[i] * tmp.non_zeros[i] * ((group_count_vec[i] - tmp.non_zeros[i]) / group_count_vec[i]))) / (group_count_vec[i] - 1)); + } + for (uint32_t i = 0; i < tmp.mean.size(); i++) { + if (tmp.mean[i] > 0) trip.mean.emplace_back(col, i, tmp.mean[i] * tmp.non_zeros[i] / group_count_vec[i]); + } + for (uint32_t i = 0; i < tmp.non_zeros.size(); i++) { + if (tmp.non_zeros[i] > 0) trip.non_zeros.emplace_back(col, i, tmp.non_zeros[i]); + } + for (uint32_t i = 0; i < tmp.sum.size(); i++) { + if (tmp.sum[i] > 0) trip.sum.emplace_back(col, i, tmp.sum[i]); + } + std::fill(tmp.var.begin(), tmp.var.end(), 0.0); + std::fill(tmp.mean.begin(), tmp.mean.end(), 0.0); + std::fill(tmp.sum.begin(), tmp.sum.end(), 0.0); + std::fill(tmp.non_zeros.begin(), tmp.non_zeros.end(), 0.0); + continue; + } + // transpose == false + ncells += 1; + if (ncells < group_count_vec[g_idx]) continue; + for (uint32_t i = 0; i < tmp.var.size(); i++) { + if (tmp.mean[i] > 0) trip.var.emplace_back(i, g_idx, (tmp.var[i] + (tmp.mean[i] * tmp.mean[i] * tmp.non_zeros[i] * (group_count_vec[g_idx] - tmp.non_zeros[i]) / group_count_vec[g_idx])) / (group_count_vec[g_idx] - 1)); + } + for (uint32_t i = 0; i < tmp.mean.size(); i++) { + if (tmp.mean[i] > 0) trip.mean.emplace_back(i, g_idx, tmp.mean[i] * tmp.non_zeros[i] / group_count_vec[g_idx]); + } + for (uint32_t i = 0; i < tmp.non_zeros.size(); i++) { + if (tmp.non_zeros[i] > 0) trip.non_zeros.emplace_back(i, g_idx, tmp.non_zeros[i]); + } + for (uint32_t i = 0; i < tmp.sum.size(); i++) { + if (tmp.sum[i] > 0) trip.sum.emplace_back(i, g_idx, tmp.sum[i]); + } + std::fill(tmp.var.begin(), tmp.var.end(), 0.0); + std::fill(tmp.mean.begin(), tmp.mean.end(), 0.0); + std::fill(tmp.sum.begin(), tmp.sum.end(), 0.0); + std::fill(tmp.non_zeros.begin(), tmp.non_zeros.end(), 0.0); + g_idx += 1; + ncells = 0; + } + if ((method & PseudobulkStatsMethod::NonZeros) == PseudobulkStatsMethod::NonZeros) res.non_zeros.setFromTriplets(trip.non_zeros.begin(), trip.non_zeros.end()); + if ((method & PseudobulkStatsMethod::Sum) == PseudobulkStatsMethod::Sum) res.sum.setFromTriplets(trip.sum.begin(), trip.sum.end()); + if ((method & PseudobulkStatsMethod::Mean) == PseudobulkStatsMethod::Mean) res.mean.setFromTriplets(trip.mean.begin(), trip.mean.end()); + if ((method & PseudobulkStatsMethod::Variance) == PseudobulkStatsMethod::Variance) res.var.setFromTriplets(trip.var.begin(), trip.var.end()); + return res; +}; +template PseudobulkStatsSparse pseudobulk_matrix_sparse(std::unique_ptr>&& mat, + const std::vector& cell_groups, + PseudobulkStatsMethod method, + bool transpose, + std::atomic *user_interrupt); +template PseudobulkStatsSparse pseudobulk_matrix_sparse(std::unique_ptr>&& mat, + const std::vector& cell_groups, + PseudobulkStatsMethod method, + bool transpose, + std::atomic *user_interrupt); +template PseudobulkStatsSparse pseudobulk_matrix_sparse(std::unique_ptr>&& mat, + const std::vector& cell_groups, + PseudobulkStatsMethod method, + bool transpose, + std::atomic *user_interrupt); +} // namespace BPCells diff --git a/r/src/bpcells-cpp/matrixUtils/PseudobulkSparse.h b/r/src/bpcells-cpp/matrixUtils/PseudobulkSparse.h new file mode 100644 index 00000000..17882637 --- /dev/null +++ b/r/src/bpcells-cpp/matrixUtils/PseudobulkSparse.h @@ -0,0 +1,35 @@ +#pragma once +#include "Pseudobulk.h" +#include + +namespace BPCells { + +struct PseudobulkStatsSparse { + Eigen::SparseMatrix non_zeros; + Eigen::SparseMatrix sum; + Eigen::SparseMatrix mean; + Eigen::SparseMatrix var; +}; + +struct PseudobulkStatsTriplet { + std::vector> non_zeros; + std::vector> sum; + std::vector> mean; + std::vector> var; +}; + +struct PseudobulkStatsTemp { + std::vector non_zeros; + std::vector sum; + std::vector mean; + std::vector var; +}; + +template +PseudobulkStatsSparse pseudobulk_matrix_sparse(std::unique_ptr> &&mat, + const std::vector& cell_groups, + PseudobulkStatsMethod method, + bool transpose, + std::atomic *user_interrupt); + +} // namespace BPCells diff --git a/r/src/matrix_utils.cpp b/r/src/matrix_utils.cpp index 017c98c0..db21e1af 100644 --- a/r/src/matrix_utils.cpp +++ b/r/src/matrix_utils.cpp @@ -1,5 +1,5 @@ // Copyright 2021 BPCells contributors -// +// // Licensed under the Apache License, Version 2.0 or the MIT license // , at your @@ -24,6 +24,7 @@ #include "bpcells-cpp/matrixIterators/TSparseMatrixWriter.h" #include "bpcells-cpp/matrixUtils/WilcoxonRankSum.h" #include "bpcells-cpp/matrixUtils/Pseudobulk.h" +#include "bpcells-cpp/matrixUtils/PseudobulkSparse.h" #include "bpcells-cpp/matrixUtils/Quantile.h" #include "bpcells-cpp/matrixIterators/MatrixAccumulators.h" #include "R_array_io.h" @@ -693,7 +694,7 @@ List pseudobulk_matrix_cpp(SEXP mat, methodFlags = methodFlags | PseudobulkStatsMethod::NonZeros | PseudobulkStatsMethod::Mean | PseudobulkStatsMethod::Variance; } } - + PseudobulkStats res = run_with_R_interrupt_check( &pseudobulk_matrix, take_unique_xptr>(mat), @@ -709,6 +710,40 @@ List pseudobulk_matrix_cpp(SEXP mat, ); } + +// [[Rcpp::export]] +List pseudobulk_matrix_sparse_cpp(SEXP mat, + std::vector cell_groups, + std::vector method, + bool transpose) { + PseudobulkStatsMethod methodFlags = static_cast(0); + for (std::string &m : method) { + if (m == "nonzeros") { + methodFlags = methodFlags | PseudobulkStatsMethod::NonZeros; + } else if (m == "sum") { + methodFlags = methodFlags | PseudobulkStatsMethod::Sum; + } else if (m == "mean") { + methodFlags = methodFlags | PseudobulkStatsMethod::NonZeros | PseudobulkStatsMethod::Mean; + } else if (m == "variance") { + methodFlags = methodFlags | PseudobulkStatsMethod::NonZeros | PseudobulkStatsMethod::Mean | PseudobulkStatsMethod::Variance; + } + } + + PseudobulkStatsSparse res = run_with_R_interrupt_check( + &pseudobulk_matrix_sparse, + take_unique_xptr>(mat), + std::cref(cell_groups), + (PseudobulkStatsMethod)methodFlags, + transpose + ); + return List::create( + Named("nonzeros") = res.non_zeros, + Named("sum") = res.sum, + Named("mean") = res.mean, + Named("variance") = res.var + ); +} + // [[Rcpp::export]] Eigen::ArrayXXd matrix_quantile_per_col_cpp(SEXP mat, std::vector quantile, double alpha, double beta) { return run_with_R_interrupt_check( diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index 85ad0be3..78b885bd 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -1,5 +1,5 @@ # Copyright 2023 BPCells contributors -# +# # Licensed under the Apache License, Version 2.0 or the MIT license # , at your @@ -28,7 +28,7 @@ test_that("Wilcoxon rank sum works (small)", { colnames(X) <- c("centered", "uncentered") Y <- factor(c(rep.int("X", length(x)), rep.int("Y", length(y)))) - m <- as(X, "dgCMatrix") %>% + m <- as(X, "dgCMatrix") %>% as("IterableMatrix") %>% t() @@ -36,7 +36,7 @@ test_that("Wilcoxon rank sum works (small)", { dplyr::filter(foreground == "X", feature == "centered") res_bp_2 <- marker_features(m, Y) %>% dplyr::filter(foreground == "Y", feature == "uncentered") - + expect_equal(res_bp_1$p_val_raw, res_r$p.value) expect_equal(res_bp_1$p_val_raw, res_bp_2$p_val_raw) @@ -45,7 +45,7 @@ test_that("Wilcoxon rank sum works (small)", { expect_equal(res_bp_2$foreground_mean, mean(y)) expect_equal(res_bp_2$background_mean, mean(x)) - + expect_equal(res_bp_1$feature, "centered") }) @@ -57,14 +57,14 @@ test_that("Wilcoxon rank sum matches immunogenomics::presto", { clusters <- pbmc3k.SeuratData::pbmc3k.final@active.ident res_presto <- presto::wilcoxauc(mat, clusters) %>% tibble::as_tibble() - - mat_bpcells <- mat %>% - t() %>% - as("IterableMatrix") %>% + + mat_bpcells <- mat %>% + t() %>% + as("IterableMatrix") %>% t() res_bpcells <- marker_features(mat_bpcells, clusters) %>% dplyr::arrange(match(foreground, levels(clusters)), match(feature, rownames(mat))) - + expect_equal(res_bpcells$feature, res_presto$feature) expect_equal(res_bpcells$foreground, res_presto$group) expect_equal(res_bpcells$p_val_raw, res_presto$pval) @@ -116,11 +116,25 @@ test_that("Pseudobulk aggregation works", { # check for clipping if matrix isn't transposed expect_equal(m_mean, m_bpcells_mean) expect_equal(m_non_zeros, m_bpcells_non_zeros) + # check for `sparse = TRUE` works + m_bpcells_sum <- pseudobulk_matrix(m_bpcells, cell_group, method = "sum", sparse = TRUE) + m_bpcells_mean <- pseudobulk_matrix(m_bpcells, cell_group, method = "mean", sparse = TRUE) + m_bpcells_non_zeros <- pseudobulk_matrix(m_bpcells, cell_group, method = "nonzeros", sparse = TRUE) + expect_s4_class(m_bpcells_sum, "dgCMatrix") + expect_s4_class(m_bpcells_mean, "dgCMatrix") + expect_s4_class(m_bpcells_non_zeros, "dgCMatrix") + expect_equal(m_sum, as.matrix(m_bpcells_sum)) + expect_equal(m_mean, as.matrix(m_bpcells_mean)) + expect_equal(m_non_zeros, as.matrix(m_bpcells_non_zeros)) # make sure that we dont check for variances if we have number of groups == number of cells if (length(unique(cell_group)) < ncol(m)) { m_var <- create_pseudobulk_r(m, cell_group, "var") m_bpcells_var <- pseudobulk_matrix(m_bpcells, cell_group, method = "variance") expect_equal(m_var, m_bpcells_var, info = paste("Transposed:", m_bpcells@transpose)) + # check for `sparse = TRUE` works + m_bpcells_var <- pseudobulk_matrix(m_bpcells, cell_group, method = "variance", sparse = TRUE) + expect_s4_class(m_bpcells_var, "dgCMatrix") + expect_equal(m_var, as.matrix(m_bpcells_var), info = paste("Transposed:", m_bpcells@transpose)) } } } @@ -159,6 +173,27 @@ test_that("Pseudobulk aggregation works with multiple return types", { expect_equal(m_non_zeros, m_bpcells_res$nonzeros) } } + # check for `sparse = TRUE` works + for (second_method_idx in seq(first_method_idx+1, length(methods))) { + methods_of_interest <- c(methods[first_method_idx], methods[second_method_idx]) + m_bpcells_res <- pseudobulk_matrix(m_bpcells, cell_group, method = methods_of_interest, sparse = TRUE) + if ("variance" %in% methods_of_interest) { + expect_s4_class(m_bpcells_res$var, "dgCMatrix") + expect_equal(m_var, as.matrix(m_bpcells_res$var)) + } + if ("mean" %in% methods_of_interest) { + expect_s4_class(m_bpcells_res$mean, "dgCMatrix") + expect_equal(m_mean, as.matrix(m_bpcells_res$mean)) + } + if ("sum" %in% methods_of_interest) { + expect_s4_class(m_bpcells_res$sum, "dgCMatrix") + expect_equal(m_sum, as.matrix(m_bpcells_res$sum)) + } + if ("nonzeros" %in% methods_of_interest) { + expect_s4_class(m_bpcells_res$nonzeros, "dgCMatrix") + expect_equal(m_non_zeros, as.matrix(m_bpcells_res$nonzeros)) + } + } } } -}) \ No newline at end of file +})