-
Notifications
You must be signed in to change notification settings - Fork 31
[r][cpp] Add sparse = TRUE as an option for pseudobulk_matrix()
#268
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,5 @@ | ||
| # Copyright 2023 BPCells contributors | ||
| # | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or | ||
| # https://www.apache.org/licenses/LICENSE-2.0> or the MIT license | ||
| # <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your | ||
|
|
@@ -10,8 +10,8 @@ | |
| #' | ||
| #' Given a features x cells matrix, perform one-vs-all differential | ||
| #' tests to find markers. | ||
| #' | ||
| #' Tips for using the values from this function: | ||
| #' | ||
| #' Tips for using the values from this function: | ||
| #' - Use `dplyr::mutate()` to add columns for e.g. adjusted p-value and log fold change. | ||
| #' - Use `dplyr::filter()` to get only differential genes above some given threshold | ||
| #' - To get adjusted p-values, use R `p.adjust()`, recommended method is "BH" | ||
|
|
@@ -21,9 +21,9 @@ | |
| #' | ||
| #' @param mat IterableMatrix object of dimensions features x cells | ||
| #' @param groups Character/factor vector of cell groups/clusters. Length #cells | ||
| #' @param method Test method to use. Current options are: | ||
| #' @param method Test method to use. Current options are: | ||
| #' - `wilcoxon`: Wilconxon rank-sum test a.k.a Mann-Whitney U test | ||
| #' @return tibble with the following columns: | ||
| #' @return tibble with the following columns: | ||
| #' - **foreground**: Group ID used for the foreground | ||
| #' - **background**: Group ID used for the background (or NA if comparing to rest of cells) | ||
| #' - **feature**: ID of the feature | ||
|
|
@@ -67,7 +67,7 @@ marker_features <- function(mat, groups, method="wilcoxon") { | |
| foreground_means <- multiply_rows(group_sums, 1 / as.numeric(table(groups))) | ||
| background_means <- add_cols(-group_sums, colSums(group_sums)) %>% | ||
| multiply_rows(1 / (length(groups) - as.numeric(table(groups)))) | ||
|
|
||
| feature_names <- if (is.null(rownames(mat))) seq_len(nrow(mat)) else rownames(mat) | ||
|
|
||
| tibble::tibble( | ||
|
|
@@ -89,6 +89,7 @@ marker_features <- function(mat, groups, method="wilcoxon") { | |
| #' | ||
| #' Current options are: `nonzeros`, `sum`, `mean`, `variance`. | ||
| #' @param threads (integer) Number of threads to use. | ||
| #' @param sparse (logical) Whether or not to return sparse matrices. | ||
| #' @return | ||
| #' - If `method` is length `1`, returns a matrix of shape `(features x groups)`. | ||
| #' - If `method` is greater than length `1`, returns a list of matrices with each matrix representing a pseudobulk matrix with a different aggregation method. | ||
|
|
@@ -124,7 +125,7 @@ marker_features <- function(mat, groups, method="wilcoxon") { | |
| #' pseudobulk_res_multi$mean | ||
| #' @inheritParams marker_features | ||
| #' @export | ||
| pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 0L) { | ||
| pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 0L, sparse = FALSE) { | ||
| assert_is(mat, "IterableMatrix") | ||
| assert_is(cell_groups, c("factor", "character", "numeric")) | ||
| assert_true(length(cell_groups) == ncol(mat)) | ||
|
|
@@ -138,12 +139,21 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 0L) { | |
| } | ||
| assert_is_wholenumber(threads) | ||
|
|
||
| it <- mat %>% | ||
| convert_matrix_type("double") %>% | ||
| parallel_split(threads, threads*4) %>% | ||
| iterate_matrix() | ||
|
|
||
| res <- pseudobulk_matrix_cpp(it, cell_groups = as.integer(cell_groups) - 1, method = method, transpose = mat@transpose) | ||
| if (sparse) { | ||
| new.order <- order(cell_groups) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm a little hesitant on doing this approach because shuffling actually isn't an O(1) operation for each idx (total O(n)). Due to the way subsetting logic works in BPCells, this is a O(nlog(n)) operation on the matrix. Any clue how much having cell_groups pre-ordered changes the speed for the cpp implementation? |
||
| cell_groups <- cell_groups[new.order] | ||
| it <- mat[, new.order] %>% | ||
| convert_matrix_type("double") %>% | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If you make both sparse and dense cases use the same cpp adapter function, you can deduplicate 145-149 and 151-155, and take it out of the condtiional |
||
| parallel_split(threads, threads*4) %>% | ||
| iterate_matrix() | ||
| res <- pseudobulk_matrix_sparse_cpp(it, cell_groups = as.integer(cell_groups) - 1, method = method, transpose = mat@transpose) | ||
| } else { | ||
| it <- mat %>% | ||
| convert_matrix_type("double") %>% | ||
| parallel_split(threads, threads*4) %>% | ||
| iterate_matrix() | ||
| res <- pseudobulk_matrix_cpp(it, cell_groups = as.integer(cell_groups) - 1, method = method, transpose = mat@transpose) | ||
| } | ||
| # if res is a single matrix, return with colnames and rownames | ||
| if (length(method) == 1) { | ||
| colnames(res[[method]]) <- levels(cell_groups) | ||
|
|
@@ -161,4 +171,4 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 0L) { | |
| } | ||
| } | ||
| return(res) | ||
| } | ||
| } | ||
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,162 @@ | ||
|
|
||
| #include "PseudobulkSparse.h" | ||
| namespace BPCells { | ||
|
|
||
| template <typename T> | ||
| PseudobulkStatsSparse pseudobulk_matrix_sparse(std::unique_ptr<MatrixLoader<T>> &&mat, | ||
| const std::vector<uint32_t>& cell_groups, | ||
| PseudobulkStatsMethod method, | ||
| bool transpose, | ||
| std::atomic<bool> *user_interrupt) { | ||
| MatrixIterator<T> it(std::move(mat)); | ||
| if (transpose && (it.rows() != cell_groups.size())) { | ||
| throw std::invalid_argument("pseudobulk_matrix_sparse(): Cell groups must match the number of columns in the matrix"); | ||
| } else if (!transpose && (it.cols() != cell_groups.size())) { | ||
| throw std::invalid_argument("pseudobulk_matrix_sparse(): Cell groups must match the number of columns in the matrix"); | ||
| } | ||
| // Count each group | ||
| std::vector<double> group_count_vec; | ||
| uint32_t last_cell = 0; | ||
| for (auto& cell_group : cell_groups) { | ||
| if (cell_group < last_cell) { | ||
| throw std::invalid_argument("pseudobulk_matrix_sparse(): Cell groups must be ordered"); | ||
| } | ||
| last_cell = cell_group; | ||
| if (cell_group >= group_count_vec.size()) group_count_vec.resize(cell_group + 1); | ||
| group_count_vec[cell_group]++; | ||
| } | ||
| Eigen::Map<Eigen::Array<double, 1, Eigen::Dynamic>> group_count(group_count_vec.data(), group_count_vec.size()); | ||
| // If transposed, also keep matrix transposed with groups as rows for cache-friendliness | ||
| uint32_t num_rows = transpose ? it.cols() : it.rows(); | ||
| uint32_t num_cols = group_count_vec.size(); | ||
| uint32_t tmp_size; | ||
| if (transpose) { | ||
| tmp_size = num_cols; | ||
| } else { | ||
| tmp_size = num_rows; | ||
| } | ||
| struct PseudobulkStatsSparse res; | ||
| struct PseudobulkStatsTriplet trip; | ||
| struct PseudobulkStatsTemp tmp; | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. More comments on what these are used for |
||
| if ((method & PseudobulkStatsMethod::NonZeros) == PseudobulkStatsMethod::NonZeros) { | ||
| res.non_zeros = Eigen::SparseMatrix<double>(num_rows, num_cols); | ||
| trip.non_zeros.reserve(num_rows * num_cols); | ||
| tmp.non_zeros = std::vector<double>(tmp_size, 0.0); | ||
| } | ||
| if ((method & PseudobulkStatsMethod::Sum) == PseudobulkStatsMethod::Sum) { | ||
| res.sum = Eigen::SparseMatrix<double>(num_rows, num_cols); | ||
| trip.sum.reserve(num_rows * num_cols); | ||
| tmp.sum = std::vector<double>(tmp_size, 0.0); | ||
| } | ||
| if ((method & PseudobulkStatsMethod::Mean) == PseudobulkStatsMethod::Mean) { | ||
| res.mean = Eigen::SparseMatrix<double>(num_rows, num_cols); | ||
| trip.mean.reserve(num_rows * num_cols); | ||
| tmp.non_zeros = std::vector<double>(tmp_size, 0.0); | ||
| tmp.mean = std::vector<double>(tmp_size, 0.0); | ||
| } | ||
| if ((method & PseudobulkStatsMethod::Variance)== PseudobulkStatsMethod::Variance) { | ||
| res.var = Eigen::SparseMatrix<double>(num_rows, num_cols); | ||
| trip.var.reserve(num_rows * num_cols); | ||
| tmp.non_zeros = std::vector<double>(tmp_size, 0.0); | ||
| tmp.mean = std::vector<double>(tmp_size, 0.0); | ||
| tmp.var = std::vector<double>(tmp_size, 0.0); | ||
| } | ||
| uint32_t ncells = 0; | ||
| uint32_t g_idx = 0; | ||
| // Use a one pass strategy that can do non-zero, mean, and variance at the same time | ||
| while (it.nextCol()) { | ||
| const uint32_t col = it.currentCol(); | ||
| if (user_interrupt != NULL && *user_interrupt) return res; | ||
| while (it.load()) { | ||
| const uint32_t *row_data = it.rowData(); | ||
| const T *val_data = it.valData(); | ||
| const uint32_t count = it.capacity(); | ||
| // Do transpose logic to calculate row_idx, col_idx out of loop, to avoid repeating conditional checks | ||
| if (transpose) { | ||
| // Doing variance calculation also does mean and non-zero count, so specific order is required to make sure we don't double count | ||
| if ((method & PseudobulkStatsMethod::Variance) == PseudobulkStatsMethod::Variance) { | ||
| for (uint32_t i = 0; i < count; i++) update_mean_and_variance(val_data[i], tmp.non_zeros[cell_groups[row_data[i]]], tmp.mean[cell_groups[row_data[i]]], tmp.var[cell_groups[row_data[i]]]); | ||
| } else if ((method & PseudobulkStatsMethod::Mean) == PseudobulkStatsMethod::Mean) { | ||
| for (uint32_t i = 0; i < count; i++) update_mean(val_data[i], tmp.non_zeros[cell_groups[row_data[i]]], tmp.mean[cell_groups[row_data[i]]]); | ||
| } else if ((method & PseudobulkStatsMethod::NonZeros) == PseudobulkStatsMethod::NonZeros) { | ||
| for (uint32_t i = 0; i < count; i++) tmp.non_zeros[cell_groups[row_data[i]]] += 1; | ||
| } | ||
| if ((method & PseudobulkStatsMethod::Sum) == PseudobulkStatsMethod::Sum) { | ||
| for (uint32_t i = 0; i < count; i++) tmp.sum[cell_groups[row_data[i]]] += val_data[i]; | ||
| } | ||
| } else { | ||
| if ((method & PseudobulkStatsMethod::Variance) == PseudobulkStatsMethod::Variance) { | ||
| for (uint32_t i = 0; i < count; i++) update_mean_and_variance(val_data[i], tmp.non_zeros[row_data[i]], tmp.mean[row_data[i]], tmp.var[row_data[i]]); | ||
| } else if ((method & PseudobulkStatsMethod::Mean) == PseudobulkStatsMethod::Mean) { | ||
| for (uint32_t i = 0; i < count; i++) update_mean(val_data[i], tmp.non_zeros[row_data[i]], tmp.mean[row_data[i]]); | ||
| } else if ((method & PseudobulkStatsMethod::NonZeros) == PseudobulkStatsMethod::NonZeros) { | ||
| for (uint32_t i = 0; i < count; i++) tmp.non_zeros[row_data[i]] += 1; | ||
| } | ||
| if ((method & PseudobulkStatsMethod::Sum) == PseudobulkStatsMethod::Sum) { | ||
| for (uint32_t i = 0; i < count; i++) tmp.sum[row_data[i]] += val_data[i]; | ||
| } | ||
| } | ||
| } | ||
| if (transpose) { | ||
| for (uint32_t i = 0; i < tmp.var.size(); i++) { | ||
| if (tmp.mean[i] > 0) trip.var.emplace_back(col, i, (tmp.var[i] + (tmp.mean[i] * tmp.mean[i] * tmp.non_zeros[i] * ((group_count_vec[i] - tmp.non_zeros[i]) / group_count_vec[i]))) / (group_count_vec[i] - 1)); | ||
| } | ||
| for (uint32_t i = 0; i < tmp.mean.size(); i++) { | ||
| if (tmp.mean[i] > 0) trip.mean.emplace_back(col, i, tmp.mean[i] * tmp.non_zeros[i] / group_count_vec[i]); | ||
| } | ||
| for (uint32_t i = 0; i < tmp.non_zeros.size(); i++) { | ||
| if (tmp.non_zeros[i] > 0) trip.non_zeros.emplace_back(col, i, tmp.non_zeros[i]); | ||
| } | ||
| for (uint32_t i = 0; i < tmp.sum.size(); i++) { | ||
| if (tmp.sum[i] > 0) trip.sum.emplace_back(col, i, tmp.sum[i]); | ||
| } | ||
| std::fill(tmp.var.begin(), tmp.var.end(), 0.0); | ||
| std::fill(tmp.mean.begin(), tmp.mean.end(), 0.0); | ||
| std::fill(tmp.sum.begin(), tmp.sum.end(), 0.0); | ||
| std::fill(tmp.non_zeros.begin(), tmp.non_zeros.end(), 0.0); | ||
| continue; | ||
| } | ||
| // transpose == false | ||
| ncells += 1; | ||
| if (ncells < group_count_vec[g_idx]) continue; | ||
| for (uint32_t i = 0; i < tmp.var.size(); i++) { | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unclear why we would do this at a first glance (even though I wrote the original code for the corrections for sparsity). Means we need some comments! |
||
| if (tmp.mean[i] > 0) trip.var.emplace_back(i, g_idx, (tmp.var[i] + (tmp.mean[i] * tmp.mean[i] * tmp.non_zeros[i] * (group_count_vec[g_idx] - tmp.non_zeros[i]) / group_count_vec[g_idx])) / (group_count_vec[g_idx] - 1)); | ||
| } | ||
| for (uint32_t i = 0; i < tmp.mean.size(); i++) { | ||
| if (tmp.mean[i] > 0) trip.mean.emplace_back(i, g_idx, tmp.mean[i] * tmp.non_zeros[i] / group_count_vec[g_idx]); | ||
| } | ||
| for (uint32_t i = 0; i < tmp.non_zeros.size(); i++) { | ||
| if (tmp.non_zeros[i] > 0) trip.non_zeros.emplace_back(i, g_idx, tmp.non_zeros[i]); | ||
| } | ||
| for (uint32_t i = 0; i < tmp.sum.size(); i++) { | ||
| if (tmp.sum[i] > 0) trip.sum.emplace_back(i, g_idx, tmp.sum[i]); | ||
| } | ||
| std::fill(tmp.var.begin(), tmp.var.end(), 0.0); | ||
| std::fill(tmp.mean.begin(), tmp.mean.end(), 0.0); | ||
| std::fill(tmp.sum.begin(), tmp.sum.end(), 0.0); | ||
| std::fill(tmp.non_zeros.begin(), tmp.non_zeros.end(), 0.0); | ||
| g_idx += 1; | ||
| ncells = 0; | ||
| } | ||
| if ((method & PseudobulkStatsMethod::NonZeros) == PseudobulkStatsMethod::NonZeros) res.non_zeros.setFromTriplets(trip.non_zeros.begin(), trip.non_zeros.end()); | ||
| if ((method & PseudobulkStatsMethod::Sum) == PseudobulkStatsMethod::Sum) res.sum.setFromTriplets(trip.sum.begin(), trip.sum.end()); | ||
| if ((method & PseudobulkStatsMethod::Mean) == PseudobulkStatsMethod::Mean) res.mean.setFromTriplets(trip.mean.begin(), trip.mean.end()); | ||
| if ((method & PseudobulkStatsMethod::Variance) == PseudobulkStatsMethod::Variance) res.var.setFromTriplets(trip.var.begin(), trip.var.end()); | ||
| return res; | ||
| }; | ||
| template PseudobulkStatsSparse pseudobulk_matrix_sparse<uint32_t>(std::unique_ptr<MatrixLoader<uint32_t>>&& mat, | ||
| const std::vector<uint32_t>& cell_groups, | ||
| PseudobulkStatsMethod method, | ||
| bool transpose, | ||
| std::atomic<bool> *user_interrupt); | ||
| template PseudobulkStatsSparse pseudobulk_matrix_sparse<float>(std::unique_ptr<MatrixLoader<float>>&& mat, | ||
| const std::vector<uint32_t>& cell_groups, | ||
| PseudobulkStatsMethod method, | ||
| bool transpose, | ||
| std::atomic<bool> *user_interrupt); | ||
| template PseudobulkStatsSparse pseudobulk_matrix_sparse<double>(std::unique_ptr<MatrixLoader<double>>&& mat, | ||
| const std::vector<uint32_t>& cell_groups, | ||
| PseudobulkStatsMethod method, | ||
| bool transpose, | ||
| std::atomic<bool> *user_interrupt); | ||
| } // namespace BPCells | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,35 @@ | ||
| #pragma once | ||
| #include "Pseudobulk.h" | ||
| #include <unordered_map> | ||
|
|
||
| namespace BPCells { | ||
|
|
||
| struct PseudobulkStatsSparse { | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Docstrings would be helpful, why do we need all three of these new structs? I would even throw |
||
| Eigen::SparseMatrix<double> non_zeros; | ||
| Eigen::SparseMatrix<double> sum; | ||
| Eigen::SparseMatrix<double> mean; | ||
| Eigen::SparseMatrix<double> var; | ||
| }; | ||
|
|
||
| struct PseudobulkStatsTriplet { | ||
| std::vector<Eigen::Triplet<double>> non_zeros; | ||
| std::vector<Eigen::Triplet<double>> sum; | ||
| std::vector<Eigen::Triplet<double>> mean; | ||
| std::vector<Eigen::Triplet<double>> var; | ||
| }; | ||
|
|
||
| struct PseudobulkStatsTemp { | ||
| std::vector<double> non_zeros; | ||
| std::vector<double> sum; | ||
| std::vector<double> mean; | ||
| std::vector<double> var; | ||
| }; | ||
|
|
||
| template <typename T> | ||
| PseudobulkStatsSparse pseudobulk_matrix_sparse(std::unique_ptr<MatrixLoader<T>> &&mat, | ||
| const std::vector<uint32_t>& cell_groups, | ||
| PseudobulkStatsMethod method, | ||
| bool transpose, | ||
| std::atomic<bool> *user_interrupt); | ||
|
|
||
| } // namespace BPCells | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.