diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 17faadb9..1f5a3842 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -48,7 +48,9 @@ Suggests: IRanges, GenomicRanges, matrixStats, - igraph + igraph, + RcppHNSW, + RcppAnnoy Depends: R (>= 3.5.0) Config/Needs/website: pkgdown, devtools, uwot, irlba, RcppHNSW, igraph, BiocManager, bioc::BSgenome.Hsapiens.UCSC.hg38, github::GreenleafLab/motifmatchr, github::GreenleafLab/chromVARmotifs diff --git a/r/NAMESPACE b/r/NAMESPACE index 87862f4b..96a72269 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -2,11 +2,18 @@ S3method(base::as.data.frame,IterableFragments) S3method(base::as.matrix,IterableMatrix) +S3method(print,DimReduction) +S3method(project,IterativeLSI) +S3method(project,LSI) +S3method(project,default) S3method(svds,IterableMatrix) S3method(svds,default) export("all_matrix_inputs<-") export("cellNames<-") export("chrNames<-") +export(DimReduction) +export(IterativeLSI) +export(LSI) export(add_cols) export(add_rows) export(all_matrix_inputs) @@ -20,6 +27,7 @@ export(canonical_gene_symbol) export(cellNames) export(checksum) export(chrNames) +export(cluster_cells_graph) export(cluster_graph_leiden) export(cluster_graph_louvain) export(cluster_graph_seurat) @@ -65,6 +73,8 @@ export(min_by_row) export(min_scalar) export(multiply_cols) export(multiply_rows) +export(normalize_log) +export(normalize_tfidf) export(nucleosome_counts) export(open_fragments_10x) export(open_fragments_dir) @@ -83,6 +93,7 @@ export(plot_tf_footprint) export(plot_tss_profile) export(plot_tss_scatter) export(prefix_cell_names) +export(project) export(pseudobulk_matrix) export(qc_scATAC) export(range_distance_to_nearest) @@ -108,6 +119,10 @@ export(rowVars.default) export(sctransform_pearson) export(select_cells) export(select_chromosomes) +export(select_features_binned_dispersion) +export(select_features_dispersion) +export(select_features_mean) +export(select_features_variance) export(select_regions) export(set_trackplot_height) 
export(set_trackplot_label) diff --git a/r/NEWS.md b/r/NEWS.md index e75ce4a0..c695e83b 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -8,8 +8,17 @@ Contributions welcome :) # BPCells 0.3.1 (in-progress main branch) +## Breaking changes +- Change first parameter name of `cluster_graph_leiden()`, `cluster_graph_louvain()` and `cluster_graph_seurat()` from `snn` to `mat` to more accurately reflect the input type. (pull request #189) + ## Features - Add `write_matrix_anndata_hdf5_dense()` which allows writing matrices in AnnData's dense format, most commonly used for `obsm` or `varm` matrices. (Thanks to @ycli1995 for pull request #166) +- Add normalization helper functions `normalize_log()` and `normalize_tfidf()` (pull request #168) +- Add functions `normalize_tfidf()` and `normalize_log()`, which allow for easy normalization of iterable matrices using TF-IDF or log1p (pull request #189) +- Add feature selection functions `select_features_variance()`, and `select_features_{dispersion,mean,binned_dispersion}()`, with parameterization for normalization steps, and number of variable features (pull request #189) +- Add `LSI()` and `IterativeLSI()` dimensionality reduction functions to perform latent semantic indexing on a matrix (pull request #189). +- Add capability to create partial function objects when excluding the first argument of a function. This is implemented in normalizations, feature selections, dimensionality reductions, and clustering functions. See `select_features_variance()` for usage. (pull request #189) +- Create a wrapper function `cluster_cells_graph()` that wraps the steps of knn object creation, graph adjacency creation, and clustering all within a single function (pull request #189) ## Improvements - Speed up taking large subsets of large concatenated matrices, e.g. selecting 9M cells from a 10M cell matrix composed of ~100 concatenated pieces. 
(pull request #179) diff --git a/r/R/clustering.R b/r/R/clustering.R index b53a4edf..f032800f 100644 --- a/r/R/clustering.R +++ b/r/R/clustering.R @@ -6,6 +6,84 @@ # option. This file may not be copied, modified, or distributed # except according to those terms. + +#' Check if an input is a kNN object +#' +#' knn object functions `knn_hnsw()` and `knn_annoy()` return a list of two matrices, `idx` and `dist`. +#' These are used as inputs to create graph adjacency matrices for clustering. +#' Assume any list with at least both `idx` and `dist` items is a kNN object. +#' @return TRUE if the mat is a knn object, FALSE otherwise +#' @keywords internal +is_knn_object <- function(mat) { + return(is(mat, "list") && all(c("idx", "dist") %in% names(mat))) +} + +#' Check if an input is a graph adjacency matrix. +#' +#' Clustering functions like `cluster_graph_leiden()` and `cluster_graph_louvain()` require a graph adjacency matrix as input. +#' We assume that any square `dgCMatrix` is a graph adjacency matrix. +#' @return TRUE if the mat is a graph adjacency matrix, FALSE otherwise +#' @keywords internal +is_adjacency_matrix <- function(mat) { + return(is(mat, "dgCMatrix") && nrow(mat) == ncol(mat)) +} + +#' Cluster embeddings using a KNN-Graph based algorithm +#' +#' Take in a cell embedding matrix, and sequentially convert it into a kNN object, then to +#' a graph adjacency matrix. Following, assign a label to every cell using a clustering algorithm. +#' +#' @param mat (matrix) Cell embeddings matrix of shape `(cells x n_embeddings)` +#' @param knn_method (function) Function to convert cell embeddings into a knn object. +#' Must be a (optionally partialized) version of `knn_hnsw()` or `knn_annoy()`. +#' @param knn_to_graph_method (function) Function to convert the knn object returned from `knn_method` to a graph adjacency matrix. +#' Must be a (optionally partialized) version of `knn_to_graph()`, `knn_to_snn_graph()` or `knn_to_geodesic_graph()`. 
+#' @param graph_to_cluster_method (function) Clustering algorithm that converts a graph adjacency matrix +#' returned from `knn_to_graph_method` into cluster labels for each cell. +#' Must be a (optionally partialized) version of `cluster_graph_leiden()`, `cluster_graph_louvain()` or `cluster_graph_seurat()`. +#' @param threads (integer) Number of threads to use in `knn_method`, `knn_to_graph_method` and `graph_to_cluster_method`. If these functions do not utilize +#' a `threads` argument, this is silently ignored. +#' @param verbose (logical) Whether to print progress information in `knn_method`, `knn_to_graph_method` and `graph_to_cluster_method`. If these functions do not utilize +#' a `verbose` argument, this is silently ignored. +#' @returns (factor) Factor vector containing the cluster assignment for each cell. +#' @details +#' `cluster_cells_graph()` acts as a helper function to wrap input creation and `kNN` graph adjacency-based clustering to be done together. The user +#' can also manually pass cell embeddings to their preferred knn/clustering functions of choice. +#' +#' **Clustering customization through partialized parameters** +#' +#' Customization of clustering is possible through partialization of each parameter in `cluster_cells_graph()` that is a function. +#' In detail, each parameter that requests a function +#' may take in one with only some of the arguments provided. If the first argument is not provided, a copy of a function is utilized that has its parameters +#' changed with the arguments provided. 
+#' +#' For instance, if the user desires for `cluster_cells_graph()` to instead use `cluster_graph_louvain()` with resolution different than the default, +#' they can instead call `cluster_cells_graph()` like so: +#' `cluster_cells_graph(mat, graph_to_cluster_method = cluster_graph_louvain(resolution = 0.5))` +#' @seealso `knn_hnsw()` `knn_annoy()` `knn_to_graph()` `knn_to_snn_graph()` `knn_to_geodesic_graph()` `cluster_graph_leiden()` `cluster_graph_louvain()` `cluster_graph_seurat()` +#' @export +cluster_cells_graph <- function( + mat, knn_method = knn_hnsw, + knn_to_graph_method = knn_to_geodesic_graph, + graph_to_cluster_method = cluster_graph_leiden, + threads = 0L, verbose = FALSE +) { + assert_is_wholenumber(threads) + assert_is(verbose, "logical") + if (rlang::is_missing(mat)) return(create_partial()) + assert_is(mat, "matrix") + # There currently aren't any `knn_to_graph` functions that utilize a verbose argument. + # However, we still pass `verbose` in case future functions do provide this functionality. + mat <- partial_apply(knn_method, threads = threads, verbose = verbose, .missing_args_error = FALSE)(mat) + if (!is_knn_object(mat)) pretty_error(mat, "`knn_method` was unable to convert `mat` into a knn object", 1) + # Return type has to be constrained to "matrix", so this is silently provided. + mat <- partial_apply(knn_to_graph_method, threads = threads, verbose = verbose, return_type = "matrix", .missing_args_error = FALSE)(mat) + if (!is_adjacency_matrix(mat)) pretty_error(mat, "`knn_to_graph_method` was unable to convert `mat` from a knn object to a graph adjacency matrix", 1) + # Also pass verbose and threads to clustering functions in case they are given these params in the future + mat <- partial_apply(graph_to_cluster_method, threads = threads, verbose = verbose, .missing_args_error = FALSE)(mat) + return(mat) +} + #' K Nearest Neighbor (KNN) Graph #' #' Convert a KNN object (e.g. 
returned by `knn_hnsw()` or `knn_annoy()`) into @@ -22,6 +100,11 @@ #' Sparse matrix (dgCMatrix) where `mat[i,j]` = distance from cell `i` to #' cell `j`, or 0 if cell `j` is not in the K nearest neighbors of `i` knn_to_graph <- function(knn, use_weights = FALSE, self_loops = TRUE) { + assert_is(use_weights, "logical") + assert_is(self_loops, "logical") + + if (rlang::is_missing(knn)) return(create_partial()) + assert_true(is_knn_object(knn)) if (use_weights) { weights <- knn$dist } else { @@ -41,7 +124,6 @@ knn_to_graph <- function(knn, use_weights = FALSE, self_loops = TRUE) { mat } - #' @rdname knn_graph #' @details **knn_to_snn_graph** #' Convert a knn object into a shared nearest neighbors adjacency matrix. @@ -63,6 +145,9 @@ knn_to_graph <- function(knn, use_weights = FALSE, self_loops = TRUE) { #' @export knn_to_snn_graph <- function(knn, min_val = 1 / 15, self_loops = FALSE, return_type=c("matrix", "list")) { return_type <- match.arg(return_type) + assert_is(self_loops, "logical") + if (rlang::is_missing(knn)) return(create_partial()) + assert_true(is_knn_object(knn)) # Solve x / (2*K - x) >= min_val --> x >= 2*K*min_val / (1 + min_val) min_int <- ceiling(2*min_val*ncol(knn$idx) / (1 + min_val)) snn <- build_snn_graph_cpp(knn$idx, min_neighbors = min_int) @@ -83,13 +168,15 @@ knn_to_snn_graph <- function(knn, min_val = 1 / 15, self_loops = FALSE, return_t } # Return as a sparse matrix - Matrix::sparseMatrix( + res <- Matrix::sparseMatrix( i = snn$i + 1L, j = snn$j + 1L, x = snn$weight, dims = c(snn$dim, snn$dim) ) + return(res) } #' @rdname knn_graph +#' @param threads Number of threads to use during calculations #' @details **knn_to_geodesic_graph** #' Convert a knn object into an undirected weighted graph, using the same #' geodesic distance estimation method as the UMAP package. 
@@ -101,7 +188,6 @@ knn_to_snn_graph <- function(knn, min_val = 1 / 15, self_loops = FALSE, return_t #' neighbor, results may differ slightly from `umap._umap.fuzzy_simplicial_set`, which #' assumes self is always successfully found in the approximate nearest neighbor search. #' -#' @param threads Number of threads to use during calculations #' @return **knn_to_geodesic_graph** #' - `return_type == "matrix"`: #' Sparse matrix (dgCMatrix) where `mat[i,j]` = normalized similarity between cell `i` and cell `j`. @@ -111,29 +197,31 @@ knn_to_snn_graph <- function(knn, min_val = 1 / 15, self_loops = FALSE, return_t #' These correspond to the rows, cols, and values of non-zero entries in the lower triangle #' adjacency matrix. `dim` is the total number of vertices (cells) in the graph #' @export -knn_to_geodesic_graph <- function(knn, return_type=c("matrix", "list"), threads=0L) { +knn_to_geodesic_graph <- function(knn, return_type = c("matrix", "list"), threads = 0L) { return_type <- match.arg(return_type) + assert_is_wholenumber(threads) + if (rlang::is_missing(knn)) return(create_partial()) + assert_true(is_knn_object(knn)) graph <- build_umap_graph_cpp(knn$dist, knn$idx, threads=threads) graph$dim <- nrow(knn$idx) - if (return_type == "list") { - return(graph) - } + if (return_type == "list") return(graph) # Return as a sparse matrix - Matrix::sparseMatrix( + res <- Matrix::sparseMatrix( i = graph$i + 1L, j = graph$j + 1L, x = graph$weight, dims = c(graph$dim, graph$dim) ) + return(res) } #' Cluster an adjacency matrix -#' @rdname cluster +#' @rdname cluster_graph #' @details **cluster_graph_leiden**: Leiden clustering algorithm `igraph::cluster_leiden()`. #' Note that when using `objective_function = "CPM"` the number of clusters empirically scales with `cells * resolution`, #' so 1e-3 is a good resolution for 10k cells, but 1M cells is better with a 1e-5 resolution. A resolution of 1 is a #' good default when `objective_function = "modularity"` per the default. 
-#' @param snn Symmetric adjacency matrix (dgCMatrix) output from e.g. `knn_to_snn_graph()` or `knn_to_geodesic_graph()`. Only the lower triangle is used +#' @param mat Symmetric adjacency matrix (dgCMatrix) output from e.g. `knn_to_snn_graph()` or `knn_to_geodesic_graph()`. Only the lower triangle is used. #' @param resolution Resolution parameter. Higher values result in more clusters #' @param objective_function Graph statistic to optimize during clustering. Modularity is the default as it keeps resolution independent of dataset size (see details below). #' For the meaning of each option, see `igraph::cluster_leiden()`. @@ -141,44 +229,55 @@ knn_to_geodesic_graph <- function(knn, return_type=c("matrix", "list"), threads= #' @param ... Additional arguments to underlying clustering function #' @return Factor vector containing the cluster assignment for each cell. #' @export -cluster_graph_leiden <- function(snn, resolution = 1, objective_function = c("modularity", "CPM"), seed = 12531, ...) { +cluster_graph_leiden <- function( + mat, resolution = 1, objective_function = c("modularity", "CPM"), + seed = 12531, ... +) { assert_has_package("igraph") # Set seed without permanently changing seed state + if (rlang::is_missing(mat)) return(create_partial()) prev_seed <- get_seed() on.exit(restore_seed(prev_seed), add = TRUE) set.seed(seed) objective_function <- match.arg(objective_function) - igraph::graph_from_adjacency_matrix(snn, weighted = TRUE, diag = FALSE, mode = "lower") %>% + igraph::graph_from_adjacency_matrix(mat, weighted = TRUE, diag = FALSE, mode = "lower") %>% igraph::cluster_leiden(resolution_parameter = resolution, objective_function=objective_function, ...) 
%>% igraph::membership() %>% as.factor() } -#' @rdname cluster +#' @rdname cluster_graph #' @details **cluster_graph_louvain**: Louvain graph clustering algorithm `igraph::cluster_louvain()` #' @export -cluster_graph_louvain <- function(snn, resolution = 1, seed = 12531) { +cluster_graph_louvain <- function( + mat, resolution = 1, seed = 12531 +) { assert_has_package("igraph") # Set seed without permanently changing seed state + if (rlang::is_missing(mat)) return(create_partial()) + prev_seed <- get_seed() on.exit(restore_seed(prev_seed), add = TRUE) set.seed(seed) - igraph::graph_from_adjacency_matrix(snn, weighted = TRUE, diag = FALSE, mode = "lower") %>% + igraph::graph_from_adjacency_matrix(mat, weighted = TRUE, diag = FALSE, mode = "lower") %>% igraph::cluster_louvain(resolution = resolution) %>% igraph::membership() %>% as.factor() } -#' @rdname cluster +#' @rdname cluster_graph #' @details **cluster_graph_seurat**: Seurat's clustering algorithm `Seurat::FindClusters()` #' @export -cluster_graph_seurat <- function(snn, resolution = 0.8, ...) { +cluster_graph_seurat <- function( + mat, resolution = 0.8, ... +) { assert_has_package("Seurat") - Seurat::as.Graph(snn) %>% + if (rlang::is_missing(mat)) return(create_partial()) + Seurat::as.Graph(mat) %>% Seurat::FindClusters(resolution = resolution, ...) %>% .[[1]] } @@ -213,7 +312,7 @@ cluster_membership_matrix <- function(groups, group_order = NULL) { } -#' Get a knn matrix from reduced dimensions +#' Get a knn object from reduced dimensions #' #' Search for approximate nearest neighbors between cells in the reduced #' dimensions (e.g. PCA), and return the k nearest neighbors (knn) for each @@ -228,7 +327,7 @@ cluster_membership_matrix <- function(groups, group_order = NULL) { #' @param metric distance metric to use #' @param threads Number of threads to use. Note that result is non-deterministic #' if threads > 1 -#' @param ef ef parameter for RccppHNSW::hnsw_search. 
Increase for slower search but +#' @param ef ef parameter for `RcppHNSW::hnsw_search()`. Increase for slower search but #' improved accuracy #' @param verbose whether to print progress information during search #' @return List of 2 matrices -- idx for cell x K neighbor indices, @@ -238,6 +337,10 @@ cluster_membership_matrix <- function(groups, group_order = NULL) { #' @export knn_hnsw <- function(data, query = NULL, k = 10, metric = c("euclidean", "cosine"), verbose = TRUE, threads = 1, ef = 100) { metric <- match.arg(metric) + assert_is(verbose, "logical") + assert_is_wholenumber(threads) + assert_has_package("RcppHNSW") + if (rlang::is_missing(data)) return(create_partial()) index <- RcppHNSW::hnsw_build( data, distance = metric, @@ -271,8 +374,11 @@ knn_hnsw <- function(data, query = NULL, k = 10, metric = c("euclidean", "cosine #' @param n_trees Number of trees during index build time. More trees gives higher accuracy #' @param search_k Number of nodes to inspect during the query, or -1 for default value. 
Higher number gives higher accuracy #' @export -knn_annoy <- function(data, query = data, k = 10, metric = c("euclidean", "cosine", "manhattan", "hamming"), n_trees = 50, search_k = -1) { +knn_annoy <- function(data, query = NULL, k = 10, metric = c("euclidean", "cosine", "manhattan", "hamming"), n_trees = 50, search_k = -1) { metric <- match.arg(metric) + assert_has_package("RcppAnnoy") + if (rlang::is_missing(data)) return(create_partial()) + if (is.null(query)) query <- data annoy <- switch(metric, "euclidean" = new(RcppAnnoy::AnnoyEuclidean, ncol(data)), "cosine" = new(RcppAnnoy::AnnoyAngular, ncol(data)), @@ -294,5 +400,6 @@ knn_annoy <- function(data, query = data, k = 10, metric = c("euclidean", "cosin dist[i, ] <- res$dist } if (metric == "cosine") dist <- 0.5 * (dist * dist) - list(idx = idx, dist = dist) + res <- list(idx = idx, dist = dist) + return(res) } diff --git a/r/R/errorChecking.R b/r/R/errorChecking.R index 7691ab98..ed24087d 100644 --- a/r/R/errorChecking.R +++ b/r/R/errorChecking.R @@ -113,7 +113,12 @@ assert_is <- function(object, class, n = 1) { if (!match) pretty_error(object, sprintf("must have class %s", paste0(class, collapse = ", or ")), n) } } - +assert_is_mat <- function(object, n = 1) { + # Accept any object that can be coerced to an IterableMatrix (e.g. base matrix, dgCMatrix) + if (!canCoerce(object, "IterableMatrix")) { + pretty_error(object, "must either be an IterableMatrix or coercible to an IterableMatrix", n) + } +} assert_true <- function(expr, n = 1) { if (!expr) pretty_error(expr, "is not true", n) } diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index c77a9f26..11e5269e 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -1,4 +1,4 @@ -# Copyright 2023 BPCells contributors +# Copyright 2025 BPCells contributors # # Licensed under the Apache License, Version 2.0 or the MIT license @@ -6,6 +6,700 @@ # option. This file may not be copied, modified, or distributed # except according to those terms. 
+ +################# +# Feature selection +################# + +#' Feature selection functions +#' +#' Apply a feature selection method to a non-normalized `(features x cells)` matrix. We recommend using counts matrices as input and to +#' apply any normalizations prior to feature selection via the normalize argument (if available). +#' Instead of directly subsetting the input matrix, +#' an output dataframe is returned, indicating which features are highly variable, and the scoring of each feature. +#' @rdname feature_selection +#' @param mat (IterableMatrix) Counts matrix with dimensions `(features x cells)`. +#' @param num_feats (float) Number of features to mark as highly_variable. If 0 < `num_feats` < 1, then interpret as a fraction of features. +#' @param normalize_method (function) Used to normalize the matrix prior to feature selection by calling `normalize_method(mat)` if it is not `NULL`. +#' For example, pass `normalize_log()` or `normalize_tfidf()`. +#' @param threads (integer) Number of threads to use. Also overrides the threads argument in `normalize_method` +#' @returns +#' Return a dataframe with the following columns: +#' - `feature`: Feature name. +#' - `score`: Scoring of the feature, depending on the method used. +#' - `highly_variable`: Logical vector of whether the feature is highly variable. +#' +#' Each different feature selection method will have a different scoring method. +#' Consider a matrix \eqn{X}, where the row index \eqn{i} refers to each feature +#' and the column index \eqn{j} refers to each cell. 
For each feature \eqn{x_{i} \in X}, we define the following feature-selection scores: +#' - `select_features_variance`: \eqn{\mathrm{Score}(x_i) = \frac{1}{n - 1} \sum_{j=1}^{n} \bigl(x_{ij} - \bar{x}_i\bigr)^2} +#' @examples +#' ## Prep data +#' set.seed(12345) +#' mat <- matrix(rpois(5*8, lambda=1), nrow=5, ncol=8) +#' rownames(mat) <- paste0("gene", seq_len(nrow(mat))) +#' mat +#' mat <- as(mat, "IterableMatrix") +#' +#' +#' ####################################################################### +#' ## select_features_variance() examples +#' ####################################################################### +#' select_features_variance( +#' mat, +#' num_feats=2, +#' normalize_method=normalize_log +#' ) +#' +#' # Because of how the BPCells `normalize` functions behave when the matrix +#' # argument is missing, we can also customize the normalization parameters +#' # using partial arguments: +#' variable_features <- select_features_variance( +#' mat, +#' num_feats=2, +#' normalize_method=normalize_log(scale_factor=20) +#' ) +#' # One can then filter to only variable features using the subset operator: +#' mat[variable_features$feature[variable_features$highly_variable],] +#' +#' +#' @seealso `normalize_tfidf()` `normalize_log()` +#' @export +select_features_variance <- function( + mat, num_feats = 0.05, + normalize_method = NULL, + threads = 1L +) { + assert_greater_than_zero(num_feats) + assert_len(num_feats, 1) + assert_is_wholenumber(threads) + if (rlang::is_missing(mat)) return(create_partial()) + assert_is_mat(mat) + + if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats) + if (min(max(num_feats, 0), nrow(mat)) != num_feats) { + rlang::warn(add_timestamp(sprintf("Number of features asked for (%s) is greater than the number of features in the matrix (%s).", num_feats, nrow(mat)))) + } + num_feats <- min(max(num_feats, 0), nrow(mat)) + if (!is.null(normalize_method)) mat <- partial_apply(normalize_method, threads = threads, 
.missing_args_error = FALSE)(mat) + features_df <- tibble::tibble( + feature = rownames(mat), + score = matrix_stats(mat, row_stats = "variance", threads = threads)$row_stats["variance",] + ) %>% + dplyr::mutate(highly_variable = dplyr::row_number(dplyr::desc(score)) <= num_feats) + return(features_df) +} + + +#' @rdname feature_selection +#' @returns +#' - `select_features_dispersion`: \eqn{\mathrm{Score}(x_i) = \frac{\frac{1}{n - 1} \sum_{j=1}^{n} \bigl(x_{ij} - \bar{x}_i\bigr)^2}{\bar{x}_i}} +#' @examples +#' ####################################################################### +#' ## select_features_dispersion() example +#' ####################################################################### +#' select_features_dispersion( +#' mat, +#' num_feats = 2, +#' normalize_method = normalize_log +#' ) +#' +#' +#' @export +select_features_dispersion <- function( + mat, num_feats = 0.05, + normalize_method = NULL, + threads = 1L +) { + assert_greater_than_zero(num_feats) + assert_len(num_feats, 1) + assert_is_wholenumber(threads) + if (rlang::is_missing(mat)) return(create_partial()) + if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats) + if (min(max(num_feats, 0), nrow(mat)) != num_feats) { + rlang::warn(add_timestamp(sprintf("Number of features asked for (%s) is greater than the number of features in the matrix (%s).", num_feats, nrow(mat)))) + } + num_feats <- min(max(num_feats, 0), nrow(mat)) + if (!is(mat, "IterableMatrix") && canCoerce(mat, "IterableMatrix")) mat <- as(mat, "IterableMatrix") + assert_is(mat, "IterableMatrix") + if (!is.null(normalize_method)) mat <- partial_apply(normalize_method, threads = threads, .missing_args_error = FALSE)(mat) + mat_stats <- matrix_stats(mat, row_stats = "variance", threads = threads) + features_df <- tibble::tibble( + feature = rownames(mat), + score = mat_stats$row_stats["variance", ] / mat_stats$row_stats["mean", ] + ) %>% + dplyr::mutate(highly_variable = dplyr::row_number(dplyr::desc(score)) 
<= num_feats) + return(features_df) +} + +#' @rdname feature_selection +#' @returns +#' - `select_features_mean`: \eqn{\mathrm{Score}(x_i) = \frac{\sum_{j=1}^{n}\bigl(x_{ij}\bigr)}{n}} +#' @examples +#' ####################################################################### +#' ## select_features_mean() example +#' ####################################################################### +#' select_features_mean( +#' mat, +#' num_feats = 1, +#' normalize_method = normalize_log +#' ) +#' +#' +#' @export +select_features_mean <- function(mat, num_feats = 0.05, normalize_method = NULL, threads = 1L) { + assert_greater_than_zero(num_feats) + assert_is_wholenumber(threads) + if (rlang::is_missing(mat)) return(create_partial()) + assert_is_mat(mat) + if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats) + if (min(max(num_feats, 0), nrow(mat)) != num_feats) { + rlang::warn(add_timestamp(sprintf("Number of features asked for (%s) is greater than the number of features in the matrix (%s).", num_feats, nrow(mat)))) + } + num_feats <- min(max(num_feats, 0), nrow(mat)) + if (!is.null(normalize_method)) mat <- partial_apply(normalize_method, threads = threads, .missing_args_error = FALSE)(mat) + # get the sum of each feature, binarized + features_df <- tibble::tibble( + feature = rownames(mat), + score = matrix_stats(mat, row_stats = "mean", threads = threads)$row_stats["mean", ] + ) %>% + dplyr::mutate(highly_variable = dplyr::row_number(dplyr::desc(score)) <= num_feats) + return(features_df) +} + +#' @rdname feature_selection +#' @param n_bins (integer) Number of bins to split features into in order to control for the relationship between mean expression and dispersion (see details). +#' @returns +#' - `select_features_binned_dispersion`: Process described in `details`. +#' @details +#' `select_features_binned_dispersion` implements the approach from Satija et al. 2015: +#' 1. Bin features into equal-width bins by `log1p(mean)` +#' 2. 
Calculate dispersion of each feature as `log(variance / mean)` +#' 3. Z-score normalize dispersion within each bin, and select highest normalized dispersion across all bins +#' +#' If the number of features within a bin is equal to 1, then the mean dispersion for that bin is set to 1. +#' +#' This should be equivalent to `Seurat::FindVariableFeatures()` with `selection.method="mean.var.plot"` +#' and `scanpy.pp.highly_variable_genes()` with `flavor="seurat"`. +#' @examples +#' ####################################################################### +#' ## select_features_binned_dispersion() example +#' ####################################################################### +#' select_features_binned_dispersion( +#' mat, +#' num_feats = 2, +#' n_bins = 2 +#' ) +#' +#' +#' @export +select_features_binned_dispersion <- function( + mat, num_feats = 0.05, n_bins = 20, + threads = 1L +) { + assert_greater_than_zero(num_feats) + assert_len(num_feats, 1) + assert_is_wholenumber(n_bins) + assert_len(n_bins, 1) + assert_greater_than_zero(n_bins) + assert_is_wholenumber(threads) + if (rlang::is_missing(mat)) return(create_partial()) + assert_is_mat(mat) + if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats) + if (min(max(num_feats, 0), nrow(mat)) != num_feats) { + rlang::warn(add_timestamp(sprintf("Number of features asked for (%s) is greater than the number of features in the matrix (%s).", num_feats, nrow(mat)))) + } + num_feats <- min(max(num_feats, 0), nrow(mat)) + # Calculate row information for dispersion + mat_stats <- matrix_stats(mat, row_stats = c("variance"), threads = threads) + feature_means <- mat_stats$row_stats["mean", ] + feature_vars <- mat_stats$row_stats["variance", ] + # Calculate dispersion, and log normalize + feature_dispersion <- feature_vars / feature_means + feature_dispersion[feature_dispersion == 0] <- NA + feature_dispersion <- log(feature_dispersion) + feature_dispersion[feature_means == 0] <- 0 + feature_means <- 
log1p(feature_means) + features_df <- tibble::tibble( + feature = names(feature_means), + var = feature_vars, + mean = feature_means, + dispersion = feature_dispersion + ) + # Bin by mean, and normalize dispersion with each bin + features_df <- features_df %>% + dplyr::mutate(bin = cut(mean, n_bins, labels = FALSE)) %>% + dplyr::group_by(bin) %>% + dplyr::mutate( + score = (dispersion - mean(dispersion)) / sd(dispersion), + score = if (dplyr::n() == 1) {1} else {score} # Set feats that are in bins with only one feat to have a norm dispersion of 1 + ) %>% + dplyr::ungroup() %>% + dplyr::mutate(highly_variable = dplyr::row_number(dplyr::desc(score)) <= num_feats) %>% + dplyr::select(c("feature", "score", "highly_variable", "dispersion", "bin")) %>% + dplyr::rename("raw_log_dispersion" = "dispersion") + return(features_df) +} + + +################# +# DimReduction Class Definition +################# + +#' Barebones definition of a DimReduction class. +#' +#' Represents a latent space output of a matrix after a transformation function, with any required information to reproject other inputs using this object. +#' Child classes should implement a `project` method to allow for the projection of other matrices using +#' the fitted transformation object. +#' @rdname DimReduction +#' @field cell_embeddings (IterableMatrix, dgCMatrix, matrix) Projected data of shape `(cells x n_dimensions)` of the original matrix after a dimensionality reduction. +#' @field fitted_params (list) A list of parameters used for the transformation of a matrix. This should include all necessary information to project new data with the same features. +#' @field feature_names (character) The names of the features that this DimReduction object was fit on. Matrices to be projected should have the same feature names. +#' @return - `DimReduction()`: DimReduction object. +#' @export +DimReduction <- function(cell_embeddings, fitted_params = list(), feature_names = character(0), ...) 
{ + assert_is(cell_embeddings, c("IterableMatrix", "dgCMatrix", "matrix")) + assert_is(fitted_params, "list") + structure(list( + cell_embeddings = cell_embeddings, + fitted_params = fitted_params, + feature_names = feature_names, + ... + ), + class = "DimReduction" + ) +} + +#' @rdname DimReduction +#' @param mat (IterableMatrix) Input matrix of shape `(features x cells)`. +#' @param x DimReduction object. +#' @return - `project()`: IterableMatrix object of the projected data. +#' @details +#' `project()`: Perform a dimensionality reduction on a matrix using a pre-fit DimReduction object. +#' +#' DimReduction subclasses should use the `project` method on new data with the same features, to project into the same latent space. +#' All required information to run a projection should be held in `x$fitted_params`, including pertinent parameters when constructing the DimReduction subclass object. +#' +#' If there are no rownames, assume that the matrix is in the same order as the original matrix, assuming that they have the same number of features. +#' If there are rownames, reorder the matrix to match the order of the original matrix. +#' @inheritParams DimReduction +#' @export +project <- function(x, mat, ...) { + UseMethod("project") +} +#' @export +project.default <- function(x, mat, ...) { + rlang::abort("project method not implemented for objects that are not a fitted DimReduction") +} + +#' @export +print.DimReduction <- function(x, ...) 
{ + cat(sprintf("Fitted <%s> dimensionality reduction\n\n", class(x)[1])) + + # Print feature info + cat("Number of features:", length(x$feature_names), "\n") + cat("Input feature names:", pretty_print_vector(x$feature_names), "\n") + + # Print embedding info + dim_embeddings <- dim(x$cell_embeddings) + cat(sprintf("cell_embeddings: %d cells x %d embedding dimensions\n", dim_embeddings[1], dim_embeddings[2])) + + # Print param info + cat("fitted_params: ", stringr::str_wrap(paste(names(x$fitted_params), collapse = ", "), exdent = 2, width = 60), "\n") +} + +################# +# LSI Implementation +################# + + +#' Perform latent semantic indexing (LSI) on a matrix. +#' +#' Given a `(features x cells)` counts matrix, perform LSI, which sequentially executes tf-idf normalization and PCA to create a latent space representation +#' of the matrix of shape `(n_dimensions, ncol(mat))`. +#' Returns a DimReduction object, which allows for projection of new matrices with the same features into the same latent space. +#' @rdname LSI +#' @param mat (IterableMatrix) Counts matrix of shape `(features x cells)`. +#' @param n_dimensions (integer) Number of dimensions to keep during PCA. +#' @param corr_cutoff (numeric) Numeric filter for the correlation of a PC to the sequencing depth. If the PC has a correlation that is greater or equal to +#' the corr_cutoff, it will be excluded from the final PCA matrix. +#' @param scale_factor (numeric) Scaling factor to multiply matrix by during tf-idf normalization. +#' @param threads (integer) Number of threads to use. 
+#' @returns +#' `LSI()` An object of class `c("LSI", "DimReduction")` with the following attributes: +#' - `cell_embeddings`: The projected data as a matrix of shape `(cells, n_dimensions)` +#' - `fitted_params`: A named list of the parameters used for LSI, including the following: +#' - `scale_factor`: The scale factor used for tf-idf normalization +#' - `feature_means`: The means of the features used for normalization +#' - `pcs_to_keep`: The PCs that were kept after filtering by correlation to sequencing depth +#' - `feature_loadings`: SVD component with dimension `(n_variable_features, n_dimensions)` used to linearly transform input data into cell embeddings +#' - `feature_names`: The names of the features in the matrix +#' @details Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. +#' +#' Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: +#' - 17.1 MB memory usage, 25.1 seconds runtime +#' @seealso `project()` `DimReduction()` `normalize_tfidf()` `normalize_log()` `svds()` +#' @examples +#' ## Prep data +#' nrows <- 50 +#' ncols <- 1000 +#' mat <- matrix(1:(nrows*ncols), nrow = nrows) %>% as("IterableMatrix") +#' rownames(mat) <- paste0("feat", seq(nrows)) +#' colnames(mat) <- paste0("cell", seq(ncols)) +#' +#' +#' ####################################################################### +#' ## LSI() example +#' ####################################################################### +#' lsi_result <- LSI(mat, n_dimensions = 10) +#' lsi_result +#' +#' +#' @export +LSI <- function( + mat, n_dimensions = 50L, corr_cutoff = 1, scale_factor = 1e4, + threads = 1L, verbose = FALSE +) { + if (rlang::is_missing(mat)) { + return(create_partial()) + } + assert_is_mat(mat) + assert_is_wholenumber(n_dimensions) + assert_len(n_dimensions, 1) + assert_greater_than_zero(n_dimensions) + + assert_true(n_dimensions < min(ncol(mat), nrow(mat))) +
assert_true((corr_cutoff >= 0) && (corr_cutoff <= 1)) + assert_is_wholenumber(threads) + + if (verbose) log_progress("Starting LSI") + # Normalization + if (verbose) log_progress("Normalizing matrix") + mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean"), threads = threads) + read_depth <- mat_stats$col_stats["mean", ] * nrow(mat) + mat <- normalize_tfidf( + mat = mat, + feature_means = mat_stats$row_stats["mean", ], + scale_factor = scale_factor, + threads = threads + ) + # Save to prevent repeating queued calculations during SVD + temp_mat_dir <- tempfile("mat") + mat <- write_matrix_dir( + convert_matrix_type(mat, type = "float"), + temp_mat_dir, compress = TRUE + ) + on.exit(unlink(temp_mat_dir, recursive = TRUE, expand = FALSE)) + # Run pca + if (verbose) log_progress("Calculating SVD") + svd_attr <- svds(mat, k = n_dimensions, threads = threads) + pca_res <- svd_attr$v %*% diag(svd_attr$d) + rownames(pca_res) <- colnames(mat) + + # Filter out PCs that are highly correlated with sequencing depth + pca_corrs <- abs(cor(read_depth, pca_res)) + pca_feats_to_keep <- which(pca_corrs < corr_cutoff) + if (length(pca_feats_to_keep) != n_dimensions) { + if (verbose) log_progress(sprintf("Dropping PCs %s due to high correlation with sequencing depth", paste(setdiff(1:n_dimensions, pca_feats_to_keep), collapse = ", "))) + pca_res <- pca_res[, pca_feats_to_keep, drop = FALSE] + } + fitted_params <- list( + scale_factor = scale_factor, + feature_means = mat_stats$row_stats["mean", ], + pcs_to_keep = pca_feats_to_keep, + feature_loadings = svd_attr$u + ) + res <- DimReduction( + cell_embeddings = pca_res, + fitted_params = fitted_params, + feature_names = rownames(mat) + ) + class(res) <- c("LSI", class(res)) + return(res) +} +#' @rdname LSI +#' @return +#' `project()` IterableMatrix of the projected data of shape `(n_dimensions, ncol(mat))`. 
+#' @inheritParams project +#' @examples +#' ####################################################################### +#' ## project() example +#' ####################################################################### +#' dim(project(lsi_result, mat)) +#' +#' +#' @export +project.LSI <- function(x, mat, threads = 1L, ...) { + assert_is_mat(mat) + assert_is(x, "LSI") + + fitted_params <- x$fitted_params + if (!is.null(rownames(mat)) && !is.null(x$feature_names)) { + assert_true(all(x$feature_names %in% rownames(mat))) + mat <- mat[x$feature_names, ] + } + # Do a check to make sure that the number of rows in the matrix is the same as the number of rows in SVD$u + assert_true(nrow(mat) == nrow(fitted_params$feature_loadings)) + mat <- normalize_tfidf( + mat, + feature_means = fitted_params$feature_means, + scale_factor = fitted_params$scale_factor, + threads = threads + ) + feature_loadings <- fitted_params$feature_loadings + res <- t(mat) %*% feature_loadings + if (length(fitted_params$pcs_to_keep) != ncol(res)) { + res <- res[, fitted_params$pcs_to_keep, drop = FALSE] + } + return(res) +} + + + +#' Run Iterative LSI on a matrix. +#' +#' Given a `(features x cells)` counts matrix, perform IterativeLSI to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. +#' This uses the method described in [ArchR](https://doi.org/10.1038/s41588-021-00790-6) (Granja et al; 2019). +#' See details for more specific information. Returns a DimReduction object, which allows for projection of matrices with the same features into the same latent space. +#' @rdname IterativeLSI +#' @param mat (IterableMatrix) Counts matrix of shape `(features x cells)`. +#' @param n_iterations (int) The number of LSI iterations to perform. +#' @param feature_selection_method (function) Method to use for selecting features for LSI.
+#' Current builtin options are `select_features_variance`, `select_features_dispersion`, `select_features_mean`, `select_features_binned_dispersion` +#' @param cluster_method (function) Method to use for clustering the post-SVD matrix. +#' The user can pass in partial parameters to the cluster method, such as by passing +#' `cluster_cells_graph(graph_to_cluster_method = cluster_graph_louvain(resolution = 0.5))` into `cluster_method`. +#' @param threads (integer) Number of threads to use. Also gets passed down into `feature_selection_method` and `cluster_method` +#' @return +#' `IterativeLSI()` An object of class `c("IterativeLSI", "DimReduction")` with the following attributes: +#' - `cell_embeddings`: The projected data as a matrix of shape `(cells, n_dimensions)` +#' - `feature_names`: The names of features that `IterativeLSI()` was fit on. +#' Note that projection only requires the features used in each specific iteration (as described in `iter_info`) +#' - `fitted_params`: A list of the parameters used for iterative LSI. Includes the following: +#' - `lsi_method`: The method used for LSI +#' - `feature_selection_method`: The method used for selecting features +#' - `cluster_method`: The method used for clustering +#' - `feature_means`: The means of the features used for tf-idf normalization +#' - `iterations`: The number of LSI iterations run +#' - `iter_info`: A tibble of iteration info with rows as iterations. Columns include the following: +#' - `iteration`: The iteration number +#' - `feature_names`: The names of the features used for the iteration +#' - `feature_loadings`: SVD component `u` with dimension `(n_variable_features, n_dimensions)` +#' - `pcs_to_keep`: The PCs that were kept after filtering by correlation to sequencing depth +#' - `clusters`: The clusters for the iteration.
This is blank for the first iteration +#' @details +#' The Iterative LSI method is as follows: +#' - First iteration: +#' - Select features using `feature_selection_method` +#' - Perform LSI on the selected features +#' - If `n_iterations` is 1, return the projected data from the first PCA projection +#' - Else, cluster the LSI results using `cluster_method` +#' - For each subsequent iteration: +#' - Pseudobulk the clusters from the previous iteration and select the top features based on the pseudobulked clusters +#' - Perform LSI on the selected features +#' - If this is the final iteration, return the projected data from this PCA projection +#' - Else, cluster the LSI results using `cluster_method` +#' +#' There are some minor differences when compared to the ArchR implementation: +#' - ArchR binarizes data prior to feature selection. To replicate this, the user can pass `select_features_variance(normalize_method = binarize)` for their `feature_selection_method`. +#' - `IterativeLSI()` currently does not support utilization of different feature selection methods across each iteration. +#' If one desires to use a different feature selection method for each iteration, they can take the cluster assignments from the previous +#' iteration and use them to select features and run LSI. +#' - ArchR uses a default of 25000 features picked during feature selection. As the number of input features is dependent on the input matrix fed into `IterativeLSI()`, +#' the default for `select_features_variance()` instead picks the number of variable features as a proportion of the total features provided. To mimic the ArchR implementation, +#' `feature_selection_method` can be set to `select_features_variance(num_feats = 25000)`. +#' - ArchR calculates LSI during non-terminal iterations using a default subset of 10000 cells. ArchR does this to prevent a memory bottleneck, +#' which BPCells does not encounter even with a non-subsetted matrix.
Therefore, IterativeLSI will run LSI on the entire matrix for each iteration. +#' - ArchR defaults to using Seurat clustering, which utilizes the Louvain algorithm (See `Seurat::FindClusters()`). In contrast, `IterativeLSI()` utilizes +#' leiden, which should provide the same clustering results while being faster. +#' - ArchR also plots and calculates a umap of every iteration's dimensionality reduction. While this is not implemented in `IterativeLSI()`, +#' one can use the `project()` method with the `iteration` argument set to the desired iteration to get projected data. This can then be fed into `uwot::umap()` +#' - ArchR filters out PCs with a correlation to sequencing depth greater than 0.75. +#' While corr_cutoff is provided as an argument in `IterativeLSI()`, it is set to not remove any PCs by default. +#' - ArchR filters out outliers dependent on number of accessible regions of cells, by the bottom and top quantiles. This is not implemented in `IterativeLSI()`, +#' but can be done as a preprocessing step.
+#' @seealso `LSI()` `DimReduction()` `svds()` +#' `cluster_cells_graph()` `select_features_variance()` `select_features_dispersion()` +#' `select_features_mean()` `select_features_binned_dispersion()` +#' @inheritParams LSI +#' @examples +#' ## Prep data +#' nrows <- 350 +#' ncols <- 2000 +#' mat <- matrix(1:(nrows*ncols), nrow = nrows) %>% as("IterableMatrix") +#' rownames(mat) <- paste0("feat", seq(nrows)) +#' colnames(mat) <- paste0("cell", seq(ncols)) +#' +#' +#' ####################################################################### +#' ## IterativeLSI() examples +#' ####################################################################### +#' dim_reduction <- IterativeLSI(mat, n_dimensions = 5) +#' +#' ## Can customize parameters using partialization +#' dim_reduction <- IterativeLSI( +#' mat, +#' n_dimensions = 10, +#' feature_selection_method = select_features_variance( +#' num_feats = 0.5, +#' normalize_method = normalize_tfidf(scale_factor = 5000) +#' ), +#' cluster_method = cluster_cells_graph( +#' graph_to_cluster_method = cluster_graph_louvain(resolution = 0.5), +#' knn_to_graph_method = knn_to_snn_graph +#' ) +#' ) +#' dim_reduction +#' +#' +#' @export +IterativeLSI <- function( + mat, + n_iterations = 2, + feature_selection_method = select_features_variance, + scale_factor = 1e4, + n_dimensions = 50L, + corr_cutoff = 1, + cluster_method = cluster_cells_graph, + threads = 1L, verbose = FALSE +) { + assert_has_package("RcppHNSW") + assert_is_mat(mat) + assert_greater_than_zero(n_iterations) + assert_is_wholenumber(n_iterations) + assert_is_wholenumber(threads) + assert_is(verbose, "logical") + fitted_params <- list( + lsi_method = LSI(n_dimensions = n_dimensions, corr_cutoff = corr_cutoff, scale_factor = scale_factor, threads = threads), + cluster_method = cluster_method, + feature_selection_method = feature_selection_method, + feature_means = matrix_stats(mat, row_stats = "mean", threads = threads)$row_stats["mean",], + iterations = n_iterations, + 
iter_info = tibble::tibble( + iteration = integer(), + feature_names = list(), + feature_loadings = list(), + pcs_to_keep = list(), + clusters = list() + ) + ) + feature_selection_method <- partial_apply(feature_selection_method, threads = threads, .missing_args_error = FALSE) + if (verbose) log_progress("Starting Iterative LSI") + for (i in seq_len(n_iterations)) { + if (verbose) log_progress(sprintf("Starting Iterative LSI iteration %s of %s", i, n_iterations)) + # run variable feature selection + if (verbose) log_progress("Selecting features") + if (i == 1) { + var_feature_table <- feature_selection_method(mat) + } else { + var_feature_table <- feature_selection_method(pseudobulk_res) + } + variable_features <- var_feature_table %>% dplyr::filter(highly_variable) %>% dplyr::pull(feature) + if (is.character(variable_features)) { + mat_indices <- which(rownames(mat) %in% variable_features) + } else { + mat_indices <- variable_features + } + if (length(mat_indices) < n_dimensions) { + rlang::abort( + sprintf(paste0( + "Number of variable features selected (%s) is smaller than number of dimensions requested (%s). 
\n", + "Try setting the num_feats arg in feature_selection_method as a larger value."), + length(mat_indices), n_dimensions + ) + ) + } + # run LSI + if (verbose) log_progress("Running LSI") + + lsi_res_obj <- LSI( + mat = mat[mat_indices, ], + n_dimensions = n_dimensions, + corr_cutoff = corr_cutoff, + scale_factor = scale_factor, + threads = threads, + verbose = verbose + ) + fitted_params$iter_info <- tibble::add_row( + fitted_params$iter_info, iteration = i, + feature_names = list(variable_features), + feature_loadings = list(lsi_res_obj$fitted_params$feature_loadings), + pcs_to_keep = list(lsi_res_obj$fitted_params$pcs_to_keep) + ) + # only cluster + pseudobulk if this isn't the last iteration + if (i == n_iterations) break + + # cluster the LSI results + if (verbose) log_progress("Clustering LSI results") + clustering_res <- lsi_res_obj$cell_embeddings[, lsi_res_obj$fitted_params$pcs_to_keep, drop = FALSE] %>% + partial_apply(cluster_method, threads = threads, .missing_args_error = FALSE)() + fitted_params$iter_info$clusters[[i]] <- clustering_res + # pseudobulk and pass onto next iteration + if (verbose) log_progress("Pseudobulking matrix") + pseudobulk_res <- pseudobulk_matrix(mat, clustering_res, threads = threads) + rownames(pseudobulk_res) <- rownames(mat) + } + if (verbose) log_progress("Finished running Iterative LSI") + res <- DimReduction( + cell_embeddings = lsi_res_obj$cell_embeddings, + fitted_params = fitted_params, + feature_names = rownames(mat) + ) + class(res) <- c("IterativeLSI", class(res)) + return(res) +} +#' @rdname IterativeLSI +#' @param iteration (integer) Which iteration of `IterativeLSI`'s features and loadings to use for projection. +#' @return +#' `project()` Matrix of the projected data of shape `(cells, n_dimensions)`. 
+#' @inheritParams project +#' @examples +#' ####################################################################### +#' ## project() example +#' ####################################################################### +#' dim(project(dim_reduction, mat)) +#' +#' +#' @export +project.IterativeLSI <- function(x, mat, iteration = x$fitted_params$iterations, threads = 1L, ...) { + assert_is_mat(mat) + assert_is_wholenumber(threads) + assert_is_wholenumber(iteration) + fitted_params <- x$fitted_params + # Get the desired iteration row of iter_info tibble + assert_true(iteration <= x$fitted_params$iterations) + iter_info <- fitted_params$iter_info[iteration, ] + + # Do a check to make sure that the fitted features all exist in input matrix + if (!is.null(rownames(mat)) && !is.null(x$feature_names)) { + assert_true(all(x$feature_names %in% rownames(mat))) + } + # Subset to variable features + if (is.character(iter_info$feature_names[[1]])) { + mat_indices <- which(rownames(mat) %in% iter_info$feature_names[[1]]) + } else { + mat_indices <- iter_info$feature_names[[1]] + } + mat <- mat[mat_indices,] + # Run LSI + # since we don't hold the LSI object, copy the internal logic from `project.LSI()` + lsi_attr <- attr(x$fitted_params$lsi_method, "args") + + mat <- normalize_tfidf( + mat = mat, + feature_means = fitted_params$feature_means, + scale_factor = lsi_attr$scale_factor, + threads = threads + ) + + feature_loadings <- iter_info$feature_loadings[[1]] + res <- t(mat) %*% feature_loadings + if (length(iter_info$pcs_to_keep[[1]]) != ncol(res)) { + res <- res[, iter_info$pcs_to_keep[[1]], drop = FALSE] + } + return(res) +} + #' Test for marker features #' #' Given a features x cells matrix, perform one-vs-all differential @@ -70,6 +764,7 @@ marker_features <- function(mat, groups, method="wilcoxon") { ) } + #' Aggregate counts matrices by cell group or feature. 
#' #' Given a `(features x cells)` matrix, group cells by `cell_groups` and aggregate counts by `method` for each diff --git a/r/R/transforms.R b/r/R/transforms.R index 2a5de994..4a34ff35 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -294,16 +294,26 @@ setMethod("short_description", "TransformBinarize", function(x) { #' comparison to the threshold is >= (strict_inequality=FALSE) #' or > (strict_inequality=TRUE). #' @return binarized IterableMatrix object +#' @examples +#' set.seed(12345) +#' mat <- matrix(rpois(4*5, lambda=1), nrow=4, ncol=5) +#' mat +#' mat <- as(mat, "IterableMatrix") +#' +#' +#' mat_binarized <- binarize(mat, threshold=1) +#' mat_binarized +#' as(mat_binarized, "dgCMatrix") #' @export -binarize <- function(mat, threshold=0, strict_inequality=TRUE) { - assert_is(mat, "IterableMatrix") +binarize <- function(mat, threshold = 0, strict_inequality = TRUE) { assert_is(threshold, "numeric") assert_len(threshold, 1) assert_is(strict_inequality, "logical") - if (strict_inequality == TRUE && threshold < 0) + if (strict_inequality && threshold < 0) stop("binarize threshold must be greater than or equal to zero when strict_inequality is TRUE") - if (strict_inequality == FALSE && threshold <= 0) + if (!strict_inequality && threshold <= 0) stop("binarize threshold must be greater than zero when strict_inequality is FALSE") + assert_is(mat, "IterableMatrix") res <- wrapMatrix("TransformBinarize", convert_matrix_type(mat, "double"), @@ -923,3 +933,107 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { vars_to_regress = vars_to_regress ) } + +################# +# Normalizations +################# + +#' Normalization helper functions +#' +#' Apply standard normalizations to a `(features x cells)` counts matrix. +#' +#' @rdname normalize +#' @param mat (IterableMatrix) Counts matrix with dimensions `(features x cells)`. 
+#' @param scale_factor (numeric) Scaling factor to multiply matrix by prior to normalization (see formulas below). +#' @param threads (integer) Number of threads to use. +#' @returns +#' Consider a matrix \eqn{X}, where the row index \eqn{i} refers to each feature +#' and the column index \eqn{j} refers to each cell. For each element \eqn{{x}_{ij} \in X}, the +#' normalized value \eqn{\tilde{x}_{ij}} is calculated as: +#' +#' - `normalize_log`: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)} +#' @details +#' **Passing normalization functions with non-default arguments** +#' +#' If the `mat` argument is missing, returns a "partial" function: a copy of the original function but with most arguments pre-defined. +#' This can be used to customize normalization parameters in other single cell functions in BPCells (e.g. `select_features_mean()`). +#' +#' **Related functions from other packages** +#' - `normalize_log`: Corresponds to `Seurat::NormalizeData()` with its default "LogNormalize" method.
+#' @examples +#' set.seed(12345) +#' mat <- matrix(rpois(4*5, lambda=1), nrow=4, ncol=5) +#' mat +#' +#' mat <- as(mat, "IterableMatrix") +#' +#' +#' ####################################################################### +#' ## normalize_log() examples +#' ####################################################################### +#' normalize_log(mat) +#' +#' ## normalization functions can also be called with partial arguments +#' partial_log <- normalize_log(scale_factor = 1e5) +#' partial_log +#' +#' partial_log(mat) +#' +#' +#' @export +normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) { + assert_greater_than_zero(scale_factor) + assert_is_wholenumber(threads) + if (rlang::is_missing(mat)) return(create_partial()) + assert_is_mat(mat) + read_depth <- matrix_stats(mat, col_stats = "mean", threads = threads)$col_stats["mean", ] * nrow(mat) + mat <- mat %>% multiply_cols(1 / read_depth) + return(log1p(mat * scale_factor)) +} + + +#' @rdname normalize +#' @param feature_means (numeric, optional) Pre-calculated means of the features to normalize by (`rowMeans(mat)` by default). +#' If `feature_means` has names and `mat` has row names, match values by name. +#' Otherwise, assume `feature_means` has the same length and ordering as the matrix rows. +#' @returns - `normalize_tfidf`: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)} +#' @details - `normalize_tfidf`: This follows the formula from Stuart, Butler et al. 2019, matching the default behavior of `Signac::RunTFIDF()`. This also matches the normalization used within `ArchR::addIterativeLSI()`, but with `binarize = FALSE`. 
+#' @examples +#' ####################################################################### +#' ## normalize_tfidf() example +#' ####################################################################### +#' normalize_tfidf(mat) +#' +#' +#' @export +normalize_tfidf <- function( + mat, feature_means = NULL, + scale_factor = 1e4, threads = 1L +) { + assert_greater_than_zero(scale_factor) + assert_is_wholenumber(threads) + if (rlang::is_missing(mat)) return(create_partial()) + assert_is_mat(mat) + # If feature means are passed in, only need to calculate term frequency + if (is.null(feature_means)) { + mat_stats <- matrix_stats(mat, row_stats = "mean", col_stats = "mean", threads = threads) + feature_means <- mat_stats$row_stats["mean", ] + read_depth <- mat_stats$col_stats["mean", ] * nrow(mat) + } else { + assert_is_numeric(feature_means) + if (!is.null(names(feature_means)) && !is.null(rownames(mat))) { + # Make sure every name in feature means exists in rownames(mat) + assert_true(all(rownames(mat) %in% names(feature_means))) + # subset feature_means to the rownames of mat in the case there is a length mismatch + # but the feature names all exist in feature_means, + feature_means <- feature_means[rownames(mat)] + } else { + assert_len(feature_means, nrow(mat)) + } + read_depth <- matrix_stats(mat, col_stats = "mean", threads = threads)$col_stats["mean", ] * nrow(mat) + } + tf <- mat %>% multiply_cols(1 / read_depth) + idf <- 1 / feature_means + tf_idf_mat <- tf %>% multiply_rows(idf) + return(log1p(tf_idf_mat * scale_factor)) +} \ No newline at end of file diff --git a/r/R/utils.R b/r/R/utils.R index 4ea62d15..b27fea00 100644 --- a/r/R/utils.R +++ b/r/R/utils.R @@ -45,15 +45,91 @@ document_granges <- function( ), intro_noun, bullets) } +# Add current timestamp to a character string. +add_timestamp <- function(msg) { + return(paste0(format(Sys.time(), "%Y-%m-%d %H:%M:%S "), msg)) +} + # Function which prints a message using shell echo. 
# Useful for printing messages from inside mclapply when running in Rstudio. log_progress <- function(msg, add_timestamp = TRUE){ if (add_timestamp) { - msg <- paste0(format(Sys.time(), "%Y-%m-%d %H:%M:%S "), msg) + msg <- add_timestamp(msg) } if (.Platform$GUI == "RStudio") { system(sprintf('echo "%s"', paste0(msg, collapse=""))) } else { message(msg) } -} \ No newline at end of file +} + +#' Helper to create partial functions +#' +#' Automatically creates a partial application of the caller +#' function including all non-missing arguments. +#' +#' @return A `bpcells_partial` object (a function with some extra attributes) +#' @keywords internal +create_partial <- function() { + env <- rlang::caller_env() + fn_sym <- rlang::caller_call()[[1]] + fn <- rlang::caller_fn() + + args <- list() + for (n in names(formals(fn))) { + if (rlang::is_missing(env[[n]])) next + args[[n]] <- env[[n]] + } + + ret <- do.call(partial_apply, c(fn, args)) + attr(ret, "body")[[1]] <- fn_sym + return(ret) +} + + +#' Create partial function calls +#' +#' Specify some but not all arguments to a function. +#' +#' @param f A function +#' @param ... Named arguments to `f` +#' @param .overwrite (bool) If `f` is already an output from +#' `partial_apply()`, whether parameter re-definitions should +#' be ignored or overwrite the existing definitions +#' @param .missing_args_error (bool) If `TRUE`, passing in arguments +#' that are not in the function's signature will raise an error, otherwise +#' they will be ignored +#' @return A `bpcells_partial` object (a function with some extra attributes) +#' @keywords internal +partial_apply <- function(f, ..., .overwrite = TRUE, .missing_args_error = TRUE) { + args <- rlang::list2(...) 
+ + if (is(f, "bpcells_partial")) { + prev_args <- attr(f, "args") + for (a in names(prev_args)) { + if (!(.overwrite && a %in% names(args))) { + args[[a]] <- prev_args[[a]] + } + } + function_name <- attr(f, "body")[[1]] + f <- attr(f, "fn") + } else { + function_name <- rlang::sym(rlang::caller_arg(f)) + } + # See which arguments do not exist in f + missing_args <- which(!names(args) %in% names(formals(f))) + if (length(missing_args) > 0) { + if (.missing_args_error) { + stop(sprintf("Arguments %s are not in the function signature", paste0(names(args)[missing_args], collapse=", "))) + } else { + args <- args[-missing_args]} + } + partial_fn <- do.call(purrr::partial, c(f, args)) + attr(partial_fn, "body")[[1]] <- function_name + structure( + partial_fn, + class = c("bpcells_partial", "purrr_function_partial", "function"), + args = args, + fn = f + ) +} diff --git a/r/man/DimReduction.Rd b/r/man/DimReduction.Rd new file mode 100644 index 00000000..17ee89ce --- /dev/null +++ b/r/man/DimReduction.Rd @@ -0,0 +1,54 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{DimReduction} +\alias{DimReduction} +\alias{project} +\title{Barebones definition of a DimReduction class.} +\usage{ +DimReduction( + cell_embeddings, + fitted_params = list(), + feature_names = character(0), + ... +) + +project(x, mat, ...) +} +\arguments{ +\item{x}{DimReduction object.} + +\item{(IterableMatrix)}{Input matrix of shape \verb{(features x cells)}.} +} +\value{ +\itemize{ +\item \code{DimReduction()}: DimReduction object. +} + +\itemize{ +\item \code{project()}: IterableMatrix object of the projected data. +} +} +\description{ +Represents a latent space output of a matrix after a transformation function, with any required information to reproject other inputs using this object. +Child classes should implement a \code{project} method to allow for the projection of other matrices using +the fitted transformation object.
+} +\details{ +\code{project()}: Perform a dimensionality reduction on a matrix using a pre-fit DimReduction object. + +DimReduction subclasses should use the \code{project} method on new data with the same features, to project into the same latent space. +All required information to run a projection should be held in \code{x$fitted_params}, including pertinent parameters when constructing the DimReduction subclass object. + +If there are no rownames, assume that the matrix is in the same order as the original matrix, assuming that they have the same number of features. +If there are rownames, reorder the matrix to match the order of the original matrix +} +\section{Fields}{ + +\describe{ +\item{\code{cell_embeddings}}{(IterableMatrix, dgCMatrix, matrix) Projected data of shape \verb{(cells x n_dimensions)} of the original matrix after a dimensionality reduction.} + +\item{\code{fitted_params}}{(list) A list of parameters used for the transformation of a matrix. This should include all necessary information to project new data with the same features.} + +\item{\code{feature_names}}{(character) The names of the features that this DimReduction object was fit on. Matrices to be projected should have the same feature names.} +}} + diff --git a/r/man/IterativeLSI.Rd b/r/man/IterativeLSI.Rd new file mode 100644 index 00000000..0071ce71 --- /dev/null +++ b/r/man/IterativeLSI.Rd @@ -0,0 +1,159 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{IterativeLSI} +\alias{IterativeLSI} +\alias{project.IterativeLSI} +\title{Run Iterative LSI on a matrix.} +\usage{ +IterativeLSI( + mat, + n_iterations = 2, + feature_selection_method = select_features_variance, + scale_factor = 10000, + n_dimensions = 50L, + corr_cutoff = 1, + cluster_method = cluster_cells_graph, + threads = 1L, + verbose = FALSE +) + +\method{project}{IterativeLSI}(x, mat, iteration = x$fitted_params$iterations, threads = 1L, ...) 
+} +\arguments{ +\item{mat}{(IterableMatrix) Counts matrix of shape \verb{(features x cells)}.} + +\item{n_iterations}{(int) The number of LSI iterations to perform.} + +\item{feature_selection_method}{(function) Method to use for selecting features for LSI. +Current builtin options are \code{select_features_variance}, \code{select_features_dispersion}, \code{select_features_mean}, \code{select_features_binned_dispersion}} + +\item{scale_factor}{(numeric) Scaling factor to multiply matrix by during tf-idf normalization.} + +\item{n_dimensions}{(integer) Number of dimensions to keep during PCA.} + +\item{corr_cutoff}{(numeric) Numeric filter for the correlation of a PC to the sequencing depth. If the PC has a correlation that is greater or equal to +the corr_cutoff, it will be excluded from the final PCA matrix.} + +\item{cluster_method}{(function) Method to use for clustering the post-SVD matrix. +The user can pass in partial parameters to the cluster method, such as by passing +\code{cluster_cells_graph(mat, graph_to_cluster_method = cluster_graph_louvain(resolution = 0.5))} into \code{cluster_method}.} + +\item{threads}{(integer) Number of threads to use. Also gets passed down into \code{feature_selection_method} and \code{cluster_method}} + +\item{x}{DimReduction object.} + +\item{iteration}{(integer) Which iteration of \code{IterativeLSI}'s features and loadings to use for projection.} +} +\value{ +\code{IterativeLSI()} An object of class \code{c("IterativeLSI", "DimReduction")} with the following attributes: +\itemize{ +\item \code{cell_embeddings}: The projected data as a matrix of shape \verb{(cells, n_dimensions)} +\item \code{feature_names}: The names of features that \code{IterativeLSI()} was fit on. +Note that projection only requires the features used in each specific iteration (as described in \code{iter_info}) +\item \code{fitted_params}: A list of the parameters used for iterative LSI. 
Includes the following: +\itemize{ +\item \code{lsi_method}: The method used for LSI +\item \code{feature_selection_method}: The method used for selecting features +\item \code{cluster_method}: The method used for clustering +\item \code{feature_means}: The means of the features used for tf-idf normalization +\item \code{iterations}: The number of LSI iterations ran +\item \code{iter_info}: A tibble of iteration info with rows as iterations. Columns include the following: +\itemize{ +\item \code{iteration}: The iteration number +\item \code{feature_names}: The names of the features used for the iteration +\item \code{feature_loadings}: SVD component \code{u} with dimension \verb{(n_dimensions, n_variable_features)} +\item \code{pcs_to_keep}: The PCs that were kept after filtering by correlation to sequencing depth +\item \code{clusters}: The clusters for the iteration. This is blank for the first iteration +} +} +} + +\code{project()} Matrix of the projected data of shape \verb{(cells, n_dimensions)}. +} +\description{ +Given a \verb{(features x cells)} counts matrix, perform IterativeLSI to create a latent space representation of the matrix of shape \verb{(n_dimensions, ncol(mat))}. +This uses the method described in \href{https://doi.org/10.1038/s41588-021-00790-6}{ArchR} (Granja et al; 2019). +See details for more specific information. Returns a DimReduction object, which allows for projection of matrices with the same features into the same latent space. 
+} +\details{ +The Iterative LSI method is as follows: +\itemize{ +\item First iteration: +\itemize{ +\item Select features using \code{feature_selection_method} +\item Perform LSI on the selected features +\item If \code{n_iterations} is 1, return the projected data from the first PCA projection +\item Else, cluster the LSI results using \code{cluster_method} +} +\item For each subsequent iteration: +\itemize{ +\item Pseudobulk the clusters from the previous iteration and select the top features based on the pseudobulked clusters +\item Perform LSI on the selected features +\item If this is the final iteration, return the projected data from this PCA projection +\item Else, cluster the LSI results using \code{cluster_method} +} +} + +There are some minor differences when compared to the ArchR implementation: +\itemize{ +\item ArchR binarizes data prior to feature selection. To replicate this, the user can pass \code{select_features_variance(normalize=binarize)} for their \code{feature_selection_method}. +\item \code{IterativeLSI()} currently does not support utilization of different feature selection methods across each iteration. +If one desires to use a different feature selection method for each iteration, they can take the cluster assignments from the previous +iteration and use them to select features and run LSI. +\item ArchR uses a default of 25000 features picked during feature selection. As the number of input features is dependent on the input matrix fed into \code{IterativeLSI()}, +the default for \code{select_features_variance()} instead picks the number of variable features as a proportion of the total features provided. To mimic the ArchR implementation, +\code{feature_selection_method} can be set to \code{select_features_variance(num_feats = 25000)}. +\item ArchR calculates LSI during non-terminal iterations using a default subset of 10000 cells. 
ArchR does this to prevent a memory bottleneck, +which BPCells does not encounter even with a non-subsetted matrix. Therefore, IterativeLSI will run LSI on the entire matrix for each iteration. +\item ArchR defaults on using Seurat clustering for default, which utilizes the Louvain algorithm (See \code{Seurat::FindClusters()}). In constrast, \code{IterativeLSI()} utilizes +leiden, which should provide the same clustering results while being faster. +\item ArchR also plots and calculates a umap of every iteration's dimensionality reduction. While this is not implemented in \code{IterativeLSI()}, +one can use the \code{project()} method with the \code{iteration} argument set to the desired iteration to get projected data. This can then be fed into \code{uwot::umap()} +\item ArchR filters out PCs with a correlation to sequencing depth greater than 0.75. +While corr_cutoff is provided as an argument in \code{IterativeLSI()}, it is set to not removing any PCs by default. +\item ArchR filters out outliers dependent on number of accesible regions of cells, by the bottom and top quantiles. This is not implemented in \code{IterativeLSI()}, +but can be done as a preprocessing step. 
+} +} +\examples{ +## Prep data +nrows <- 350 +ncols <- 2000 +mat <- matrix(1:(nrows*ncols), nrow = nrows) \%>\% as("IterableMatrix") +rownames(mat) <- paste0("feat", seq(nrows)) +colnames(mat) <- paste0("cell", seq(ncols)) + + +####################################################################### +## IterativeLSI() examples +####################################################################### +dim_reduction <- IterativeLSI(mat, n_dimensions = 5) + +## Can customize parameters using partialization +dim_reduction <- IterativeLSI( + mat, + n_dimensions = 10, + feature_selection_method = select_features_variance( + num_feats = 0.5, + normalize_method = normalize_tfidf(scale_factor = 5000) + ), + cluster_method = cluster_cells_graph( + graph_to_cluster_method = cluster_graph_louvain(resolution = 0.5), + knn_to_graph_method = knn_to_snn_graph + ) +) +dim_reduction + + +####################################################################### +## project() example +####################################################################### +dim(project(dim_reduction, mat)) + + +} +\seealso{ +\code{LSI()} \code{DimReduction()} \code{svds()} +\code{cluster_cells_graph()} \code{select_features_variance()} \code{select_features_dispersion()} +\code{select_features_mean()} \code{select_features_binned_dispersion()} +} diff --git a/r/man/LSI.Rd b/r/man/LSI.Rd new file mode 100644 index 00000000..bb28f17f --- /dev/null +++ b/r/man/LSI.Rd @@ -0,0 +1,86 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{LSI} +\alias{LSI} +\alias{project.LSI} +\title{Perform latent semantic indexing (LSI) on a matrix.} +\usage{ +LSI( + mat, + n_dimensions = 50L, + corr_cutoff = 1, + scale_factor = 10000, + threads = 1L, + verbose = FALSE +) + +\method{project}{LSI}(x, mat, threads = 1L, ...) 
+} +\arguments{ +\item{mat}{(IterableMatrix) Counts matrix of shape \verb{(features x cells)}.} + +\item{n_dimensions}{(integer) Number of dimensions to keep during PCA.} + +\item{corr_cutoff}{(numeric) Numeric filter for the correlation of a PC to the sequencing depth. If the PC has a correlation that is greater or equal to +the corr_cutoff, it will be excluded from the final PCA matrix.} + +\item{scale_factor}{(numeric) Scaling factor to multiply matrix by during tf-idf normalization.} + +\item{threads}{(integer) Number of threads to use.} + +\item{x}{DimReduction object.} +} +\value{ +\code{LSI()} An object of class \code{c("LSI", "DimReduction")} with the following attributes: +\itemize{ +\item \code{cell_embeddings}: The projected data as a matrix of shape \verb{(cells, n_dimensions)} +\item \code{fitted_params}: A named list of the parameters used for LSI, including the following: +\itemize{ +\item \code{scale_factor}: The scale factor used for tf-idf normalization +\item \code{feature_means}: The means of the features used for normalization +\item \code{pcs_to_keep}: The PCs that were kept after filtering by correlation to sequencing depth +\item \code{feature_loadings}: SVD component with dimension \verb{(n_variable_features, n_dimensions)} used to linearly transform input data into cell embeddings +} +\item \code{feature_names}: The names of the features in the matrix +} + +\code{project()} IterableMatrix of the projected data of shape \verb{(n_dimensions, ncol(mat))}. +} +\description{ +Given a \verb{(features x cells)} counts matrix, perform LSI, which sequentially executes tf-idf normalization and PCA to create a latent space representation +of the matrix of shape \verb{(n_dimensions, ncol(mat))}. +Returns a DimReduction object, which allows for projection of new matrices with the same features into the same latent space. +} +\details{ +Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. 
Tf-idf implementation is from Stuart & Butler et al. 2019. + +Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: +\itemize{ +\item 17.1 MB memory usage, 25.1 seconds runtime +} +} +\examples{ +nrows <- 50 +ncols <- 1000 +mat <- matrix(1:(nrows*ncols), nrow = nrows) \%>\% as("IterableMatrix") +rownames(mat) <- paste0("feat", seq(nrows)) +colnames(mat) <- paste0("cell", seq(ncols)) + + +####################################################################### +## LSI() example +####################################################################### +lsi_result <- LSI(mat, n_dimensions = 10) +lsi_result + + +####################################################################### +## project() example +####################################################################### +dim(project(lsi_result, mat)) + + +} +\seealso{ +\code{project()} \code{DimReduction()} \code{normalize_tfidf()} \code{normalize_log()} \code{svds()} +} diff --git a/r/man/binarize.Rd b/r/man/binarize.Rd index c162a8c8..acb9fefb 100644 --- a/r/man/binarize.Rd +++ b/r/man/binarize.Rd @@ -28,3 +28,14 @@ is set to FALSE, element values greater than or equal to the threshold are set to one. As an alternative, the \code{<}, \code{<=}, \code{>}, and \code{>=} operators are also supported. } +\examples{ +set.seed(12345) +mat <- matrix(rpois(4*5, lambda=1), nrow=4, ncol=5) +mat +mat <- as(mat, "IterableMatrix") + + +mat_binarized <- binarize(mat, threshold=1) +mat_binarized +as(mat_binarized, "dgCMatrix") +} diff --git a/r/man/call_macs_peaks.Rd b/r/man/call_macs_peaks.Rd new file mode 100644 index 00000000..5ad425c5 --- /dev/null +++ b/r/man/call_macs_peaks.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/atac_utils.R +\name{call_macs_peaks} +\alias{call_macs_peaks} +\title{Call peaks using MACS2/3} +\usage{ +call_macs_peaks(...) 
+} +\description{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} + +This function has been renamed to \code{call_peaks_macs()} +} +\keyword{internal} diff --git a/r/man/cluster_cells_graph.Rd b/r/man/cluster_cells_graph.Rd new file mode 100644 index 00000000..b99ff5d6 --- /dev/null +++ b/r/man/cluster_cells_graph.Rd @@ -0,0 +1,59 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/clustering.R +\name{cluster_cells_graph} +\alias{cluster_cells_graph} +\title{Cluster embeddings using a KNN-Graph based algorithm} +\usage{ +cluster_cells_graph( + mat, + knn_method = knn_hnsw, + knn_to_graph_method = knn_to_geodesic_graph, + graph_to_cluster_method = cluster_graph_leiden, + threads = 0L, + verbose = FALSE +) +} +\arguments{ +\item{mat}{(matrix) Cell embeddings matrix of shape \verb{(cells x n_embeddings)}} + +\item{knn_method}{(function) Function to convert cell embeddings into a knn object. +Must be a (optionally partialized) version of \code{knn_hnsw()} or \code{knn_annoy()}.} + +\item{knn_to_graph_method}{(function) Function to convert the knn object returned from \code{knn_method} to a graph adjacency matrix. +Must be a (optionally partialized) version of \code{knn_to_graph()}, \code{knn_to_snn_graph()} or \code{knn_to_geodesic_graph()}.} + +\item{graph_to_cluster_method}{(function) Clustering algorithm that converts a graph adjacency matrix +returned from \code{graph_to_cluster_method} into cluster labels for each cell. +Must be a (optionally partialized) version of \code{cluster_graph_leiden()}, \code{cluster_graph_louvain()} or \code{cluster_graph_seurat()}.} + +\item{threads}{(integer) Number of threads to use in \code{knn_method}, \code{knn_to_graph_method} and \code{graph_to_cluster_method}. 
If these functions do not utilize +a \code{threads} argument, this is silently ignored.} + +\item{verbose}{(logical) Whether to print progress information in \code{knn_method}, \code{knn_to_graph_method} and \code{graph_to_cluster_method}. If these functions do not utilize +a \code{verbose} argument, this is silently ignored.} +} +\value{ +(factor) Factor vector containing the cluster assignment for each cell. +} +\description{ +Take in a cell embedding matrix, and sequentially convert it into a kNN object, then to +a graph adjacency matrix. Following, assign a label to every cell using a clustering algorithm. +} +\details{ +\code{cluster_cells_graph()} acts as a helper function to wrap input creation and \code{kNN} graph adjacency-based clustering to be done together. The user +can also manually pass cell embeddings to their preferred knn/clustering functions of choices. + +\strong{Clustering customization through partialized parameters} + +Customization of clustering is possible through partialization of each parameter in \code{cluster_cells_graph()} that is a function. +In detail, each parameter that requests a function +may take in one with only some of the arguments provided. If the first argument is not provided, a copy of a function is utilized that has its parameters +changed with the arguments provided. 
+ +For instance, if the user desires for \code{cluster_cells_graph()} to instead use \code{cluster_graph_louvain()} with resolution different than the default, +they can instead call \code{cluster_cells_graph()} like so: +\code{cluster_cells_graph(mat, graph_to_cluster_method = cluter_graph_louvain(resolution = 0.5))} +} +\seealso{ +\code{knn_hnsw()} \code{knn_annoy()} \code{knn_to_graph()} \code{knn_to_snn_graph()} \code{knn_to_geodesic_graph()} \code{cluster_graph_leiden()} \code{knn_to_snn_graph()} \code{knn_to_geodesic_graph()} +} diff --git a/r/man/cluster.Rd b/r/man/cluster_graph.Rd similarity index 88% rename from r/man/cluster.Rd rename to r/man/cluster_graph.Rd index ce47c1db..0794ca6c 100644 --- a/r/man/cluster.Rd +++ b/r/man/cluster_graph.Rd @@ -7,19 +7,19 @@ \title{Cluster an adjacency matrix} \usage{ cluster_graph_leiden( - snn, + mat, resolution = 1, objective_function = c("modularity", "CPM"), seed = 12531, ... ) -cluster_graph_louvain(snn, resolution = 1, seed = 12531) +cluster_graph_louvain(mat, resolution = 1, seed = 12531) -cluster_graph_seurat(snn, resolution = 0.8, ...) +cluster_graph_seurat(mat, resolution = 0.8, ...) } \arguments{ -\item{snn}{Symmetric adjacency matrix (dgCMatrix) output from e.g. \code{knn_to_snn_graph()} or \code{knn_to_geodesic_graph()}. Only the lower triangle is used} +\item{mat}{Symmetric adjacency matrix (dgCMatrix) output from e.g. \code{knn_to_snn_graph()} or \code{knn_to_geodesic_graph()}. Only the lower triangle is used.} \item{resolution}{Resolution parameter. 
Higher values result in more clusters} diff --git a/r/man/create_partial.Rd b/r/man/create_partial.Rd new file mode 100644 index 00000000..b8bc6939 --- /dev/null +++ b/r/man/create_partial.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{create_partial} +\alias{create_partial} +\title{Helper to create partial functions} +\usage{ +create_partial() +} +\value{ +A \code{bpcells_partial} object (a function with some extra attributes) +} +\description{ +Automatically creates a partial application of the caller +function including all non-missing arguments. +} +\keyword{internal} diff --git a/r/man/feature_selection.Rd b/r/man/feature_selection.Rd new file mode 100644 index 00000000..bfb9ebb3 --- /dev/null +++ b/r/man/feature_selection.Rd @@ -0,0 +1,159 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{select_features_variance} +\alias{select_features_variance} +\alias{select_features_dispersion} +\alias{select_features_mean} +\alias{select_features_binned_dispersion} +\title{Feature selection functions} +\usage{ +select_features_variance( + mat, + num_feats = 0.05, + normalize_method = NULL, + threads = 1L +) + +select_features_dispersion( + mat, + num_feats = 0.05, + normalize_method = NULL, + threads = 1L +) + +select_features_mean( + mat, + num_feats = 0.05, + normalize_method = NULL, + threads = 1L +) + +select_features_binned_dispersion( + mat, + num_feats = 0.05, + n_bins = 20, + threads = 1L +) +} +\arguments{ +\item{mat}{(IterableMatrix) Counts matrix with dimensions \verb{(features x cells)}.} + +\item{num_feats}{(float) Number of features to mark as highly_variable. If 0 < \code{num_feats} < 1, then interpret as a fraction of features.} + +\item{normalize_method}{(function) Used to normalize the matrix prior to feature selection by calling \code{normalize_method(mat)} if it is not \code{NULL}. 
+For example, pass \code{normalize_log()} or \code{normalize_tfidf()}.} + +\item{threads}{(integer) Number of threads to use. Also overrides the threads argument in \code{normalize_method}} + +\item{n_bins}{(integer) Number of bins to split features into in order to control for the relationship between mean expression and dispersion (see details).} +} +\value{ +Return a dataframe with the following columns: +\itemize{ +\item \code{feature}: Feature name. +\item \code{score}: Scoring of the feature, depending on the method used. +\item \code{highly_variable}: Logical vector of whether the feature is highly variable. +} + +Each different feature selection method will have a different scoring method. +Consider a matrix \eqn{X}, where the row index \eqn{i} refers to each feature +and the column index \eqn{j} refers to each cell. For each feature \eqn{x_{i} \in X}, we define the following feature-selection scores: +\itemize{ +\item \code{select_features_variance}: \eqn{\mathrm{Score}(x_i) = \frac{1}{n - 1} \sum_{j=1}^{n} \bigl(x_{ij} - \bar{x}_i\bigr)^2} +} + +\itemize{ +\item \code{select_features_dispersion}: \eqn{\mathrm{Score}(x_i) = \frac{\frac{1}{n - 1} \sum_{j=1}^{n} \bigl(x_{ij} - \bar{x}_i\bigr)^2}{\bar{x}_i}} +} + +\itemize{ +\item \code{select_features_mean}: \eqn{\mathrm{Score}(x_i) = \frac{\sum_{j=1}^{n}\bigl(x_{ij}\bigr)}{n}} +} + +\itemize{ +\item \code{select_features_binned_dispersion}: Process described in \code{details}. +} +} +\description{ +Apply a feature selection method to a non-normalized \verb{(features x cells)} matrix. We recommend using counts matrices as input and to +apply any normalizations prior to feature selection via the normalize argument (if available). +Instead of directly subsetting the input matrix, +an output dataframe is returned, indicating which features are highly variable, and the scoring of each feature. +} +\details{ +\code{select_features_binned_dispersion} implements the approach from Satija et al. 
2015: +\enumerate{ +\item Bin features into equal-width bins by \code{log1p(mean)} +\item Calculate dispersion of each feature as \code{log(variance / mean)} +\item Z-score normalize dispersion within each bin, and select highest normalized dispersion across all bins +} + +If the number of features within a bin is equal to 1, then the mean dispersion for that bin is set to 1. + +This should be equivalent to \code{Seurat::FindVariableFeatures()} with \code{selection.method="mean.var.plot"} +and \code{scanpy.pp.highly_variable_genes()} with \code{flavor="seurat"}. +} +\examples{ +## Prep data +set.seed(12345) +mat <- matrix(rpois(5*8, lambda=1), nrow=5, ncol=8) +rownames(mat) <- paste0("gene", seq_len(nrow(mat))) +mat +mat <- as(mat, "IterableMatrix") + + +####################################################################### +## select_features_variance() examples +####################################################################### +select_features_variance( + mat, + num_feats=2, + normalize_method=normalize_log +) + +# Because of how the BPCells `normalize` functions behave when the matrix +# argument is missing, we can also customize the normalization parameters +# using partial arguments: +variable_features <- select_features_variance( + mat, + num_feats=2, + normalize_method=normalize_log(scale_factor=20) +) +# One can then filter to only variable features using the subset operator: +mat[variable_features$feature[variable_features$highly_variable],] + + +####################################################################### +## select_features_dispersion() example +####################################################################### +select_features_dispersion( + mat, + num_feats = 2, + normalize_method = normalize_log +) + + +####################################################################### +## select_features_mean() example +####################################################################### +select_features_mean( + mat, + num_feats = 1, + 
normalize_method = normalize_log +) + + +####################################################################### +## select_features_binned_dispersion() example +####################################################################### +select_features_binned_dispersion( + mat, + num_feats = 2, + n_bins = 2 +) + + +} +\seealso{ +\code{normalize_tfidf()} \code{normalize_log()} +} diff --git a/r/man/is_adjacency_matrix.Rd b/r/man/is_adjacency_matrix.Rd new file mode 100644 index 00000000..c44ce669 --- /dev/null +++ b/r/man/is_adjacency_matrix.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/clustering.R +\name{is_adjacency_matrix} +\alias{is_adjacency_matrix} +\title{Check if an input is a graph adjacency matrix.} +\usage{ +is_adjacency_matrix(mat) +} +\value{ +TRUE if the mat is a graph adjacency matrix, FALSE otherwise +} +\description{ +Clustering functions like \code{cluster_graph_leiden()} and \code{cluster_graph_louvain()} require a graph adjacency matrix as input. +We assume that any square \code{dgCMatrix} is a graph adjacency matrix. +} +\keyword{internal} diff --git a/r/man/is_knn_object.Rd b/r/man/is_knn_object.Rd new file mode 100644 index 00000000..5713e23f --- /dev/null +++ b/r/man/is_knn_object.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/clustering.R +\name{is_knn_object} +\alias{is_knn_object} +\title{Check if an input is a kNN object} +\usage{ +is_knn_object(mat) +} +\value{ +TRUE if the mat is a knn object, FALSE otherwise +} +\description{ +knn object functions \code{knn_hnsw()} and \code{knn_annoy()} return a list of two matrices, \code{idx} and \code{dist}. +These are used as inputs to create graph adjacency matrices for clustering. +Assume any list with at least both \code{idx} and \code{dist} items is a kNN object. 
+} +\keyword{internal} diff --git a/r/man/knn.Rd b/r/man/knn.Rd index 3efbc9e7..7944fc7a 100644 --- a/r/man/knn.Rd +++ b/r/man/knn.Rd @@ -3,7 +3,7 @@ \name{knn_hnsw} \alias{knn_hnsw} \alias{knn_annoy} -\title{Get a knn matrix from reduced dimensions} +\title{Get a knn object from reduced dimensions} \usage{ knn_hnsw( data, @@ -17,7 +17,7 @@ knn_hnsw( knn_annoy( data, - query = data, + query = NULL, k = 10, metric = c("euclidean", "cosine", "manhattan", "hamming"), n_trees = 50, @@ -38,7 +38,7 @@ knn_annoy( \item{threads}{Number of threads to use. Note that result is non-deterministic if threads > 1} -\item{ef}{ef parameter for RccppHNSW::hnsw_search. Increase for slower search but +\item{ef}{ef parameter for \code{RcppHNSW::hnsw_search()}. Increase for slower search but improved accuracy} \item{n_trees}{Number of trees during index build time. More trees gives higher accuracy} diff --git a/r/man/normalize.Rd b/r/man/normalize.Rd new file mode 100644 index 00000000..c270cb09 --- /dev/null +++ b/r/man/normalize.Rd @@ -0,0 +1,79 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/transforms.R +\name{normalize_log} +\alias{normalize_log} +\alias{normalize_tfidf} +\title{Normalization helper functions} +\usage{ +normalize_log(mat, scale_factor = 10000, threads = 1L) + +normalize_tfidf(mat, feature_means = NULL, scale_factor = 10000, threads = 1L) +} +\arguments{ +\item{mat}{(IterableMatrix) Counts matrix with dimensions \verb{(features x cells)}.} + +\item{scale_factor}{(numeric) Scaling factor to multiply matrix by prior to normalization (see formulas below).} + +\item{threads}{(integer) Number of threads to use.} + +\item{feature_means}{(numeric, optional) Pre-calculated means of the features to normalize by (\code{rowMeans(mat)} by default). +If \code{feature_means} has names and \code{mat} has row names, match values by name. 
+Otherwise, assume \code{feature_means} has the same length and ordering as the matrix rows.} +} +\value{ +Consider a matrix \eqn{X}, where the row index \eqn{i} refers to each feature +and the column index \eqn{j} refers to each cell. For each element \eqn{{x}_{ij} \in X}, the +normalized value \eqn{\tilde{x}_{ij}} is calculated as: +\itemize{ +\item \code{normalize_log}: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)} +} + +\itemize{ +\item \code{normalize_tfidf}: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)} +} +} +\description{ +Apply standard normalizations to a \verb{(features x cells)} counts matrix. +} +\details{ +\strong{Passing to \code{normalize} parameters with non-default arguments} + +If the \code{mat} argument is missing, returns a "partial" function: a copy of the original function but with most arguments pre-defined. +This can be used to customize \code{normalize} parameters in other single cell functions in BPCells (e.g. \code{select_features_mean()}). + +\strong{Related functions from other packages} +\itemize{ +\item \code{normalize_log}: Corresponds to \code{Seurat::NormalizeData()} with its default "LogNormalize" method. +} + +\itemize{ +\item \code{normalize_tfidf}: This follows the formula from Stuart, Butler et al. 2019, matching the default behavior of \code{Signac::RunTFIDF()}. This also matches the normalization used within \code{ArchR::addIterativeLSI()}, but with \code{binarize = FALSE}. 
+} +} +\examples{ +set.seed(12345) +mat <- matrix(rpois(4*5, lambda=1), nrow=4, ncol=5) +mat + +mat <- as(mat, "IterableMatrix") + + +####################################################################### +## normalize_log() examples +####################################################################### +normalize_log(mat) + +## normalization functions can also be called with partial arguments +partial_log <- normalize_log(scale_factor = 1e5) +partial_log + +partial_log(mat) + + +####################################################################### +## normalize_tfidf() example +####################################################################### +normalize_tfidf(mat) + + +} diff --git a/r/man/partial_apply.Rd b/r/man/partial_apply.Rd new file mode 100644 index 00000000..149eb19f --- /dev/null +++ b/r/man/partial_apply.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{partial_apply} +\alias{partial_apply} +\title{Create partial function calls} +\usage{ +partial_apply(f, ..., .overwrite = TRUE, .missing_args_error = TRUE) +} +\arguments{ +\item{f}{A function} + +\item{...}{Named arguments to \code{f}} + +\item{.overwrite}{(bool) If \code{f} is already an output from +\code{partial_apply()}, whether parameter re-definitions should +be ignored or overwrite the existing definitions} + +\item{.missing_args_error}{(bool) If \code{TRUE}, passing in arguments +that are not in the function's signature will raise an error, otherwise +they will be ignored} +} +\value{ +A \code{bpcells_partial} object (a function with some extra attributes) +} +\description{ +Specify some but not all arguments to a function. 
+} +\keyword{internal} diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index 64d16f01..612d817f 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -9,6 +9,11 @@ template: includes: in_header: | + + + + + after_body: | @@ -130,20 +135,24 @@ reference: - IterableMatrix-methods - pseudobulk_matrix -- title: "Reference Annotations" +- title: "Single-cell analysis helpers" +- subtitle: "Normalization and Feature Selection" - contents: - - human_gene_mapping - - match_gene_symbol - - read_gtf - - read_bed - - read_ucsc_chrom_sizes - -- title: "Clustering" + - normalize_log + - select_features_variance +- subtitle: "Clustering" - contents: + - cluster_cells_graph - knn_hnsw - cluster_graph_leiden - knn_to_graph - cluster_membership_matrix +- subtitle: "Dimensionality Reductions" +- contents: + - DimReduction + - project + - LSI + - IterativeLSI - title: "Plots" @@ -176,3 +185,11 @@ reference: - discrete_palette - collect_features - rotate_x_labels + +- title: "Reference Annotations" +- contents: + - human_gene_mapping + - match_gene_symbol + - read_gtf + - read_bed + - read_ucsc_chrom_sizes diff --git a/r/tests/real_data/ArchR_LSI.R b/r/tests/real_data/ArchR_LSI.R new file mode 100644 index 00000000..de775f4f --- /dev/null +++ b/r/tests/real_data/ArchR_LSI.R @@ -0,0 +1,69 @@ +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + +library("BPCells") +library("ArchR") + +# Set up temp dir in case it's not already set +create_temp_dir <- function(dir = NULL) { + if (is.null(dir)) { + dir <- file.path(tempdir(), "lsi_test") + if (dir.exists(dir)) unlink(dir, recursive = TRUE) + dir.create(dir) + } + return(dir) +} + +#' Perform a dimensionality reduction with tf-idf and SVD (LSI) on a matrix on ArchR and BPCells. 
+#' As LSI uses an iterative approach in ArchR, we compare by using a single-iteration private function from ArchR. +#' As the SVD implementation is not necessarily the same between the two packages, we take the SVD matrix +#' from both functions and compare the matrix multiplication of the U and SVD matrices, which should give an approximation +#' we can compare between the two packages. +#' @param dir Optional directory to run the comparison in; a fresh temporary directory is created when NULL.
testthat::expect_true(all.equal(lsi_mat_archr, lsi_bpcells$cell_embeddings, tolerance = 1e-4)) + # also check for correlation between the two matrices in PC space + testthat::expect_true(cor(as.vector(lsi_mat_archr), as.vector(lsi_bpcells$cell_embeddings)) > 0.999) +} +test_lsi_similarity_to_archr() \ No newline at end of file diff --git a/r/tests/real_data/ArchR_insertions.R b/r/tests/real_data/ArchR_insertions.R index 150939d4..8818d7e7 100644 --- a/r/tests/real_data/ArchR_insertions.R +++ b/r/tests/real_data/ArchR_insertions.R @@ -1,5 +1,23 @@ -devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/ArchR/") -devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/BPCells/r/") +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + +devtools::load_all("/mnt/c/Users/Immanuel/PycharmProjects/BPCells/r") +devtools::load_all("/mnt/c/Users/Immanuel/PycharmProjects/ArchR") + +# Set up temp dir in case it's not already set +create_temp_dir <- function(dir = NULL) { + if (is.null(dir)) { + dir <- file.path(tempdir(), "lsi_test") + if (dir.exists(dir)) unlink(dir, recursive = TRUE) + dir.create(dir) + } + return(dir) +} fix_granges_syntax_for_archr <- function(gr) { mcols(gr)$RG <- gsub("PBSmall#", "", mcols(gr)$RG) diff --git a/r/tests/real_data/Scanpy_variable_feat_selection.py b/r/tests/real_data/Scanpy_variable_feat_selection.py new file mode 100644 index 00000000..6ae28c1b --- /dev/null +++ b/r/tests/real_data/Scanpy_variable_feat_selection.py @@ -0,0 +1,46 @@ +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. 
+ +import sys, tempfile, os +import numpy as np +import pandas as pd +import scanpy as sc + + +def call_highly_var_genes_single_batch(temp_dir: str) -> None: + """ + Call highly_variable genes on a single batch of PBMC68k data using their interpretation of + the Seurat implementation. + Write the input anndata object csv at `/highly_var_genes_scanpy_input.csv` + Write the output as a csv, at `/highly_var_genes_scanpy_output.csv` + + Args: + temp_dir (str): Path to the temporary directory to write the input and output files. + """ + # Dataset is only (765, 700) + adata = sc.datasets.pbmc68k_reduced() + adata.var_names_make_unique() + res = sc.pp._highly_variable_genes.highly_variable_genes(adata, + n_top_genes = 50, + n_bins = 20, + flavor = "seurat", + inplace = False, + check_values = False).drop(columns = 'mean_bin') + # remove mean_bin + adata.to_df().to_csv(os.path.join(temp_dir, "highly_var_genes_scanpy_input.csv")) + res.to_csv(os.path.join(temp_dir, "highly_var_genes_scanpy_output.csv")) + + +if __name__ == "__main__": + # We use the first argument as the temporary directory + if len(sys.argv) < 2: + # If no argument is provided, use the current directory + call_highly_var_genes_single_batch(".") + else: + call_highly_var_genes_single_batch(sys.argv[1]) + \ No newline at end of file diff --git a/r/tests/real_data/scanpy_variable_feat_selection.R b/r/tests/real_data/scanpy_variable_feat_selection.R new file mode 100644 index 00000000..d2eb08b2 --- /dev/null +++ b/r/tests/real_data/scanpy_variable_feat_selection.R @@ -0,0 +1,58 @@ +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. 
+ +library("BPCells") + +# Set up temp dir in case it's not already set +create_temp_dir <- function(dir = NULL) { + if (is.null(dir)) { + dir <- file.path(tempdir(), "lsi_test") + if (dir.exists(dir)) unlink(dir, recursive = TRUE) + dir.create(dir) + } + return(dir) +} + +# Compare the feature selection output of BPCells to that of Scanpy. +# Scanpy technically utilizes the Seurat (Satija et al. 2015) method for feature selection, so we should expect similar results of either pkg. +# This function calls a python script that runs Scanpy feature selection on a test dataset, and writes both input/output to `dir`. +# It then reads in the input/output from the python script, calls the BPCells feature selection function, and compares the output to the Scanpy output. +compare_feat_selection_to_scanpy <- function(dir = NULL) { + dir <- create_temp_dir(dir) + + # Call python script + system2("python3", c("Scanpy_variable_feat_selection.py", dir)) + + # read in input csv + input_mat_scanpy <- t(read.csv(file.path(dir, "highly_var_genes_scanpy_input.csv"), row.names = 1)) + output_mat_scanpy <- read.csv(file.path(dir, "highly_var_genes_scanpy_output.csv"), row.names = 1) + # filter output mat to only where "highly_variable" is true + output_mat_scanpy$highly_variable <- as.logical(output_mat_scanpy$highly_variable) + output_mat_scanpy <- output_mat_scanpy[output_mat_scanpy$highly_variable,] %>% + dplyr::arrange(desc(dispersions_norm)) %>% + dplyr::select(-highly_variable) %>% # convert rownames to a column + tibble::rownames_to_column("name") %>% + dplyr::as_tibble() + + # Scanpy undoes a log1p transformation on the input matrix, so we do the same here + input_mat_bpcells <- expm1(input_mat_scanpy) + + output_bpcells <- highly_variable_features( + input_mat_bpcells %>% as("dgCMatrix") %>% as("IterableMatrix"), + num_feats = 50, + n_bins = 20, + save_feat_selection = TRUE + ) + output_mat_bpcells <- output_bpcells$feature_selection + 
expect_true(all.equal(output_mat_bpcells$name, output_mat_scanpy$name)) + expect_true(all.equal(output_mat_bpcells$mean, output_mat_scanpy$means, tolerance = 1e-6)) + expect_true(all.equal(output_mat_bpcells$dispersion, output_mat_scanpy$dispersions, tolerance = 1e-6)) + expect_true(all.equal(output_mat_bpcells$feature_dispersion_norm, output_mat_scanpy$dispersions_norm, tolerance = 1e-6)) +} + +compare_feat_selection_to_scanpy() diff --git a/r/tests/testthat/test-clustering.R b/r/tests/testthat/test-clustering.R index b17723c2..d5a692c0 100644 --- a/r/tests/testthat/test-clustering.R +++ b/r/tests/testthat/test-clustering.R @@ -19,11 +19,10 @@ test_that("C++ SNN calculation works",{ neighbor_sim[i,] <- sample.int(cells, k) } } - + input <- list(idx = neighbor_sim, dist = matrix(runif(cells*k), nrow=cells)) min_val <- 1/15 - snn <- knn_to_snn_graph(list(idx=neighbor_sim), min_val=min_val, self_loops=TRUE) - - mat <- knn_to_graph(list(idx=neighbor_sim), use_weights=FALSE) + snn <- knn_to_snn_graph(input, min_val=min_val, self_loops=TRUE) + mat <- knn_to_graph(input, use_weights=FALSE) mat <- mat %*% t(mat) mat <- mat / (2 * k - mat) @@ -35,7 +34,7 @@ test_that("C++ SNN calculation works",{ snn, as(mat, "dgCMatrix") ) - snn2 <- knn_to_snn_graph(list(idx=neighbor_sim), min_val=min_val, self_loops=FALSE) + snn2 <- knn_to_snn_graph(input, min_val=min_val, self_loops=FALSE) diag(mat) <- 0 mat <- Matrix::drop0(mat) expect_identical( @@ -73,4 +72,23 @@ test_that("igraph clustering doesn't crash", { }) expect_no_condition(cluster_graph_louvain(graph)) +}) + +test_that("cluster_cells_graph works", { + skip_if_not_installed("RcppAnnoy") + skip_if_not_installed("RcppHNSW") + mat <- matrix(sample.int(1000, 10000, replace=TRUE), nrow=1000) + # check with default params + res <- expect_no_error(cluster_cells_graph(mat)) + # check with threads, method partialization + expect_true(class(res) == "factor") + expect_equal(nrow(mat), length(res)) + res_partialized <- expect_no_error( + 
cluster_cells_graph( + mat, knn_method = knn_annoy(k = 9), + knn_to_graph_method = knn_to_snn_graph(min_val = 1/10), + graph_to_cluster_method = cluster_graph_louvain(resolution = 0.8) + )) + expect_true(class(res_partialized) == "factor") + expect_equal(nrow(mat), length(res_partialized)) +}) \ No newline at end of file diff --git a/r/tests/testthat/test-errorChecking.R b/r/tests/testthat/test-errorChecking.R new file mode 100644 index 00000000..355a5d29 --- /dev/null +++ b/r/tests/testthat/test-errorChecking.R @@ -0,0 +1,19 @@ +# Copyright 2025 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + +test_that("assert_is_mat works", { + mat <- matrix(1:4, nrow = 2) + mat_dgc <- as(mat, "dgCMatrix") + mat_iterable <- as(mat, "IterableMatrix") + expect_no_error(assert_is_mat(mat)) + expect_no_error(assert_is_mat(mat_dgc)) + expect_error(assert_is_mat(c(mat_iterable, mat_iterable))) + expect_error(assert_is_mat("a")) + expect_error(assert_is_mat(c("a", "a"))) + expect_error(assert_is_mat(1)) +}) \ No newline at end of file diff --git a/r/tests/testthat/test-matrix_transforms.R index 385605e0..a8655375 100644 --- a/r/tests/testthat/test-matrix_transforms.R +++ b/r/tests/testthat/test-matrix_transforms.R @@ -346,3 +346,61 @@ test_that("linear regression works", { expect_equal(as(m1, "matrix"), ans) expect_equal(as(m1t, "matrix"), ans) }) + +test_that("tf-idf normalization works", { + m <- generate_sparse_matrix(5, 5) + rownames(m) <- paste0("row", seq_len(nrow(m))) + rev_rownames <- rev(rownames(m)) + # Create tf-idf normalization for dgCMatrix + res_dgc <- log1p((diag(1/rowMeans(m)) %*% (m %*% diag(1/colSums(m))) %>% as("dgCMatrix")) * 1e4) + + rownames(res_dgc) <- rownames(m) + m2 <- as(m, "IterableMatrix") + # Check that we can pass in row means as a (named) vector + row_means <- 
matrix_stats(m2, row_stats = c("mean"))$row_stats["mean",] + # Test that row means ordering does not matter as long as names exist + row_means_shuffled <- row_means[sample(1:length(row_means))] + # Test that row means can have an extra element as long as all rownames are in the vector + row_means_plus_one <- c(row_means, row6 = 1) + # Check when row means has no row names + row_means_no_names <- unname(row_means) + + + res <- normalize_tfidf(m2) + expect_equal(res %>% as("dgCMatrix"), res_dgc, tolerance = 1e-6) + res_with_row_means <- normalize_tfidf(m2, feature_means = row_means) + res_with_row_means_partial <- normalize_tfidf(feature_means = row_means)(m2) + expect_equal(res_with_row_means, res_with_row_means_partial) + expect_identical(res, res_with_row_means) + + res_with_shuffled_row_means <- normalize_tfidf(m2, feature_means = row_means_shuffled) + expect_identical(res_with_row_means, res_with_shuffled_row_means, res) + + res_with_row_means_with_extra_element <- normalize_tfidf(m2, feature_means = row_means_plus_one) + expect_identical(res, res_with_row_means_with_extra_element) + + # Check cases where names exists in either row means xor rownames(mat) + # check where both don't have names + res_with_unnamed_row_means <- normalize_tfidf(m2, feature_means = row_means_no_names) + expect_identical(res_with_unnamed_row_means, res) + rownames(m2) <- NULL + res_with_unnamed_row_names <- normalize_tfidf(m2, feature_means = row_means) + res_with_unnamed_row_names_row_means <- normalize_tfidf(m2, feature_means = row_means_no_names) + rownames(res) <- NULL + expect_identical(as(res_with_unnamed_row_names, "dgCMatrix"), as(res, "dgCMatrix")) + expect_identical(as(res_with_unnamed_row_names_row_means, "dgCMatrix"), as(res, "dgCMatrix")) +}) + +test_that("normalize_log works", { + m <- generate_sparse_matrix(5, 5) + res_dgc <- m %*% diag(1/colSums(m)) %>% as("dgCMatrix") + m2 <- as(m, "IterableMatrix") + # Test that default params yield the same as log1p on dgCMatrix + 
res_1 <- as(normalize_log(m2), "dgCMatrix") + expect_equal(res_1, log1p(res_dgc*1e4), tolerance = 1e-6) + + # Test that changing scale factor works + res_2 <- as(normalize_log(m2, scale_factor = 1e5), "dgCMatrix") + res_2_partial <- as(normalize_log(scale_factor = 1e5)(m2), "dgCMatrix") + expect_equal(res_2, log1p(res_dgc*1e5), tolerance = 1e-6) +}) \ No newline at end of file diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index 85ad0be3..aa941d3c 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -1,4 +1,4 @@ -# Copyright 2023 BPCells contributors +# Copyright 2025 BPCells contributors # # Licensed under the Apache License, Version 2.0 or the MIT license @@ -13,6 +13,39 @@ generate_sparse_matrix <- function(nrow, ncol, fraction_nonzero = 0.5, max_val = as(m, "dgCMatrix") } +generate_dense_matrix <- function(nrow, ncol) { + m <- matrix(runif(nrow * ncol), nrow = nrow) +} + +generate_dense_matrix <- function(nrow, ncol) { + m <- matrix(runif(nrow * ncol), nrow = nrow) +} + +test_that("select_features works general case", { + m1 <- generate_sparse_matrix(100, 50) %>% as("IterableMatrix") + for (fn in c("select_features_variance", "select_features_dispersion", "select_features_mean")) { + res <- do.call(fn, list(m1, num_feats = 5)) + expect_equal(nrow(res), nrow(m1)) # Check that dataframe has correct features we're expecting + expect_equal(sum(res$highly_variable), 5) # Only 10 features marked as highly variable + expect_setequal(res$feature, rownames(m1)) + res_more_feats_than_rows <- suppressWarnings(do.call(fn, list(m1, num_feats = 10000))) # more features than rows + res_feats_equal_rows <- do.call(fn, list(m1, num_feats = 100)) + res_feats_partial <- get(fn)(num_feats = 100)(m1) + expect_identical(res_feats_equal_rows, res_feats_partial) + expect_identical(res_more_feats_than_rows, res_feats_equal_rows) + if (fn == "select_features_variance") { + # Check that 
normalization actually does something + res_no_norm <- do.call(fn, list(m1, num_feats = 10, normalize_method = NULL)) + # Check that we can do partial functions on normalization too + res_norm_partial <- do.call(fn, list(m1, num_feats = 10, normalize_method = normalize_log(scale = 1e3, threads = 1L))) + res_norm_implicit_partial <- select_features_variance(normalize_method = normalize_log(scale_factor = 1e3), num_feats = 10)(m1) + expect_identical(res_norm_partial, res_norm_implicit_partial) + expect_true(!all((res_no_norm %>% dplyr::arrange(feature))$score == (res_norm_partial %>% dplyr::arrange(feature))$score)) + } + } +}) + + test_that("Wilcoxon rank sum works (small)", { x <- c(0.80, 0.83, 1.89, 1.04, 1.45, 1.38, 1.91, 1.64, 0.73, 1.46) y <- c(1.15, 0.88, 0.90, 0.74, 1.21) @@ -161,4 +194,84 @@ test_that("Pseudobulk aggregation works with multiple return types", { } } } +}) + + + +test_that("Feature selection by bin variance works", { + mat <- generate_sparse_matrix(500, 26, fraction_nonzero = 0.1) %>% as("IterableMatrix") + # Test only that outputs are reasonable. 
There is a full comparison in `tests/real_data/` that compares implementation to Seurat + res_table <- select_features_binned_dispersion(mat, num_feats = 10, n_bins = 5, threads = 1) + res_table_t <- select_features_binned_dispersion(t(mat), num_feats = 10, n_bins = 5, threads = 1) + res_feats <- res_table %>% dplyr::filter(highly_variable) %>% dplyr::pull(feature) + res <- mat[res_feats,] + res_feats_t <- res_table_t %>% dplyr::filter(highly_variable) %>% dplyr::pull(feature) + res_t <- t(mat[,res_feats_t]) + + expect_equal(nrow(res), 10) + expect_equal(ncol(res), 26) + expect_equal(nrow(res_t), 10) + expect_equal(ncol(res_t), 500) +}) + +test_that("LSI works", { + set.seed(12345) + mat <- matrix(runif(240), nrow=10) %>% as("dgCMatrix") %>% as("IterableMatrix") + rownames(mat) <- paste0("feat", seq_len(nrow(mat))) + colnames(mat) <- paste0("cell", seq_len(ncol(mat))) + # Test only that outputs are reasonable. There is a full comparison in `tests/real_data/` that compares implementation to ArchR + n_dimensions <- 5 + lsi_res_obj <- LSI(mat, n_dimensions = n_dimensions) + lsi_res_t_obj <- LSI(t(mat), n_dimensions = n_dimensions) + lsi_res <- lsi_res_obj$cell_embeddings + lsi_res_t <- lsi_res_t_obj$cell_embeddings + # Check that projection results in the same output if used on the same input matrix + lsi_res_proj <- project(lsi_res_obj, mat) + # Check setting pca correlations to non-1 value + lsi_res_obj_corr <- LSI(mat, n_dimensions = n_dimensions, corr_cutoff = 0.2) + + expect_equal(ncol(lsi_res), 5) + expect_equal(nrow(lsi_res), ncol(mat)) + expect_equal(ncol(lsi_res_t), 5) + expect_equal(nrow(lsi_res_t), nrow(mat)) + expect_equal(nrow(lsi_res_proj), ncol(mat)) + expect_lt(ncol(lsi_res_obj_corr$cell_embeddings), n_dimensions) + expect_equal(lsi_res, lsi_res_proj, tolerance = 1e-7) +}) + +test_that("Iterative LSI works", { + skip_if_not_installed("RcppHNSW") + mat <- matrix(data = runif(50000, 0, 1), nrow=500, ncol = 100) %>% as("dgCMatrix") %>% 
as("IterableMatrix") + rownames(mat) <- paste0("feat", seq_len(nrow(mat))) + colnames(mat) <- paste0("cell", seq_len(ncol(mat))) + lsi_res_obj <- expect_no_error(IterativeLSI(mat, n_iterations = 2, n_dimensions = 10L, cluster_method = cluster_cells_graph(knn_method = knn_annoy))) + lsi_res_proj <- project(lsi_res_obj, mat) + lsi_res_proj_iter_1 <- expect_no_error(project(lsi_res_obj, mat, iteration = 1L)) + lsi_res_embedding <- lsi_res_obj$cell_embeddings + expect_equal(nrow(lsi_res_embedding), ncol(mat)) + expect_equal(ncol(lsi_res_embedding), 10) + expect_equal(nrow(lsi_res_proj_iter_1), ncol(mat)) + expect_equal(ncol(lsi_res_proj_iter_1), 10) + expect_equal(lsi_res_embedding, lsi_res_proj, tolerance = 1e-7) +}) + +test_that("Iterative LSI works with parameterized clustering", { + skip_if_not_installed("RcppAnnoy") + mat <- matrix(data = runif(50000, 0, 1), nrow=500, ncol = 100) %>% as("dgCMatrix") %>% as("IterableMatrix") + rownames(mat) <- paste0("feat", seq_len(nrow(mat))) + colnames(mat) <- paste0("cell", seq_len(ncol(mat))) + lsi_res_obj <- expect_no_error( + IterativeLSI( + mat, n_dimensions = 10L, + cluster_method = cluster_cells_graph( + knn_method = knn_annoy(k = 12), + knn_to_graph_method = knn_to_snn_graph(min_val = 0.1) + ) + ) + ) + lsi_res_proj <- project(lsi_res_obj, mat) + lsi_res_embedding <- lsi_res_obj$cell_embeddings + expect_equal(nrow(lsi_res_embedding), ncol(mat)) + expect_equal(ncol(lsi_res_embedding), 10) + expect_equal(lsi_res_embedding, lsi_res_proj, tolerance = 1e-7) })