From d28c0e9e615008e347684a33407597a90bfa50dc Mon Sep 17 00:00:00 2001 From: Immanuel Abdi Date: Mon, 28 Jul 2025 13:43:20 -0700 Subject: [PATCH 1/3] [r] add clustering + partialized function changes --- r/DESCRIPTION | 4 +- r/NAMESPACE | 1 + r/NEWS.md | 6 + r/R/clustering.R | 184 +++++++++++++++++++++---- r/R/errorChecking.R | 7 + r/R/utils.R | 78 ++++++++++- r/man/cluster_cells_graph.Rd | 78 +++++++++++ r/man/{cluster.Rd => cluster_graph.Rd} | 8 +- r/man/create_partial.Rd | 16 +++ r/man/is_adjacency_matrix.Rd | 16 +++ r/man/is_knn_object.Rd | 17 +++ r/man/knn.Rd | 20 ++- r/man/partial_apply.Rd | 28 ++++ r/pkgdown/_pkgdown.yml | 1 + r/tests/testthat/test-clustering.R | 30 +++- r/tests/testthat/test-errorChecking.R | 19 +++ 16 files changed, 467 insertions(+), 46 deletions(-) create mode 100644 r/man/cluster_cells_graph.Rd rename r/man/{cluster.Rd => cluster_graph.Rd} (88%) create mode 100644 r/man/create_partial.Rd create mode 100644 r/man/is_adjacency_matrix.Rd create mode 100644 r/man/is_knn_object.Rd create mode 100644 r/man/partial_apply.Rd create mode 100644 r/tests/testthat/test-errorChecking.R diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 7fada48d..3dd6ad3f 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -49,7 +49,9 @@ Suggests: IRanges, GenomicRanges, matrixStats, - igraph + igraph, + RcppHNSW, + RcppAnnoy Depends: R (>= 4.0.0) Config/Needs/website: pkgdown, devtools, uwot, irlba, RcppHNSW, igraph, BiocManager, bioc::BSgenome.Hsapiens.UCSC.hg38, github::GreenleafLab/motifmatchr, github::GreenleafLab/chromVARmotifs, png, magrittr diff --git a/r/NAMESPACE b/r/NAMESPACE index 29a7cdcc..a7f473cf 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -21,6 +21,7 @@ export(canonical_gene_symbol) export(cellNames) export(checksum) export(chrNames) +export(cluster_cells_graph) export(cluster_graph_leiden) export(cluster_graph_louvain) export(cluster_graph_seurat) diff --git a/r/NEWS.md b/r/NEWS.md index 4c9e6988..f5998f88 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -1,3 +1,9 @@ +# BPCells 0.4.0 (in-progress main branch) + +## Breaking changes +- Change first parameter name of `cluster_graph_leiden()`, `cluster_graph_louvain()` and `cluster_graph_seurat()` from `snn` to `mat` to more accurately reflect the input type. (pull request #189) + + # BPCells 0.3.1 (7/21/2025) The BPCells 0.3.1 release covers 7 months of changes and 40 commits from 5 contributors. Notable changes include writing matrices in AnnData's dense format, diff --git a/r/R/clustering.R b/r/R/clustering.R index b53a4edf..9b1dd58d 100644 --- a/r/R/clustering.R +++ b/r/R/clustering.R @@ -6,6 +6,103 @@ # option. This file may not be copied, modified, or distributed # except according to those terms. + +#' Check if an input is a kNN object +#' +#' knn object functions `knn_hnsw()` and `knn_annoy()` return a list of two matrices, `idx` and `dist`. +#' These are used as inputs to create graph adjacency matrices for clustering. +#' Assume any list with at least both `idx` and `dist` items is a kNN object. +#' @return TRUE if the mat is a knn object, FALSE otherwise +#' @keywords internal +is_knn_object <- function(mat) { + return(is(mat, "list") && all(c("idx", "dist") %in% names(mat))) +} + +#' Check if an input is a graph adjacency matrix. +#' +#' Clustering functions like `cluster_graph_leiden()` and `cluster_graph_louvain()` require a graph adjacency matrix as input. +#' We assume that any square `dgCMatrix` is a graph adjacency matrix. +#' @return TRUE if the mat is a graph adjacency matrix, FALSE otherwise +#' @keywords internal +is_adjacency_matrix <- function(mat) { + return(is(mat, "dgCMatrix") && nrow(mat) == ncol(mat)) +} + +#' Cluster cell embeddings using a KNN graph-based algorithm +#' +#' Take in a cell embedding matrix, then find k nearest neighbors (KNN) for each cell, convert the +#' KNN into a graph (adjacency matrix), then run a graph-based clustering algorithm. Each of these +#' steps can be customized by passing a function that performs the step (see details). +#' +#' @param mat (matrix) Cell embeddings matrix of shape `(cells x n_embeddings)` +#' @param knn_method (function) Function to that takes an embedding matrix as the first argument and returns a k nearest neighbors (KNN) object. +#' For example, `knn_hnsw()`, `knn_annoy()`, or a parameterized version (see Details). +#' @param knn_to_graph_method (function) Function that takes a KNN object and returns a graph as an undirected graph (lower-triangular dgCMatrix adjacency matrix). +#' For example, `knn_to_graph()`, `knn_to_snn_graph()`, `knn_to_geodesic_graph()`, or a parameterized version (see Details). +#' @param cluster_graph_method (function) Function that takes an undirected graph of cell similarity and returns a factor with cluster assignments for each cell. +#' For example, `cluster_graph_leiden()`, `cluster_graph_louvain()`, `cluster_graph_seurat()`, or a parameterized version (see Details). +#' @param threads (integer) Number of threads to use in `knn_method`, `knn_to_graph_method` and `cluster_graph_method`. If these functions do not utilize +#' a `threads` argument, this is silently ignored. +#' @param verbose (logical) Whether to print progress information in `knn_method`, `knn_to_graph_method` and `cluster_graph_method`. If these functions do not utilize +#' a `verbose` argument, this is silently ignored. +#' @returns (factor) Factor vector containing the cluster assignment for each cell. +#' @details +#' +#' **Customizing clustering steps** +#' +#' All of the BPCells functions named like `knn_*`, `knn_to_graph_*`, and `cluster_graph_*` support customizing parameters +#' via partial function application. For example, look for 20 neighbors during the k nearest neighbors search, setting +#' `knn_method=knn_hnsw(k=20)` is a convenient shortcut for `knn_method=function(x) knn_hnsw(x, k=20)`. Similarly, lowering the +#' default clustering resolution can be done as `cluster_graph_method=cluster_graph_louvain(resolution=0.5)`. This works because +#' all these functions are written to return a partially parameterized copy of themselves as a function object when their +#' first argument is missing. +#' +#' For even more advanced customization, users can manually call the `knn`, `knn_to_graph`, and `cluster_graph` methods rather +#' than using `cluster_cells_graph()` as a convenient wrapper. +#' +#' **Implementing custom clustering steps** +#' +#' The required interfaces for each step are as follows: +#' +#' **knn_method**: First argument is a matrix of cell embeddings, shape `(cells x n_embeddings)`. +#' Returns a named list of two matrices of dimension (cells x k): +#' - `idx`: Neighbor indices, where `idx[c, n]` is the index of the +#' nth nearest neighbor to cell `c`. +#' - `dist`: Neighbor distances, where `dist[c, n]` is the distance between +#' cell `c` and its nth nearest neighbor. +#' Self-neighbors are allowed, so with sufficient search effort `idx[c,1] == c` for nearly all cells. +#' +#' **knn_to_graph_method**: First argument is a KNN object as returned by `knn_method`. +#' Returns a weighted similarity graph as a lower triangular sparse adjacency matrix (`dgCMatrix`). +#' For cells `i` and `j`, their similarity score is in `adjacency_mat[max(i,j), min(i,j)]`. +#' +#' **cluster_graph_method**: First argument is a weighted similarity graph as returned by `knn_to_graph_method`. +#' Returns a factor vector of length `cells` with a cluster assignment for each cell. +#' +#' @seealso `knn_hnsw()` `knn_annoy()` `knn_to_graph()` `knn_to_snn_graph()` `knn_to_geodesic_graph()` `cluster_graph_leiden()` `cluster_graph_louvain()` `cluster_graph_seurat()` +#' @export +cluster_cells_graph <- function( + mat, knn_method = knn_hnsw, + knn_to_graph_method = knn_to_geodesic_graph, + cluster_graph_method = cluster_graph_leiden, + threads = 0L, verbose = FALSE +) { + assert_is_wholenumber(threads) + assert_is(verbose, "logical") + if (rlang::is_missing(mat)) return(create_partial()) + assert_is(mat, "matrix") + # There currently aren't any `knn_to_graph` functions that utilize a verbose argument. + # However, we still pass `verbose` in case future functions do provide this functionality. + mat <- partial_apply(knn_method, threads = threads, verbose = verbose, .missing_args_error = FALSE)(mat) + if (!is_knn_object(mat)) pretty_error(mat, "`knn_method` was unable to convert `mat` into a knn object", 1) + # Return type has to be constrained to "matrix", so this is silently provided. + mat <- partial_apply(knn_to_graph_method, threads = threads, verbose = verbose, return_type = "matrix", .missing_args_error = FALSE)(mat) + if (!is_adjacency_matrix(mat)) pretty_error(mat, "`knn_to_graph_method` was unable to convert `mat` from a knn object to a graph adjacency matrix", 1) + # Also pass verbose and threads to clustering functions in case they are given these params in the future + mat <- partial_apply(cluster_graph_method, threads = threads, verbose = verbose, .missing_args_error = FALSE)(mat) + return(mat) +} + #' K Nearest Neighbor (KNN) Graph #' #' Convert a KNN object (e.g. returned by `knn_hnsw()` or `knn_annoy()`) into @@ -22,6 +119,11 @@ #' Sparse matrix (dgCMatrix) where `mat[i,j]` = distance from cell `i` to #' cell `j`, or 0 if cell `j` is not in the K nearest neighbors of `i` knn_to_graph <- function(knn, use_weights = FALSE, self_loops = TRUE) { + assert_is(use_weights, "logical") + assert_is(self_loops, "logical") + + if (rlang::is_missing(knn)) return(create_partial()) + assert_true(is_knn_object(knn)) if (use_weights) { weights <- knn$dist } else { @@ -41,7 +143,6 @@ knn_to_graph <- function(knn, use_weights = FALSE, self_loops = TRUE) { mat } - #' @rdname knn_graph #' @details **knn_to_snn_graph** #' Convert a knn object into a shared nearest neighbors adjacency matrix. @@ -63,6 +164,9 @@ knn_to_graph <- function(knn, use_weights = FALSE, self_loops = TRUE) { #' @export knn_to_snn_graph <- function(knn, min_val = 1 / 15, self_loops = FALSE, return_type=c("matrix", "list")) { return_type <- match.arg(return_type) + assert_is(self_loops, "logical") + if (rlang::is_missing(knn)) return(create_partial()) + assert_true(is_knn_object(knn)) # Solve x / (2*K - x) >= min_val --> x >= 2*K*min_val / (1 + min_val) min_int <- ceiling(2*min_val*ncol(knn$idx) / (1 + min_val)) snn <- build_snn_graph_cpp(knn$idx, min_neighbors = min_int) @@ -83,13 +187,15 @@ knn_to_snn_graph <- function(knn, min_val = 1 / 15, self_loops = FALSE, return_t } # Return as a sparse matrix - Matrix::sparseMatrix( + res <- Matrix::sparseMatrix( i = snn$i + 1L, j = snn$j + 1L, x = snn$weight, dims = c(snn$dim, snn$dim) ) + return(res) } #' @rdname knn_graph +#' @param threads Number of threads to use during calculations #' @details **knn_to_geodesic_graph** #' Convert a knn object into an undirected weighted graph, using the same #' geodesic distance estimation method as the UMAP package. @@ -101,7 +207,6 @@ knn_to_snn_graph <- function(knn, min_val = 1 / 15, self_loops = FALSE, return_t #' neighbor, results may differ slightly from `umap._umap.fuzzy_simplicial_set`, which #' assumes self is always successfully found in the approximate nearest neighbor search. #' -#' @param threads Number of threads to use during calculations #' @return **knn_to_geodesic_graph** #' - `return_type == "matrix"`: #' Sparse matrix (dgCMatrix) where `mat[i,j]` = normalized similarity between cell `i` and cell `j`. @@ -111,29 +216,31 @@ knn_to_snn_graph <- function(knn, min_val = 1 / 15, self_loops = FALSE, return_t #' These correspond to the rows, cols, and values of non-zero entries in the lower triangle #' adjacency matrix. `dim` is the total number of vertices (cells) in the graph #' @export -knn_to_geodesic_graph <- function(knn, return_type=c("matrix", "list"), threads=0L) { +knn_to_geodesic_graph <- function(knn, return_type = c("matrix", "list"), threads = 0L) { return_type <- match.arg(return_type) + assert_is_wholenumber(threads) + if (rlang::is_missing(knn)) return(create_partial()) + assert_true(is_knn_object(knn)) graph <- build_umap_graph_cpp(knn$dist, knn$idx, threads=threads) graph$dim <- nrow(knn$idx) - if (return_type == "list") { - return(graph) - } + if (return_type == "list") return(graph) # Return as a sparse matrix - Matrix::sparseMatrix( + res <- Matrix::sparseMatrix( i = graph$i + 1L, j = graph$j + 1L, x = graph$weight, dims = c(graph$dim, graph$dim) ) + return(res) } #' Cluster an adjacency matrix -#' @rdname cluster +#' @rdname cluster_graph #' @details **cluster_graph_leiden**: Leiden clustering algorithm `igraph::cluster_leiden()`. #' Note that when using `objective_function = "CPM"` the number of clusters empirically scales with `cells * resolution`, #' so 1e-3 is a good resolution for 10k cells, but 1M cells is better with a 1e-5 resolution. A resolution of 1 is a #' good default when `objective_function = "modularity"` per the default. -#' @param snn Symmetric adjacency matrix (dgCMatrix) output from e.g. `knn_to_snn_graph()` or `knn_to_geodesic_graph()`. Only the lower triangle is used +#' @param mat Symmetric adjacency matrix (dgCMatrix) output from e.g. `knn_to_snn_graph()` or `knn_to_geodesic_graph()`. Only the lower triangle is used. #' @param resolution Resolution parameter. Higher values result in more clusters #' @param objective_function Graph statistic to optimize during clustering. Modularity is the default as it keeps resolution independent of dataset size (see details below). #' For the meaning of each option, see `igraph::cluster_leiden()`. @@ -141,44 +248,55 @@ knn_to_geodesic_graph <- function(knn, return_type=c("matrix", "list"), threads= #' @param ... Additional arguments to underlying clustering function #' @return Factor vector containing the cluster assignment for each cell. #' @export -cluster_graph_leiden <- function(snn, resolution = 1, objective_function = c("modularity", "CPM"), seed = 12531, ...) { +cluster_graph_leiden <- function( + mat, resolution = 1, objective_function = c("modularity", "CPM"), + seed = 12531, ... +) { assert_has_package("igraph") # Set seed without permanently changing seed state + if (rlang::is_missing(mat)) return(create_partial()) prev_seed <- get_seed() on.exit(restore_seed(prev_seed), add = TRUE) set.seed(seed) objective_function <- match.arg(objective_function) - igraph::graph_from_adjacency_matrix(snn, weighted = TRUE, diag = FALSE, mode = "lower") %>% + igraph::graph_from_adjacency_matrix(mat, weighted = TRUE, diag = FALSE, mode = "lower") %>% igraph::cluster_leiden(resolution_parameter = resolution, objective_function=objective_function, ...) %>% igraph::membership() %>% as.factor() } -#' @rdname cluster +#' @rdname cluster_graph #' @details **cluster_graph_louvain**: Louvain graph clustering algorithm `igraph::cluster_louvain()` #' @export -cluster_graph_louvain <- function(snn, resolution = 1, seed = 12531) { +cluster_graph_louvain <- function( + mat, resolution = 1, seed = 12531 +) { assert_has_package("igraph") # Set seed without permanently changing seed state + if (rlang::is_missing(mat)) return(create_partial()) + prev_seed <- get_seed() on.exit(restore_seed(prev_seed), add = TRUE) set.seed(seed) - igraph::graph_from_adjacency_matrix(snn, weighted = TRUE, diag = FALSE, mode = "lower") %>% + igraph::graph_from_adjacency_matrix(mat, weighted = TRUE, diag = FALSE, mode = "lower") %>% igraph::cluster_louvain(resolution = resolution) %>% igraph::membership() %>% as.factor() } -#' @rdname cluster +#' @rdname cluster_graph #' @details **cluster_graph_seurat**: Seurat's clustering algorithm `Seurat::FindClusters()` #' @export -cluster_graph_seurat <- function(snn, resolution = 0.8, ...) { +cluster_graph_seurat <- function( + mat, resolution = 0.8, ... +) { assert_has_package("Seurat") - Seurat::as.Graph(snn) %>% + if (rlang::is_missing(mat)) return(create_partial()) + Seurat::as.Graph(mat) %>% Seurat::FindClusters(resolution = resolution, ...) %>% .[[1]] } @@ -213,7 +331,7 @@ cluster_membership_matrix <- function(groups, group_order = NULL) { } -#' Get a knn matrix from reduced dimensions +#' Get a knn object from reduced dimensions #' #' Search for approximate nearest neighbors between cells in the reduced #' dimensions (e.g. PCA), and return the k nearest neighbors (knn) for each @@ -228,16 +346,24 @@ cluster_membership_matrix <- function(groups, group_order = NULL) { #' @param metric distance metric to use #' @param threads Number of threads to use. Note that result is non-deterministic #' if threads > 1 -#' @param ef ef parameter for RccppHNSW::hnsw_search. Increase for slower search but +#' @param ef ef parameter for `RcppHNSW::hnsw_search()`. Increase for slower search but #' improved accuracy #' @param verbose whether to print progress information during search -#' @return List of 2 matrices -- idx for cell x K neighbor indices, -#' dist for cell x K neighbor distances. -#' If no query is given, nearest neighbors are found mapping -#' the data matrix to itself, prohibiting self-neighbors +#' @return Named list of two matrices of dimension (cells x k): +#' - `idx`: Neighbor indices, where `idx[c, n]` is the index of the +#' nth nearest neighbor to cell `c`. +#' - `dist`: Neighbor distances, where `dist[c, n]` is the distance between +#' cell `c` and its nth nearest neighbor. +#' +#' If no query is given, nearest neighbors are found by mapping the data matrix to itself, +#' likely including self-neighbors (i.e. `idx[c,1] == c` for most cells). #' @export knn_hnsw <- function(data, query = NULL, k = 10, metric = c("euclidean", "cosine"), verbose = TRUE, threads = 1, ef = 100) { metric <- match.arg(metric) + assert_is(verbose, "logical") + assert_is_wholenumber(threads) + assert_has_package("RcppHNSW") + if (rlang::is_missing(data)) return(create_partial()) index <- RcppHNSW::hnsw_build( data, distance = metric, @@ -271,8 +397,11 @@ knn_hnsw <- function(data, query = NULL, k = 10, metric = c("euclidean", "cosine #' @param n_trees Number of trees during index build time. More trees gives higher accuracy #' @param search_k Number of nodes to inspect during the query, or -1 for default value. Higher number gives higher accuracy #' @export -knn_annoy <- function(data, query = data, k = 10, metric = c("euclidean", "cosine", "manhattan", "hamming"), n_trees = 50, search_k = -1) { +knn_annoy <- function(data, query = NULL, k = 10, metric = c("euclidean", "cosine", "manhattan", "hamming"), n_trees = 50, search_k = -1) { metric <- match.arg(metric) + assert_has_package("RcppAnnoy") + if (rlang::is_missing(data)) return(create_partial()) + if (is.null(query)) query <- data annoy <- switch(metric, "euclidean" = new(RcppAnnoy::AnnoyEuclidean, ncol(data)), "cosine" = new(RcppAnnoy::AnnoyAngular, ncol(data)), @@ -294,5 +423,6 @@ knn_annoy <- function(data, query = data, k = 10, metric = c("euclidean", "cosin dist[i, ] <- res$dist } if (metric == "cosine") dist <- 0.5 * (dist * dist) - list(idx = idx, dist = dist) -} + res <- list(idx = idx, dist = dist) + return(res) +} \ No newline at end of file diff --git a/r/R/errorChecking.R b/r/R/errorChecking.R index a2378fa6..441fcc29 100644 --- a/r/R/errorChecking.R +++ b/r/R/errorChecking.R @@ -114,6 +114,13 @@ assert_is <- function(object, class, n = 1) { } } +assert_is_mat <- function(object, n = 1) { + # matrices have length set to row*col instead of being 1, so we need to check dim as well + if (!canCoerce(object, "IterableMatrix")) { + pretty_error(object, "must either be an IterableMatrix or coercible to an IterableMatrix", n) + } +} + assert_true <- function(expr, n = 1) { if (!expr) pretty_error(expr, "is not true", n) } diff --git a/r/R/utils.R b/r/R/utils.R index 4ea62d15..70f7c4e1 100644 --- a/r/R/utils.R +++ b/r/R/utils.R @@ -45,15 +45,91 @@ document_granges <- function( ), intro_noun, bullets) } +# Add current timestamp to a character string. +add_timestamp <- function(msg) { + return(paste0(format(Sys.time(), "%Y-%m-%d %H:%M:%S "), msg)) +} + # Function which prints a message using shell echo. # Useful for printing messages from inside mclapply when running in Rstudio. log_progress <- function(msg, add_timestamp = TRUE){ if (add_timestamp) { - msg <- paste0(format(Sys.time(), "%Y-%m-%d %H:%M:%S "), msg) + msg <- add_timestamp(msg) } if (.Platform$GUI == "RStudio") { system(sprintf('echo "%s"', paste0(msg, collapse=""))) } else { message(msg) } +} + +#' Helper to create partial functions +#' +#' Automatically creates a partial application of the caller +#' function including all non-missing arguments. +#' +#' @return A `bpcells_partial` object (a function with some extra attributes) +#' @keywords internal +create_partial <- function() { + env <- rlang::caller_env() + fn_sym <- rlang::caller_call()[[1]] + fn <- rlang::caller_fn() + + args <- list() + for (n in names(formals(fn))) { + if (rlang::is_missing(env[[n]])) next + args[[n]] <- env[[n]] + } + + ret <- do.call(partial_apply, c(fn, args)) + attr(ret, "body")[[1]] <- fn_sym + return(ret) +} + + +#' Create partial function calls +#' +#' Specify some but not all arguments to a function. +#' +#' @param f A function +#' @param ... Named arguments to `f` +#' @param .overwrite (bool) If `f` is already an output from +#' `partial_apply()`, whether parameter re-definitions should +#' be ignored or overwrite the existing definitions +#' @param .missing_args_error (bool) If `TRUE`, passing in arguments +#' that are not in the function's signature will raise an error, otherwise +#' they will be ignored +#' @return A `bpcells_partial` object (a function with some extra attributes) +#' @keywords internal +partial_apply <- function(f, ..., .overwrite = TRUE, .missing_args_error = TRUE) { + args <- rlang::list2(...) + + if (is(f, "bpcells_partial")) { + prev_args <- attr(f, "args") + for (a in names(prev_args)) { + if (!(.overwrite && a %in% names(args))) { + args[[a]] <- prev_args[[a]] + } + } + f <- attr(f, "fn") + function_name <- attr(f, "body")[[1]] + } else { + function_name <- rlang::sym(rlang::caller_arg(f)) + } + # See which arguments do not exist in f + missing_args <- which(!names(args) %in% names(formals(f))) + if (length(missing_args) > 0) { + if (.missing_args_error) { + stop(sprintf("Arguments %s are not in the function signature", paste0(names(args)[missing_args], collapse=", "))) + } else { + args <- args[-missing_args]} + } + partial_fn <- do.call(purrr::partial, c(f, args)) + attr(partial_fn, "body")[[1]] <- function_name + structure( + partial_fn, + class = c("bpcells_partial", "purrr_function_partial", "function"), + args = args, + fn = f + ) } \ No newline at end of file diff --git a/r/man/cluster_cells_graph.Rd b/r/man/cluster_cells_graph.Rd new file mode 100644 index 00000000..0aaaa807 --- /dev/null +++ b/r/man/cluster_cells_graph.Rd @@ -0,0 +1,78 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/clustering.R +\name{cluster_cells_graph} +\alias{cluster_cells_graph} +\title{Cluster cell embeddings using a KNN graph-based algorithm} +\usage{ +cluster_cells_graph( + mat, + knn_method = knn_hnsw, + knn_to_graph_method = knn_to_geodesic_graph, + cluster_graph_method = cluster_graph_leiden, + threads = 0L, + verbose = FALSE +) +} +\arguments{ +\item{mat}{(matrix) Cell embeddings matrix of shape \verb{(cells x n_embeddings)}} + +\item{knn_method}{(function) Function to that takes an embedding matrix as the first argument and returns a k nearest neighbors (KNN) object. +For example, \code{knn_hnsw()}, \code{knn_annoy()}, or a parameterized version (see Details).} + +\item{knn_to_graph_method}{(function) Function that takes a KNN object and returns a graph as an undirected graph (lower-triangular dgCMatrix adjacency matrix). +For example, \code{knn_to_graph()}, \code{knn_to_snn_graph()}, \code{knn_to_geodesic_graph()}, or a parameterized version (see Details).} + +\item{cluster_graph_method}{(function) Function that takes an undirected graph of cell similarity and returns a factor with cluster assignments for each cell. +For example, \code{cluster_graph_leiden()}, \code{cluster_graph_louvain()}, \code{cluster_graph_seurat()}, or a parameterized version (see Details).} + +\item{threads}{(integer) Number of threads to use in \code{knn_method}, \code{knn_to_graph_method} and \code{cluster_graph_method}. If these functions do not utilize +a \code{threads} argument, this is silently ignored.} + +\item{verbose}{(logical) Whether to print progress information in \code{knn_method}, \code{knn_to_graph_method} and \code{cluster_graph_method}. If these functions do not utilize +a \code{verbose} argument, this is silently ignored.} +} +\value{ +(factor) Factor vector containing the cluster assignment for each cell. +} +\description{ +Take in a cell embedding matrix, then find k nearest neighbors (KNN) for each cell, convert the +KNN into a graph (adjacency matrix), then run a graph-based clustering algorithm. Each of these +steps can be customized by passing a function that performs the step (see details). +} +\details{ +\strong{Customizing clustering steps} + +All of the BPCells functions named like \verb{knn_*}, \verb{knn_to_graph_*}, and \verb{cluster_graph_*} support customizing parameters +via partial function application. For example, look for 20 neighbors during the k nearest neighbors search, setting +\code{knn_method=knn_hnsw(k=20)} is a convenient shortcut for \code{knn_method=function(x) knn_hnsw(x, k=20)}. Similarly, lowering the +default clustering resolution can be done as \code{cluster_graph_method=cluster_graph_louvain(resolution=0.5)}. This works because +all these functions are written to return a partially parameterized copy of themselves as a function object when their +first argument is missing. + +For even more advanced customization, users can manually call the \code{knn}, \code{knn_to_graph}, and \code{cluster_graph} methods rather +than using \code{cluster_cells_graph()} as a convenient wrapper. + +\strong{Implementing custom clustering steps} + +The required interfaces for each step are as follows: + +\strong{knn_method}: First argument is a matrix of cell embeddings, shape \verb{(cells x n_embeddings)}. +Returns a named list of two matrices of dimension (cells x k): +\itemize{ +\item \code{idx}: Neighbor indices, where \code{idx[c, n]} is the index of the +nth nearest neighbor to cell \code{c}. +\item \code{dist}: Neighbor distances, where \code{dist[c, n]} is the distance between +cell \code{c} and its nth nearest neighbor. +Self-neighbors are allowed, so with sufficient search effort \code{idx[c,1] == c} for nearly all cells. +} + +\strong{knn_to_graph_method}: First argument is a KNN object as returned by \code{knn_method}. +Returns a weighted similarity graph as a lower triangular sparse adjacency matrix (\code{dgCMatrix}). +For cells \code{i} and \code{j}, their similarity score is in \code{adjacency_mat[max(i,j), min(i,j)]}. + +\strong{cluster_graph_method}: First argument is a weighted similarity graph as returned by \code{knn_to_graph_method}. +Returns a factor vector of length \code{cells} with a cluster assignment for each cell. +} +\seealso{ +\code{knn_hnsw()} \code{knn_annoy()} \code{knn_to_graph()} \code{knn_to_snn_graph()} \code{knn_to_geodesic_graph()} \code{cluster_graph_leiden()} \code{cluster_graph_louvain()} \code{cluster_graph_seurat()} +} diff --git a/r/man/cluster.Rd b/r/man/cluster_graph.Rd similarity index 88% rename from r/man/cluster.Rd rename to r/man/cluster_graph.Rd index ce47c1db..0794ca6c 100644 --- a/r/man/cluster.Rd +++ b/r/man/cluster_graph.Rd @@ -7,19 +7,19 @@ \title{Cluster an adjacency matrix} \usage{ cluster_graph_leiden( - snn, + mat, resolution = 1, objective_function = c("modularity", "CPM"), seed = 12531, ... ) -cluster_graph_louvain(snn, resolution = 1, seed = 12531) +cluster_graph_louvain(mat, resolution = 1, seed = 12531) -cluster_graph_seurat(snn, resolution = 0.8, ...) +cluster_graph_seurat(mat, resolution = 0.8, ...) } \arguments{ -\item{snn}{Symmetric adjacency matrix (dgCMatrix) output from e.g. \code{knn_to_snn_graph()} or \code{knn_to_geodesic_graph()}. Only the lower triangle is used} +\item{mat}{Symmetric adjacency matrix (dgCMatrix) output from e.g. \code{knn_to_snn_graph()} or \code{knn_to_geodesic_graph()}. Only the lower triangle is used.} \item{resolution}{Resolution parameter. Higher values result in more clusters} diff --git a/r/man/create_partial.Rd b/r/man/create_partial.Rd new file mode 100644 index 00000000..b8bc6939 --- /dev/null +++ b/r/man/create_partial.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{create_partial} +\alias{create_partial} +\title{Helper to create partial functions} +\usage{ +create_partial() +} +\value{ +A \code{bpcells_partial} object (a function with some extra attributes) +} +\description{ +Automatically creates a partial application of the caller +function including all non-missing arguments. +} +\keyword{internal} diff --git a/r/man/is_adjacency_matrix.Rd b/r/man/is_adjacency_matrix.Rd new file mode 100644 index 00000000..c44ce669 --- /dev/null +++ b/r/man/is_adjacency_matrix.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/clustering.R +\name{is_adjacency_matrix} +\alias{is_adjacency_matrix} +\title{Check if an input is a graph adjacency matrix.} +\usage{ +is_adjacency_matrix(mat) +} +\value{ +TRUE if the mat is a graph adjacency matrix, FALSE otherwise +} +\description{ +Clustering functions like \code{cluster_graph_leiden()} and \code{cluster_graph_louvain()} require a graph adjacency matrix as input. +We assume that any square \code{dgCMatrix} is a graph adjacency matrix. +} +\keyword{internal} diff --git a/r/man/is_knn_object.Rd b/r/man/is_knn_object.Rd new file mode 100644 index 00000000..5713e23f --- /dev/null +++ b/r/man/is_knn_object.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/clustering.R +\name{is_knn_object} +\alias{is_knn_object} +\title{Check if an input is a kNN object} +\usage{ +is_knn_object(mat) +} +\value{ +TRUE if the mat is a knn object, FALSE otherwise +} +\description{ +knn object functions \code{knn_hnsw()} and \code{knn_annoy()} return a list of two matrices, \code{idx} and \code{dist}. +These are used as inputs to create graph adjacency matrices for clustering. +Assume any list with at least both \code{idx} and \code{dist} items is a kNN object. +} +\keyword{internal} diff --git a/r/man/knn.Rd b/r/man/knn.Rd index 3efbc9e7..07d97a07 100644 --- a/r/man/knn.Rd +++ b/r/man/knn.Rd @@ -3,7 +3,7 @@ \name{knn_hnsw} \alias{knn_hnsw} \alias{knn_annoy} -\title{Get a knn matrix from reduced dimensions} +\title{Get a knn object from reduced dimensions} \usage{ knn_hnsw( data, @@ -17,7 +17,7 @@ knn_hnsw( knn_annoy( data, - query = data, + query = NULL, k = 10, metric = c("euclidean", "cosine", "manhattan", "hamming"), n_trees = 50, @@ -38,7 +38,7 @@ knn_annoy( \item{threads}{Number of threads to use. Note that result is non-deterministic if threads > 1} -\item{ef}{ef parameter for RccppHNSW::hnsw_search. Increase for slower search but +\item{ef}{ef parameter for \code{RcppHNSW::hnsw_search()}. Increase for slower search but improved accuracy} \item{n_trees}{Number of trees during index build time. More trees gives higher accuracy} @@ -46,10 +46,16 @@ improved accuracy} \item{search_k}{Number of nodes to inspect during the query, or -1 for default value. Higher number gives higher accuracy} } \value{ -List of 2 matrices -- idx for cell x K neighbor indices, -dist for cell x K neighbor distances. -If no query is given, nearest neighbors are found mapping -the data matrix to itself, prohibiting self-neighbors +Named list of two matrices of dimension (cells x k): +\itemize{ +\item \code{idx}: Neighbor indices, where \code{idx[c, n]} is the index of the +nth nearest neighbor to cell \code{c}. +\item \code{dist}: Neighbor distances, where \code{dist[c, n]} is the distance between +cell \code{c} and its nth nearest neighbor. +} + +If no query is given, nearest neighbors are found by mapping the data matrix to itself, +likely including self-neighbors (i.e. \code{idx[c,1] == c} for most cells). } \description{ Search for approximate nearest neighbors between cells in the reduced diff --git a/r/man/partial_apply.Rd b/r/man/partial_apply.Rd new file mode 100644 index 00000000..149eb19f --- /dev/null +++ b/r/man/partial_apply.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{partial_apply} +\alias{partial_apply} +\title{Create partial function calls} +\usage{ +partial_apply(f, ..., .overwrite = TRUE, .missing_args_error = TRUE) +} +\arguments{ +\item{f}{A function} + +\item{...}{Named arguments to \code{f}} + +\item{.overwrite}{(bool) If \code{f} is already an output from +\code{partial_apply()}, whether parameter re-definitions should +be ignored or overwrite the existing definitions} + +\item{.missing_args_error}{(bool) If \code{TRUE}, passing in arguments +that are not in the function's signature will raise an error, otherwise +they will be ignored} +} +\value{ +A \code{bpcells_partial} object (a function with some extra attributes) +} +\description{ +Specify some but not all arguments to a function. +} +\keyword{internal} diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index 1d4c6026..a8b2f2e9 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -152,6 +152,7 @@ reference: - title: "Clustering" - contents: + - cluster_cells_graph - knn_hnsw - cluster_graph_leiden - knn_to_graph diff --git a/r/tests/testthat/test-clustering.R b/r/tests/testthat/test-clustering.R index b17723c2..f597eaeb 100644 --- a/r/tests/testthat/test-clustering.R +++ b/r/tests/testthat/test-clustering.R @@ -1,4 +1,4 @@ -# Copyright 2024 BPCells contributors +# Copyright 2025 BPCells contributors # # Licensed under the Apache License, Version 2.0 or the MIT license @@ -19,11 +19,10 @@ test_that("C++ SNN calculation works",{ neighbor_sim[i,] <- sample.int(cells, k) } } - + input <- list(idx = neighbor_sim, dist = matrix(runif(cells*k), nrow=cells)) min_val <- 1/15 - snn <- knn_to_snn_graph(list(idx=neighbor_sim), min_val=min_val, self_loops=TRUE) - - mat <- knn_to_graph(list(idx=neighbor_sim), use_weights=FALSE) + snn <- knn_to_snn_graph(input, min_val=min_val, self_loops=TRUE) + mat <- knn_to_graph(input, use_weights=FALSE) mat <- mat %*% t(mat) mat <- mat / (2 * k - mat) @@ -35,7 +34,7 @@ test_that("C++ SNN calculation works",{ snn, as(mat, "dgCMatrix") ) - snn2 <- knn_to_snn_graph(list(idx=neighbor_sim), min_val=min_val, self_loops=FALSE) + snn2 <- knn_to_snn_graph(input, min_val=min_val, self_loops=FALSE) diag(mat) <- 0 mat <- Matrix::drop0(mat) expect_identical( @@ -73,4 +72,23 @@ test_that("igraph clustering doesn't crash", { }) expect_no_condition(cluster_graph_louvain(graph)) +}) + +test_that("cluster_cells_graph works", { + skip_if_not_installed("RcppAnnoy") + skip_if_not_installed("RcppHNSW") + mat <- matrix(sample.int(1000, 10000, replace=TRUE), nrow=1000) + # check with default params + res <- expect_no_error(cluster_cells_graph(mat)) + # check with threads, method partialization + expect_true(class(res) == "factor") + expect_equal(nrow(mat), length(res)) + res_partialized <- expect_no_error( + cluster_cells_graph( + mat, knn_method = knn_annoy(k = 9), + knn_to_graph_method = knn_to_snn_graph(min_val = 1/10), + cluster_graph_method = cluster_graph_louvain(resolution = 0.8), + )) + expect_true(class(res) == "factor") + expect_equal(nrow(mat), length(res)) }) \ No newline at end of file diff --git a/r/tests/testthat/test-errorChecking.R b/r/tests/testthat/test-errorChecking.R new file mode 100644 index 00000000..355a5d29 --- /dev/null +++ b/r/tests/testthat/test-errorChecking.R @@ -0,0 +1,19 @@ +# Copyright 2025 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + +test_that("assert_is_mat works", { + mat <- matrix(1:4, nrow = 2) + mat_dgc <- as(mat, "dgCMatrix") + mat_iterable <- as(mat, "IterableMatrix") + expect_no_error(assert_is_mat(mat)) + expect_no_error(assert_is_mat(mat_dgc)) + expect_error(assert_is_mat(c(mat_iterable, mat_iterable))) + expect_error(assert_is_mat("a")) + expect_error(assert_is_mat(c("a", "a"))) + expect_error(assert_is_mat(1)) +}) \ No newline at end of file From 569e1df53740bd6b6b1bf6dac0e3a7c1ab6d90ec Mon Sep 17 00:00:00 2001 From: Immanuel Abdi Date: Wed, 30 Jul 2025 14:10:39 -0700 Subject: [PATCH 2/3] [r] update NEWS.md --- r/NEWS.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/r/NEWS.md b/r/NEWS.md index f5998f88..0f769caa 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -1,8 +1,10 @@ # BPCells 0.4.0 (in-progress main branch) ## Breaking changes -- Change first parameter name of `cluster_graph_leiden()`, `cluster_graph_louvain()` and `cluster_graph_seurat()` from `snn` to `mat` to more accurately reflect the input type. (pull request #189) +- Change first parameter name of `cluster_graph_leiden()`, `cluster_graph_louvain()` and `cluster_graph_seurat()` from `snn` to `mat` to more accurately reflect the input type. (pull request #292) +## Features +- Create a wrapper function `cluster_cells_graph()` that wraps the steps of knn object creation, graph adjacency creation, and clustering all within a single function (pull request #292) # BPCells 0.3.1 (7/21/2025) From 3df0e3570d2cf853c7defd5df2ea4158cd8fe417 Mon Sep 17 00:00:00 2001 From: Immanuel Abdi Date: Wed, 30 Jul 2025 14:46:20 -0700 Subject: [PATCH 3/3] [r] small update to website for `qc_scATAC` --- r/R/atac_utils.R | 2 +- r/man/qc_scATAC.Rd | 2 +- r/tools/hwy-test.cpp | 7 ------- 3 files changed, 2 insertions(+), 9 deletions(-) delete mode 100644 r/tools/hwy-test.cpp diff --git a/r/R/atac_utils.R b/r/R/atac_utils.R index 961f63e0..e78d1493 100644 --- a/r/R/atac_utils.R +++ b/r/R/atac_utils.R @@ -176,7 +176,7 @@ footprint <- function(fragments, ranges, zero_based_coords = !is(ranges, "GRange #' as `(monoNucleosomal + multiNucleosomal) / subNucleosomal`. #' @examples #' ## Prep data -#' frags <- get_demo_frags() +#' frags <- get_demo_frags(subset = FALSE) #' reference_dir <- file.path(tempdir(), "references") #' genes <- read_gencode_transcripts( #' reference_dir, diff --git a/r/man/qc_scATAC.Rd b/r/man/qc_scATAC.Rd index fcf20061..476b9a7b 100644 --- a/r/man/qc_scATAC.Rd +++ b/r/man/qc_scATAC.Rd @@ -54,7 +54,7 @@ as \code{(monoNucleosomal + multiNucleosomal) / subNucleosomal}. } \examples{ ## Prep data -frags <- get_demo_frags() +frags <- get_demo_frags(subset = FALSE) reference_dir <- file.path(tempdir(), "references") genes <- read_gencode_transcripts( reference_dir, diff --git a/r/tools/hwy-test.cpp b/r/tools/hwy-test.cpp deleted file mode 100644 index 4b49f547..00000000 --- a/r/tools/hwy-test.cpp +++ /dev/null @@ -1,7 +0,0 @@ -#include -#include - -int main() { - auto alignedAlloc = hwy::AllocateAligned(1024); - return 0; -} \ No newline at end of file