From d28c0e9e615008e347684a33407597a90bfa50dc Mon Sep 17 00:00:00 2001
From: Immanuel Abdi <immanuelazn@gmail.com>
Date: Mon, 28 Jul 2025 13:43:20 -0700
Subject: [PATCH 1/3] [r] add clustering + partialized function changes

---
 r/DESCRIPTION                          |   4 +-
 r/NAMESPACE                            |   1 +
 r/NEWS.md                              |   6 +
 r/R/clustering.R                       | 184 +++++++++++++++++++++----
 r/R/errorChecking.R                    |   7 +
 r/R/utils.R                            |  78 ++++++++++-
 r/man/cluster_cells_graph.Rd           |  78 +++++++++++
 r/man/{cluster.Rd => cluster_graph.Rd} |   8 +-
 r/man/create_partial.Rd                |  16 +++
 r/man/is_adjacency_matrix.Rd           |  16 +++
 r/man/is_knn_object.Rd                 |  17 +++
 r/man/knn.Rd                           |  20 ++-
 r/man/partial_apply.Rd                 |  28 ++++
 r/pkgdown/_pkgdown.yml                 |   1 +
 r/tests/testthat/test-clustering.R     |  30 +++-
 r/tests/testthat/test-errorChecking.R  |  19 +++
 16 files changed, 467 insertions(+), 46 deletions(-)
 create mode 100644 r/man/cluster_cells_graph.Rd
 rename r/man/{cluster.Rd => cluster_graph.Rd} (88%)
 create mode 100644 r/man/create_partial.Rd
 create mode 100644 r/man/is_adjacency_matrix.Rd
 create mode 100644 r/man/is_knn_object.Rd
 create mode 100644 r/man/partial_apply.Rd
 create mode 100644 r/tests/testthat/test-errorChecking.R

diff --git a/r/DESCRIPTION b/r/DESCRIPTION
index 7fada48d..3dd6ad3f 100644
--- a/r/DESCRIPTION
+++ b/r/DESCRIPTION
@@ -49,7 +49,9 @@ Suggests:
     IRanges,
     GenomicRanges,
     matrixStats,
-    igraph
+    igraph,
+    RcppHNSW,
+    RcppAnnoy
 Depends: 
     R (>= 4.0.0)
 Config/Needs/website: pkgdown, devtools, uwot, irlba, RcppHNSW, igraph, BiocManager, bioc::BSgenome.Hsapiens.UCSC.hg38, github::GreenleafLab/motifmatchr, github::GreenleafLab/chromVARmotifs, png, magrittr
diff --git a/r/NAMESPACE b/r/NAMESPACE
index 29a7cdcc..a7f473cf 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -21,6 +21,7 @@ export(canonical_gene_symbol)
 export(cellNames)
 export(checksum)
 export(chrNames)
+export(cluster_cells_graph)
 export(cluster_graph_leiden)
 export(cluster_graph_louvain)
 export(cluster_graph_seurat)
diff --git a/r/NEWS.md b/r/NEWS.md
index 4c9e6988..f5998f88 100644
--- a/r/NEWS.md
+++ b/r/NEWS.md
@@ -1,3 +1,9 @@
+# BPCells 0.4.0 (in-progress main branch)
+
+## Breaking changes
+- Change first parameter name of `cluster_graph_leiden()`, `cluster_graph_louvain()` and `cluster_graph_seurat()` from `snn` to `mat` to more accurately reflect the input type.  (pull request #189)
+
+
 # BPCells 0.3.1 (7/21/2025)
 
 The BPCells 0.3.1 release covers 7 months of changes and 40 commits from 5 contributors. Notable changes include writing matrices in AnnData's dense format, 
diff --git a/r/R/clustering.R b/r/R/clustering.R
index b53a4edf..9b1dd58d 100644
--- a/r/R/clustering.R
+++ b/r/R/clustering.R
@@ -6,6 +6,103 @@
 # option. This file may not be copied, modified, or distributed
 # except according to those terms.
 
+
+#' Check if an input is a kNN object
+#'
+#' knn object functions `knn_hnsw()` and `knn_annoy()` return a list of two matrices, `idx` and `dist`.
+#' These are used as inputs to create graph adjacency matrices for clustering.
+#' Assume any list with at least both `idx` and `dist` items is a kNN object.
+#' @return TRUE if the mat is a knn object, FALSE otherwise
+#' @keywords internal
+is_knn_object <- function(mat) {
+  return(is(mat, "list") && all(c("idx", "dist") %in% names(mat)))
+}
+
+#' Check if an input is a graph adjacency matrix.
+#'
+#' Clustering functions like `cluster_graph_leiden()` and `cluster_graph_louvain()` require a graph adjacency matrix as input.
+#' We assume that any square `dgCMatrix` is a graph adjacency matrix.
+#' @return TRUE if the mat is a graph adjacency matrix, FALSE otherwise
+#' @keywords internal
+is_adjacency_matrix <- function(mat) {
+  return(is(mat, "dgCMatrix") && nrow(mat) == ncol(mat))
+}
+
+#' Cluster cell embeddings using a KNN graph-based algorithm
+#' 
+#' Take in a cell embedding matrix, then find k nearest neighbors (KNN) for each cell, convert the
+#' KNN into a graph (adjacency matrix), then run a graph-based clustering algorithm. Each of these
+#' steps can be customized by passing a function that performs the step (see details).
+#' 
+#' @param mat (matrix) Cell embeddings matrix of shape `(cells x n_embeddings)`
+#' @param knn_method (function) Function to that takes an embedding matrix as the first argument and returns a k nearest neighbors (KNN) object.
+#'   For example, `knn_hnsw()`, `knn_annoy()`, or a parameterized version (see Details).
+#' @param knn_to_graph_method (function) Function that takes a KNN object and returns a graph as an undirected graph (lower-triangular dgCMatrix adjacency matrix).
+#'   For example, `knn_to_graph()`, `knn_to_snn_graph()`, `knn_to_geodesic_graph()`, or a parameterized version (see Details).
+#' @param cluster_graph_method (function) Function that takes an undirected graph of cell similarity and returns a factor with cluster assignments for each cell.
+#'   For example, `cluster_graph_leiden()`, `cluster_graph_louvain()`, `cluster_graph_seurat()`, or a parameterized version (see Details).
+#' @param threads (integer) Number of threads to use in `knn_method`, `knn_to_graph_method` and `cluster_graph_method`.  If these functions do not utilize
+#' a `threads` argument, this is silently ignored.
+#' @param verbose (logical) Whether to print progress information in `knn_method`, `knn_to_graph_method` and `cluster_graph_method`.  If these functions do not utilize 
+#' a `verbose` argument, this is silently ignored.
+#' @returns (factor) Factor vector containing the cluster assignment for each cell.
+#' @details 
+#' 
+#' **Customizing clustering steps**
+#' 
+#' All of the BPCells functions named like `knn_*`, `knn_to_graph_*`, and `cluster_graph_*` support customizing parameters
+#' via partial function application. For example, look for 20 neighbors during the k nearest neighbors search, setting
+#' `knn_method=knn_hnsw(k=20)` is a convenient shortcut for `knn_method=function(x) knn_hnsw(x, k=20)`. Similarly, lowering the 
+#'  default clustering resolution can be done as `cluster_graph_method=cluster_graph_louvain(resolution=0.5)`. This works because
+#' all these functions are written to return a partially parameterized copy of themselves as a function object when their
+#' first argument is missing.
+#'
+#' For even more advanced customization, users can manually call the `knn`, `knn_to_graph`, and `cluster_graph` methods rather
+#' than using `cluster_cells_graph()` as a convenient wrapper.
+#'
+#' **Implementing custom clustering steps**
+#' 
+#' The required interfaces for each step are as follows:
+#'
+#' **knn_method**: First argument is a matrix of cell embeddings, shape `(cells x n_embeddings)`.
+#'  Returns a named list of two matrices of dimension (cells x k):
+#'  - `idx`: Neighbor indices, where `idx[c, n]` is the index of the
+#'    nth nearest neighbor to cell `c`.
+#'  - `dist`: Neighbor distances, where `dist[c, n]` is the distance between
+#'    cell `c` and its nth nearest neighbor.
+#'  Self-neighbors are allowed, so with sufficient search effort `idx[c,1] == c` for nearly all cells.
+#' 
+#' **knn_to_graph_method**: First argument is a KNN object as returned by `knn_method`.
+#'  Returns a weighted similarity graph as a lower triangular sparse adjacency matrix (`dgCMatrix`). 
+#'  For cells `i` and `j`, their similarity score is in `adjacency_mat[max(i,j), min(i,j)]`.
+#'
+#' **cluster_graph_method**: First argument is a weighted similarity graph as returned by `knn_to_graph_method`.
+#'  Returns a factor vector of length `cells` with a cluster assignment for each cell.
+#'
+#' @seealso `knn_hnsw()` `knn_annoy()` `knn_to_graph()` `knn_to_snn_graph()` `knn_to_geodesic_graph()` `cluster_graph_leiden()` `cluster_graph_louvain()` `cluster_graph_seurat()` 
+#' @export
+cluster_cells_graph <- function(
+  mat, knn_method = knn_hnsw, 
+  knn_to_graph_method = knn_to_geodesic_graph, 
+  cluster_graph_method = cluster_graph_leiden, 
+  threads = 0L, verbose = FALSE
+) {
+  assert_is_wholenumber(threads)
+  assert_is(verbose, "logical")
+  if (rlang::is_missing(mat)) return(create_partial())
+  assert_is(mat, "matrix")
+  # There currently aren't any `knn_to_graph` functions that utilize a verbose argument.  
+  # However, we still pass `verbose` in case future functions do provide this functionality.
+  mat <- partial_apply(knn_method, threads = threads, verbose = verbose, .missing_args_error = FALSE)(mat)
+  if (!is_knn_object(mat)) pretty_error(mat, "`knn_method` was unable to convert `mat` into a knn object", 1)
+  # Return type has to be constrained to "matrix", so this is silently provided.
+  mat <- partial_apply(knn_to_graph_method, threads = threads, verbose = verbose, return_type = "matrix", .missing_args_error = FALSE)(mat)
+  if (!is_adjacency_matrix(mat)) pretty_error(mat, "`knn_to_graph_method` was unable to convert `mat` from a knn object to a graph adjacency matrix", 1)
+  # Also pass verbose and threads to clustering functions in case they are given these params in the future
+  mat <- partial_apply(cluster_graph_method, threads = threads, verbose = verbose, .missing_args_error = FALSE)(mat)
+  return(mat)
+}
+
 #' K Nearest Neighbor (KNN) Graph
 #'
 #' Convert a KNN object (e.g. returned by `knn_hnsw()` or `knn_annoy()`) into
@@ -22,6 +119,11 @@
 #'   Sparse matrix (dgCMatrix) where `mat[i,j]` = distance from cell `i` to
 #'   cell `j`, or 0 if cell `j` is not in the K nearest neighbors of `i`
 knn_to_graph <- function(knn, use_weights = FALSE, self_loops = TRUE) {
+  assert_is(use_weights, "logical")
+  assert_is(self_loops, "logical")
+
+  if (rlang::is_missing(knn)) return(create_partial())
+  assert_true(is_knn_object(knn))
   if (use_weights) {
     weights <- knn$dist
   } else {
@@ -41,7 +143,6 @@ knn_to_graph <- function(knn, use_weights = FALSE, self_loops = TRUE) {
   mat
 }
 
-
 #' @rdname knn_graph
 #' @details **knn_to_snn_graph**
 #'  Convert a knn object into a shared nearest neighbors adjacency matrix.
@@ -63,6 +164,9 @@ knn_to_graph <- function(knn, use_weights = FALSE, self_loops = TRUE) {
 #' @export
 knn_to_snn_graph <- function(knn, min_val = 1 / 15, self_loops = FALSE, return_type=c("matrix", "list")) {
   return_type <- match.arg(return_type)
+  assert_is(self_loops, "logical")
+  if (rlang::is_missing(knn)) return(create_partial())
+  assert_true(is_knn_object(knn))
   # Solve x / (2*K - x) >= min_val --> x >= 2*K*min_val / (1 + min_val)
   min_int <- ceiling(2*min_val*ncol(knn$idx) / (1 + min_val))
   snn <- build_snn_graph_cpp(knn$idx, min_neighbors = min_int)
@@ -83,13 +187,15 @@ knn_to_snn_graph <- function(knn, min_val = 1 / 15, self_loops = FALSE, return_t
   }
   
   # Return as a sparse matrix
-  Matrix::sparseMatrix(
+  res <- Matrix::sparseMatrix(
     i = snn$i + 1L, j = snn$j + 1L, x = snn$weight,
     dims = c(snn$dim, snn$dim)
   )
+  return(res)
 }
 
 #' @rdname knn_graph
+#' @param threads Number of threads to use during calculations
 #' @details **knn_to_geodesic_graph**
 #'  Convert a knn object into an undirected weighted graph, using the same 
 #'  geodesic distance estimation method as the UMAP package. 
@@ -101,7 +207,6 @@ knn_to_snn_graph <- function(knn, min_val = 1 / 15, self_loops = FALSE, return_t
 #'  neighbor, results may differ slightly from `umap._umap.fuzzy_simplicial_set`, which
 #'  assumes self is always successfully found in the approximate nearest neighbor search.
 #'  
-#' @param threads Number of threads to use during calculations
 #' @return **knn_to_geodesic_graph**
 #' - `return_type == "matrix"`:
 #'      Sparse matrix (dgCMatrix) where `mat[i,j]` = normalized similarity between cell `i` and cell `j`.
@@ -111,29 +216,31 @@ knn_to_snn_graph <- function(knn, min_val = 1 / 15, self_loops = FALSE, return_t
 #'      These correspond to the rows, cols, and values of non-zero entries in the lower triangle
 #'      adjacency matrix. `dim` is the total number of vertices (cells) in the graph 
 #' @export
-knn_to_geodesic_graph <- function(knn, return_type=c("matrix", "list"), threads=0L) {
+knn_to_geodesic_graph <- function(knn, return_type = c("matrix", "list"), threads = 0L) {
   return_type <- match.arg(return_type)
+  assert_is_wholenumber(threads)
+  if (rlang::is_missing(knn)) return(create_partial())
+  assert_true(is_knn_object(knn))
   graph <- build_umap_graph_cpp(knn$dist, knn$idx, threads=threads)
   
   graph$dim <- nrow(knn$idx)
-  if (return_type == "list") {
-    return(graph)
-  }
+  if (return_type == "list") return(graph)
   
   # Return as a sparse matrix
-  Matrix::sparseMatrix(
+  res <- Matrix::sparseMatrix(
     i = graph$i + 1L, j = graph$j + 1L, x = graph$weight,
     dims = c(graph$dim, graph$dim)
   )
+  return(res)
 }
 
 #' Cluster an adjacency matrix
-#' @rdname cluster
+#' @rdname cluster_graph
 #' @details **cluster_graph_leiden**: Leiden clustering algorithm `igraph::cluster_leiden()`. 
 #'    Note that when using `objective_function = "CPM"` the number of clusters empirically scales with `cells * resolution`,
 #'    so 1e-3 is a good resolution for 10k cells, but 1M cells is better with a 1e-5 resolution. A resolution of 1 is a 
 #'    good default when `objective_function = "modularity"` per the default.
-#' @param snn Symmetric adjacency matrix (dgCMatrix) output from e.g. `knn_to_snn_graph()` or `knn_to_geodesic_graph()`. Only the lower triangle is used
+#' @param mat Symmetric adjacency matrix (dgCMatrix) output from e.g. `knn_to_snn_graph()` or `knn_to_geodesic_graph()`. Only the lower triangle is used.
 #' @param resolution Resolution parameter. Higher values result in more clusters
 #' @param objective_function Graph statistic to optimize during clustering. Modularity is the default as it keeps resolution independent of dataset size (see details below). 
 #'    For the meaning of each option, see `igraph::cluster_leiden()`.
@@ -141,44 +248,55 @@ knn_to_geodesic_graph <- function(knn, return_type=c("matrix", "list"), threads=
 #' @param ... Additional arguments to underlying clustering function
 #' @return Factor vector containing the cluster assignment for each cell.
 #' @export
-cluster_graph_leiden <- function(snn, resolution = 1, objective_function = c("modularity", "CPM"), seed = 12531, ...) {
+cluster_graph_leiden <- function(
+  mat, resolution = 1, objective_function = c("modularity", "CPM"),
+  seed = 12531, ...
+) {
   assert_has_package("igraph")
   # Set seed without permanently changing seed state
+  if (rlang::is_missing(mat)) return(create_partial())
   prev_seed <- get_seed()
   on.exit(restore_seed(prev_seed), add = TRUE)
   set.seed(seed)
 
   objective_function <- match.arg(objective_function)
 
-  igraph::graph_from_adjacency_matrix(snn, weighted = TRUE, diag = FALSE, mode = "lower") %>%
+  igraph::graph_from_adjacency_matrix(mat, weighted = TRUE, diag = FALSE, mode = "lower") %>%
     igraph::cluster_leiden(resolution_parameter = resolution, objective_function=objective_function, ...) %>%
     igraph::membership() %>%
     as.factor()
 }
 
 
-#' @rdname cluster
+#' @rdname cluster_graph
 #' @details **cluster_graph_louvain**: Louvain graph clustering algorithm `igraph::cluster_louvain()`
 #' @export
-cluster_graph_louvain <- function(snn, resolution = 1, seed = 12531) {
+cluster_graph_louvain <- function(
+  mat, resolution = 1, seed = 12531
+) {
   assert_has_package("igraph")
   # Set seed without permanently changing seed state
+  if (rlang::is_missing(mat)) return(create_partial())
+
   prev_seed <- get_seed()
   on.exit(restore_seed(prev_seed), add = TRUE)
   set.seed(seed)
 
-  igraph::graph_from_adjacency_matrix(snn, weighted = TRUE, diag = FALSE, mode = "lower") %>%
+  igraph::graph_from_adjacency_matrix(mat, weighted = TRUE, diag = FALSE, mode = "lower") %>%
     igraph::cluster_louvain(resolution = resolution) %>%
     igraph::membership() %>%
     as.factor()
 }
 
-#' @rdname cluster
+#' @rdname cluster_graph
 #' @details **cluster_graph_seurat**: Seurat's clustering algorithm `Seurat::FindClusters()`
 #' @export
-cluster_graph_seurat <- function(snn, resolution = 0.8, ...) {
+cluster_graph_seurat <- function(
+  mat, resolution = 0.8, ...
+) {
   assert_has_package("Seurat")
-  Seurat::as.Graph(snn) %>%
+  if (rlang::is_missing(mat)) return(create_partial())
+  Seurat::as.Graph(mat) %>%
     Seurat::FindClusters(resolution = resolution, ...) %>%
     .[[1]]
 }
@@ -213,7 +331,7 @@ cluster_membership_matrix <- function(groups, group_order = NULL) {
 }
 
 
-#' Get a knn matrix from reduced dimensions
+#' Get a knn object from reduced dimensions
 #'
 #' Search for approximate nearest neighbors between cells in the reduced
 #' dimensions (e.g. PCA), and return the k nearest neighbors (knn) for each
@@ -228,16 +346,24 @@ cluster_membership_matrix <- function(groups, group_order = NULL) {
 #' @param metric distance metric to use
 #' @param threads Number of threads to use. Note that result is non-deterministic
 #'          if threads > 1
-#' @param ef ef parameter for RccppHNSW::hnsw_search. Increase for slower search but
+#' @param ef ef parameter for `RcppHNSW::hnsw_search()`. Increase for slower search but
 #'          improved accuracy
 #' @param verbose whether to print progress information during search
-#' @return List of 2 matrices -- idx for cell x K neighbor indices,
-#'         dist for cell x K neighbor distances.
-#'         If no query is given, nearest neighbors are found mapping
-#'         the data matrix to itself, prohibiting self-neighbors
+#' @return Named list of two matrices of dimension (cells x k):
+#'  - `idx`: Neighbor indices, where `idx[c, n]` is the index of the
+#'    nth nearest neighbor to cell `c`.
+#'  - `dist`: Neighbor distances, where `dist[c, n]` is the distance between
+#'    cell `c` and its nth nearest neighbor.
+#'
+#'  If no query is given, nearest neighbors are found by mapping the data matrix to itself, 
+#'  likely including self-neighbors (i.e. `idx[c,1] == c` for most cells).
 #' @export
 knn_hnsw <- function(data, query = NULL, k = 10, metric = c("euclidean", "cosine"), verbose = TRUE, threads = 1, ef = 100) {
   metric <- match.arg(metric)
+  assert_is(verbose, "logical")
+  assert_is_wholenumber(threads)
+  assert_has_package("RcppHNSW")
+  if (rlang::is_missing(data)) return(create_partial())
   index <- RcppHNSW::hnsw_build(
     data,
     distance = metric,
@@ -271,8 +397,11 @@ knn_hnsw <- function(data, query = NULL, k = 10, metric = c("euclidean", "cosine
 #' @param n_trees Number of trees during index build time. More trees gives higher accuracy
 #' @param search_k Number of nodes to inspect during the query, or -1 for default value. Higher number gives higher accuracy
 #' @export
-knn_annoy <- function(data, query = data, k = 10, metric = c("euclidean", "cosine", "manhattan", "hamming"), n_trees = 50, search_k = -1) {
+knn_annoy <- function(data, query = NULL, k = 10, metric = c("euclidean", "cosine", "manhattan", "hamming"), n_trees = 50, search_k = -1) {
   metric <- match.arg(metric)
+  assert_has_package("RcppAnnoy")
+  if (rlang::is_missing(data)) return(create_partial())
+  if (is.null(query)) query <- data
   annoy <- switch(metric,
     "euclidean" = new(RcppAnnoy::AnnoyEuclidean, ncol(data)),
     "cosine" = new(RcppAnnoy::AnnoyAngular, ncol(data)),
@@ -294,5 +423,6 @@ knn_annoy <- function(data, query = data, k = 10, metric = c("euclidean", "cosin
     dist[i, ] <- res$dist
   }
   if (metric == "cosine") dist <- 0.5 * (dist * dist)
-  list(idx = idx, dist = dist)
-}
+  res <- list(idx = idx, dist = dist)
+  return(res)
+}
\ No newline at end of file
diff --git a/r/R/errorChecking.R b/r/R/errorChecking.R
index a2378fa6..441fcc29 100644
--- a/r/R/errorChecking.R
+++ b/r/R/errorChecking.R
@@ -114,6 +114,13 @@ assert_is <- function(object, class, n = 1) {
   }
 }
 
+assert_is_mat <- function(object, n = 1) {
+  # matrices have length set to row*col instead of being 1, so we need to check dim as well
+  if (!canCoerce(object, "IterableMatrix")) {
+    pretty_error(object, "must either be an IterableMatrix or coercible to an IterableMatrix", n)
+  }
+}
+
 assert_true <- function(expr, n = 1) {
   if (!expr) pretty_error(expr, "is not true", n)
 }
diff --git a/r/R/utils.R b/r/R/utils.R
index 4ea62d15..70f7c4e1 100644
--- a/r/R/utils.R
+++ b/r/R/utils.R
@@ -45,15 +45,91 @@ document_granges <- function(
   ), intro_noun, bullets)
 }
 
+# Add current timestamp to a character string.
+add_timestamp <- function(msg) {
+  return(paste0(format(Sys.time(), "%Y-%m-%d %H:%M:%S "), msg))
+}
+
 # Function which prints a message using shell echo.
 # Useful for printing messages from inside mclapply when running in Rstudio.
 log_progress <- function(msg, add_timestamp = TRUE){
   if (add_timestamp) {
-    msg <- paste0(format(Sys.time(), "%Y-%m-%d %H:%M:%S "), msg)
+    msg <- add_timestamp(msg)
   }
   if (.Platform$GUI == "RStudio") {
     system(sprintf('echo "%s"', paste0(msg, collapse="")))
   } else {
     message(msg)
   }
+}
+
+#' Helper to create partial functions
+#'
+#' Automatically creates a partial application of the caller
+#' function including all non-missing arguments. 
+#'
+#' @return A `bpcells_partial` object (a function with some extra attributes)
+#' @keywords internal
+create_partial <- function() {
+  env <- rlang::caller_env()
+  fn_sym <- rlang::caller_call()[[1]]
+  fn <- rlang::caller_fn()
+
+  args <- list()
+  for (n in names(formals(fn))) {
+    if (rlang::is_missing(env[[n]])) next
+    args[[n]] <- env[[n]]
+  }
+
+  ret <- do.call(partial_apply, c(fn, args))
+  attr(ret, "body")[[1]] <- fn_sym
+  return(ret)
+}
+
+
+#' Create partial function calls
+#' 
+#' Specify some but not all arguments to a function.
+#'
+#' @param f A function
+#' @param ... Named arguments to `f`
+#' @param .overwrite (bool) If `f` is already an output from
+#'   `partial_apply()`, whether parameter re-definitions should
+#'   be ignored or overwrite the existing definitions
+#' @param .missing_args_error (bool) If `TRUE`, passing in arguments
+#'  that are not in the function's signature will raise an error, otherwise
+#'  they will be ignored
+#' @return A `bpcells_partial` object (a function with some extra attributes)
+#' @keywords internal
+partial_apply <- function(f, ..., .overwrite = TRUE, .missing_args_error = TRUE) {
+  args <- rlang::list2(...)
+
+  if (is(f, "bpcells_partial")) {
+    prev_args <- attr(f, "args")
+    for (a in names(prev_args)) {
+      if (!(.overwrite && a %in% names(args))) {
+        args[[a]] <- prev_args[[a]]
+      }
+    }
+    f <- attr(f, "fn")
+    function_name <- attr(f, "body")[[1]]
+  } else {
+    function_name <- rlang::sym(rlang::caller_arg(f))
+  }
+  # See which arguments do not exist in f
+  missing_args <- which(!names(args) %in% names(formals(f)))
+  if (length(missing_args) > 0) {
+    if (.missing_args_error) {
+      stop(sprintf("Arguments %s are not in the function signature", paste0(names(args)[missing_args], collapse=", ")))
+    } else {
+      args <- args[-missing_args]}
+  }
+  partial_fn <- do.call(purrr::partial, c(f, args))
+  attr(partial_fn, "body")[[1]] <- function_name
+  structure(
+    partial_fn,
+    class = c("bpcells_partial", "purrr_function_partial", "function"),
+    args = args,
+    fn = f
+  )
 }
\ No newline at end of file
diff --git a/r/man/cluster_cells_graph.Rd b/r/man/cluster_cells_graph.Rd
new file mode 100644
index 00000000..0aaaa807
--- /dev/null
+++ b/r/man/cluster_cells_graph.Rd
@@ -0,0 +1,78 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/clustering.R
+\name{cluster_cells_graph}
+\alias{cluster_cells_graph}
+\title{Cluster cell embeddings using a KNN graph-based algorithm}
+\usage{
+cluster_cells_graph(
+  mat,
+  knn_method = knn_hnsw,
+  knn_to_graph_method = knn_to_geodesic_graph,
+  cluster_graph_method = cluster_graph_leiden,
+  threads = 0L,
+  verbose = FALSE
+)
+}
+\arguments{
+\item{mat}{(matrix) Cell embeddings matrix of shape \verb{(cells x n_embeddings)}}
+
+\item{knn_method}{(function) Function to that takes an embedding matrix as the first argument and returns a k nearest neighbors (KNN) object.
+For example, \code{knn_hnsw()}, \code{knn_annoy()}, or a parameterized version (see Details).}
+
+\item{knn_to_graph_method}{(function) Function that takes a KNN object and returns a graph as an undirected graph (lower-triangular dgCMatrix adjacency matrix).
+For example, \code{knn_to_graph()}, \code{knn_to_snn_graph()}, \code{knn_to_geodesic_graph()}, or a parameterized version (see Details).}
+
+\item{cluster_graph_method}{(function) Function that takes an undirected graph of cell similarity and returns a factor with cluster assignments for each cell.
+For example, \code{cluster_graph_leiden()}, \code{cluster_graph_louvain()}, \code{cluster_graph_seurat()}, or a parameterized version (see Details).}
+
+\item{threads}{(integer) Number of threads to use in \code{knn_method}, \code{knn_to_graph_method} and \code{cluster_graph_method}.  If these functions do not utilize
+a \code{threads} argument, this is silently ignored.}
+
+\item{verbose}{(logical) Whether to print progress information in \code{knn_method}, \code{knn_to_graph_method} and \code{cluster_graph_method}.  If these functions do not utilize
+a \code{verbose} argument, this is silently ignored.}
+}
+\value{
+(factor) Factor vector containing the cluster assignment for each cell.
+}
+\description{
+Take in a cell embedding matrix, then find k nearest neighbors (KNN) for each cell, convert the
+KNN into a graph (adjacency matrix), then run a graph-based clustering algorithm. Each of these
+steps can be customized by passing a function that performs the step (see details).
+}
+\details{
+\strong{Customizing clustering steps}
+
+All of the BPCells functions named like \verb{knn_*}, \verb{knn_to_graph_*}, and \verb{cluster_graph_*} support customizing parameters
+via partial function application. For example, look for 20 neighbors during the k nearest neighbors search, setting
+\code{knn_method=knn_hnsw(k=20)} is a convenient shortcut for \code{knn_method=function(x) knn_hnsw(x, k=20)}. Similarly, lowering the
+default clustering resolution can be done as \code{cluster_graph_method=cluster_graph_louvain(resolution=0.5)}. This works because
+all these functions are written to return a partially parameterized copy of themselves as a function object when their
+first argument is missing.
+
+For even more advanced customization, users can manually call the \code{knn}, \code{knn_to_graph}, and \code{cluster_graph} methods rather
+than using \code{cluster_cells_graph()} as a convenient wrapper.
+
+\strong{Implementing custom clustering steps}
+
+The required interfaces for each step are as follows:
+
+\strong{knn_method}: First argument is a matrix of cell embeddings, shape \verb{(cells x n_embeddings)}.
+Returns a named list of two matrices of dimension (cells x k):
+\itemize{
+\item \code{idx}: Neighbor indices, where \code{idx[c, n]} is the index of the
+nth nearest neighbor to cell \code{c}.
+\item \code{dist}: Neighbor distances, where \code{dist[c, n]} is the distance between
+cell \code{c} and its nth nearest neighbor.
+Self-neighbors are allowed, so with sufficient search effort \code{idx[c,1] == c} for nearly all cells.
+}
+
+\strong{knn_to_graph_method}: First argument is a KNN object as returned by \code{knn_method}.
+Returns a weighted similarity graph as a lower triangular sparse adjacency matrix (\code{dgCMatrix}).
+For cells \code{i} and \code{j}, their similarity score is in \code{adjacency_mat[max(i,j), min(i,j)]}.
+
+\strong{cluster_graph_method}: First argument is a weighted similarity graph as returned by \code{knn_to_graph_method}.
+Returns a factor vector of length \code{cells} with a cluster assignment for each cell.
+}
+\seealso{
+\code{knn_hnsw()} \code{knn_annoy()} \code{knn_to_graph()} \code{knn_to_snn_graph()} \code{knn_to_geodesic_graph()} \code{cluster_graph_leiden()} \code{cluster_graph_louvain()} \code{cluster_graph_seurat()}
+}
diff --git a/r/man/cluster.Rd b/r/man/cluster_graph.Rd
similarity index 88%
rename from r/man/cluster.Rd
rename to r/man/cluster_graph.Rd
index ce47c1db..0794ca6c 100644
--- a/r/man/cluster.Rd
+++ b/r/man/cluster_graph.Rd
@@ -7,19 +7,19 @@
 \title{Cluster an adjacency matrix}
 \usage{
 cluster_graph_leiden(
-  snn,
+  mat,
   resolution = 1,
   objective_function = c("modularity", "CPM"),
   seed = 12531,
   ...
 )
 
-cluster_graph_louvain(snn, resolution = 1, seed = 12531)
+cluster_graph_louvain(mat, resolution = 1, seed = 12531)
 
-cluster_graph_seurat(snn, resolution = 0.8, ...)
+cluster_graph_seurat(mat, resolution = 0.8, ...)
 }
 \arguments{
-\item{snn}{Symmetric adjacency matrix (dgCMatrix) output from e.g. \code{knn_to_snn_graph()} or \code{knn_to_geodesic_graph()}. Only the lower triangle is used}
+\item{mat}{Symmetric adjacency matrix (dgCMatrix) output from e.g. \code{knn_to_snn_graph()} or \code{knn_to_geodesic_graph()}. Only the lower triangle is used.}
 
 \item{resolution}{Resolution parameter. Higher values result in more clusters}
 
diff --git a/r/man/create_partial.Rd b/r/man/create_partial.Rd
new file mode 100644
index 00000000..b8bc6939
--- /dev/null
+++ b/r/man/create_partial.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{create_partial}
+\alias{create_partial}
+\title{Helper to create partial functions}
+\usage{
+create_partial()
+}
+\value{
+A \code{bpcells_partial} object (a function with some extra attributes)
+}
+\description{
+Automatically creates a partial application of the caller
+function including all non-missing arguments.
+}
+\keyword{internal}
diff --git a/r/man/is_adjacency_matrix.Rd b/r/man/is_adjacency_matrix.Rd
new file mode 100644
index 00000000..c44ce669
--- /dev/null
+++ b/r/man/is_adjacency_matrix.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/clustering.R
+\name{is_adjacency_matrix}
+\alias{is_adjacency_matrix}
+\title{Check if an input is a graph adjacency matrix.}
+\usage{
+is_adjacency_matrix(mat)
+}
+\value{
+TRUE if the mat is a graph adjacency matrix, FALSE otherwise
+}
+\description{
+Clustering functions like \code{cluster_graph_leiden()} and \code{cluster_graph_louvain()} require a graph adjacency matrix as input.
+We assume that any square \code{dgCMatrix} is a graph adjacency matrix.
+}
+\keyword{internal}
diff --git a/r/man/is_knn_object.Rd b/r/man/is_knn_object.Rd
new file mode 100644
index 00000000..5713e23f
--- /dev/null
+++ b/r/man/is_knn_object.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/clustering.R
+\name{is_knn_object}
+\alias{is_knn_object}
+\title{Check if an input is a kNN object}
+\usage{
+is_knn_object(mat)
+}
+\value{
+TRUE if the mat is a knn object, FALSE otherwise
+}
+\description{
+knn object functions \code{knn_hnsw()} and \code{knn_annoy()} return a list of two matrices, \code{idx} and \code{dist}.
+These are used as inputs to create graph adjacency matrices for clustering.
+Assume any list with at least both \code{idx} and \code{dist} items is a kNN object.
+}
+\keyword{internal}
diff --git a/r/man/knn.Rd b/r/man/knn.Rd
index 3efbc9e7..07d97a07 100644
--- a/r/man/knn.Rd
+++ b/r/man/knn.Rd
@@ -3,7 +3,7 @@
 \name{knn_hnsw}
 \alias{knn_hnsw}
 \alias{knn_annoy}
-\title{Get a knn matrix from reduced dimensions}
+\title{Get a knn object from reduced dimensions}
 \usage{
 knn_hnsw(
   data,
@@ -17,7 +17,7 @@ knn_hnsw(
 
 knn_annoy(
   data,
-  query = data,
+  query = NULL,
   k = 10,
   metric = c("euclidean", "cosine", "manhattan", "hamming"),
   n_trees = 50,
@@ -38,7 +38,7 @@ knn_annoy(
 \item{threads}{Number of threads to use. Note that result is non-deterministic
 if threads > 1}
 
-\item{ef}{ef parameter for RccppHNSW::hnsw_search. Increase for slower search but
+\item{ef}{ef parameter for \code{RcppHNSW::hnsw_search()}. Increase for slower search but
 improved accuracy}
 
 \item{n_trees}{Number of trees during index build time. More trees gives higher accuracy}
@@ -46,10 +46,16 @@ improved accuracy}
 \item{search_k}{Number of nodes to inspect during the query, or -1 for default value. Higher number gives higher accuracy}
 }
 \value{
-List of 2 matrices -- idx for cell x K neighbor indices,
-dist for cell x K neighbor distances.
-If no query is given, nearest neighbors are found mapping
-the data matrix to itself, prohibiting self-neighbors
+Named list of two matrices of dimension (cells x k):
+\itemize{
+\item \code{idx}: Neighbor indices, where \code{idx[c, n]} is the index of the
+nth nearest neighbor to cell \code{c}.
+\item \code{dist}: Neighbor distances, where \code{dist[c, n]} is the distance between
+cell \code{c} and its nth nearest neighbor.
+}
+
+If no query is given, nearest neighbors are found by mapping the data matrix to itself,
+likely including self-neighbors (i.e. \code{idx[c,1] == c} for most cells).
 }
 \description{
 Search for approximate nearest neighbors between cells in the reduced
diff --git a/r/man/partial_apply.Rd b/r/man/partial_apply.Rd
new file mode 100644
index 00000000..149eb19f
--- /dev/null
+++ b/r/man/partial_apply.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{partial_apply}
+\alias{partial_apply}
+\title{Create partial function calls}
+\usage{
+partial_apply(f, ..., .overwrite = TRUE, .missing_args_error = TRUE)
+}
+\arguments{
+\item{f}{A function}
+
+\item{...}{Named arguments to \code{f}}
+
+\item{.overwrite}{(bool) If \code{f} is already an output from
+\code{partial_apply()}, whether parameter re-definitions should
+be ignored or overwrite the existing definitions}
+
+\item{.missing_args_error}{(bool) If \code{TRUE}, passing in arguments
+that are not in the function's signature will raise an error, otherwise
+they will be ignored}
+}
+\value{
+A \code{bpcells_partial} object (a function with some extra attributes)
+}
+\description{
+Specify some but not all arguments to a function.
+}
+\keyword{internal}
diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml
index 1d4c6026..a8b2f2e9 100644
--- a/r/pkgdown/_pkgdown.yml
+++ b/r/pkgdown/_pkgdown.yml
@@ -152,6 +152,7 @@ reference:
 
 - title: "Clustering"
 - contents:
+  - cluster_cells_graph
   - knn_hnsw
   - cluster_graph_leiden
   - knn_to_graph
diff --git a/r/tests/testthat/test-clustering.R b/r/tests/testthat/test-clustering.R
index b17723c2..f597eaeb 100644
--- a/r/tests/testthat/test-clustering.R
+++ b/r/tests/testthat/test-clustering.R
@@ -1,4 +1,4 @@
-# Copyright 2024 BPCells contributors
+# Copyright 2025 BPCells contributors
 # 
 # Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
 # https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
@@ -19,11 +19,10 @@ test_that("C++ SNN calculation works",{
                 neighbor_sim[i,] <- sample.int(cells, k)
             }
         }
-
+        input <- list(idx = neighbor_sim, dist = matrix(runif(cells*k), nrow=cells))
         min_val <- 1/15
-        snn <- knn_to_snn_graph(list(idx=neighbor_sim), min_val=min_val, self_loops=TRUE)
-
-        mat <- knn_to_graph(list(idx=neighbor_sim), use_weights=FALSE)
+        snn <- knn_to_snn_graph(input, min_val=min_val, self_loops=TRUE)
+        mat <- knn_to_graph(input, use_weights=FALSE)
 
         mat <- mat %*% t(mat)
         mat <- mat / (2 * k - mat)
@@ -35,7 +34,7 @@ test_that("C++ SNN calculation works",{
             snn,
             as(mat, "dgCMatrix")
         )
-        snn2 <- knn_to_snn_graph(list(idx=neighbor_sim), min_val=min_val, self_loops=FALSE)
+        snn2 <- knn_to_snn_graph(input, min_val=min_val, self_loops=FALSE)
         diag(mat) <- 0
         mat <- Matrix::drop0(mat)
         expect_identical(
@@ -73,4 +72,23 @@ test_that("igraph clustering doesn't crash", {
     })
 
     expect_no_condition(cluster_graph_louvain(graph))
+})
+
+test_that("cluster_cells_graph works", {
+    skip_if_not_installed("RcppAnnoy")
+    skip_if_not_installed("RcppHNSW")
+    mat <- matrix(sample.int(1000, 10000, replace=TRUE), nrow=1000)
+    # check with default params
+    res <- expect_no_error(cluster_cells_graph(mat))
+    # check with threads, method partialization
+    expect_true(class(res) == "factor")
+    expect_equal(nrow(mat), length(res))
+    res_partialized <- expect_no_error(
+        cluster_cells_graph(
+            mat, knn_method = knn_annoy(k = 9),
+            knn_to_graph_method = knn_to_snn_graph(min_val = 1/10),
+            cluster_graph_method = cluster_graph_louvain(resolution = 0.8),
+        ))
+    expect_true(class(res) == "factor")
+    expect_equal(nrow(mat), length(res))
 })
\ No newline at end of file
diff --git a/r/tests/testthat/test-errorChecking.R b/r/tests/testthat/test-errorChecking.R
new file mode 100644
index 00000000..355a5d29
--- /dev/null
+++ b/r/tests/testthat/test-errorChecking.R
@@ -0,0 +1,19 @@
+# Copyright 2025 BPCells contributors
+# 
+# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+# https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+# <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
+# option. This file may not be copied, modified, or distributed
+# except according to those terms.
+
+test_that("assert_is_mat works", {
+    mat <- matrix(1:4, nrow = 2)
+    mat_dgc <- as(mat, "dgCMatrix")
+    mat_iterable <- as(mat, "IterableMatrix")
+    expect_no_error(assert_is_mat(mat))
+    expect_no_error(assert_is_mat(mat_dgc))
+    expect_error(assert_is_mat(c(mat_iterable, mat_iterable)))
+    expect_error(assert_is_mat("a"))
+    expect_error(assert_is_mat(c("a", "a")))
+    expect_error(assert_is_mat(1))
+})
\ No newline at end of file

From 569e1df53740bd6b6b1bf6dac0e3a7c1ab6d90ec Mon Sep 17 00:00:00 2001
From: Immanuel Abdi <immanuelazn@gmail.com>
Date: Wed, 30 Jul 2025 14:10:39 -0700
Subject: [PATCH 2/3] [r] update NEWS.md

---
 r/NEWS.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/r/NEWS.md b/r/NEWS.md
index f5998f88..0f769caa 100644
--- a/r/NEWS.md
+++ b/r/NEWS.md
@@ -1,8 +1,10 @@
 # BPCells 0.4.0 (in-progress main branch)
 
 ## Breaking changes
-- Change first parameter name of `cluster_graph_leiden()`, `cluster_graph_louvain()` and `cluster_graph_seurat()` from `snn` to `mat` to more accurately reflect the input type.  (pull request #189)
+- Change first parameter name of `cluster_graph_leiden()`, `cluster_graph_louvain()` and `cluster_graph_seurat()` from `snn` to `mat` to more accurately reflect the input type.  (pull request #292)
 
+## Features
+- Create a wrapper function `cluster_cells_graph()` that wraps the steps of knn object creation, graph adjacency creation, and clustering all within a single function (pull request #292)
 
 # BPCells 0.3.1 (7/21/2025)
 

From 3df0e3570d2cf853c7defd5df2ea4158cd8fe417 Mon Sep 17 00:00:00 2001
From: Immanuel Abdi <immanuelazn@gmail.com>
Date: Wed, 30 Jul 2025 14:46:20 -0700
Subject: [PATCH 3/3] [r] small update to website for `qc_scATAC`

---
 r/R/atac_utils.R     | 2 +-
 r/man/qc_scATAC.Rd   | 2 +-
 r/tools/hwy-test.cpp | 7 -------
 3 files changed, 2 insertions(+), 9 deletions(-)
 delete mode 100644 r/tools/hwy-test.cpp

diff --git a/r/R/atac_utils.R b/r/R/atac_utils.R
index 961f63e0..e78d1493 100644
--- a/r/R/atac_utils.R
+++ b/r/R/atac_utils.R
@@ -176,7 +176,7 @@ footprint <- function(fragments, ranges, zero_based_coords = !is(ranges, "GRange
 #' as `(monoNucleosomal + multiNucleosomal) / subNucleosomal`.
 #' @examples
 #' ## Prep data
-#' frags <- get_demo_frags()
+#' frags <- get_demo_frags(subset = FALSE)
 #' reference_dir <- file.path(tempdir(), "references")
 #' genes <- read_gencode_transcripts(
 #'   reference_dir, 
diff --git a/r/man/qc_scATAC.Rd b/r/man/qc_scATAC.Rd
index fcf20061..476b9a7b 100644
--- a/r/man/qc_scATAC.Rd
+++ b/r/man/qc_scATAC.Rd
@@ -54,7 +54,7 @@ as \code{(monoNucleosomal + multiNucleosomal) / subNucleosomal}.
 }
 \examples{
 ## Prep data
-frags <- get_demo_frags()
+frags <- get_demo_frags(subset = FALSE)
 reference_dir <- file.path(tempdir(), "references")
 genes <- read_gencode_transcripts(
   reference_dir, 
diff --git a/r/tools/hwy-test.cpp b/r/tools/hwy-test.cpp
deleted file mode 100644
index 4b49f547..00000000
--- a/r/tools/hwy-test.cpp
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <hwy/highway.h>
-#include <hwy/aligned_allocator.h>
-
-int main() {
-    auto alignedAlloc = hwy::AllocateAligned<float>(1024);
-    return 0;
-}
\ No newline at end of file